Diffstat (limited to 'mm')
74 files changed, 2476 insertions, 1710 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 03cbfa072f42..989f8f3d77e0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,7 +187,6 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) config MEMORY_HOTPLUG_SPARSE def_bool y @@ -652,10 +651,9 @@ config IDLE_PAGE_TRACKING config ZONE_DEVICE bool "Device memory (pmem, etc...) hotplug support" if EXPERT - default !ZONE_DMA - depends on !ZONE_DMA depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP depends on X86_64 #arch_add_memory() comprehends device memory help @@ -669,3 +667,8 @@ config ZONE_DEVICE config FRAME_VECTOR bool + +config ARCH_USES_HIGH_VMA_FLAGS + bool +config ARCH_HAS_PKEYS + bool diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5c50b238b770..22f4cd96acb0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -79,3 +79,16 @@ config PAGE_POISONING_ZERO Enabling page poisoning with this option will disable hibernation If unsure, say N + bool + +config DEBUG_PAGE_REF + bool "Enable tracepoint to track down page reference manipulation" + depends on DEBUG_KERNEL + depends on TRACEPOINTS + ---help--- + This is a feature to add tracepoint for tracking down page reference + manipulation. This tracking is useful to diagnose functional failure + due to migration failures caused by page reference mismatches. Be + careful when enabling this feature because it adds about 30 KB to the + kernel code. However the runtime performance overhead is virtually + nil until the tracepoints are actually enabled. diff --git a/mm/Makefile b/mm/Makefile index cfdd481d27a5..deb467edca2d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,8 +3,24 @@ # KASAN_SANITIZE_slab_common.o := n +KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n +# These files are disabled because they produce non-interesting and/or +# flaky coverage that is not a function of syscall inputs. E.g. slab is out of +# free pages, or a task is migrated between nodes. +KCOV_INSTRUMENT_slab_common.o := n +KCOV_INSTRUMENT_slob.o := n +KCOV_INSTRUMENT_slab.o := n +KCOV_INSTRUMENT_slub.o := n +KCOV_INSTRUMENT_page_alloc.o := n +KCOV_INSTRUMENT_debug-pagealloc.o := n +KCOV_INSTRUMENT_kmemleak.o := n +KCOV_INSTRUMENT_kmemcheck.o := n +KCOV_INSTRUMENT_memcontrol.o := n +KCOV_INSTRUMENT_mmzone.o := n +KCOV_INSTRUMENT_vmstat.o := n + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ @@ -81,3 +97,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c554d173a65f..0c6317b7db38 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -898,7 +898,7 @@ static atomic_t nr_wb_congested[2]; void clear_wb_congested(struct bdi_writeback_congested *congested, int sync) { wait_queue_head_t *wqh = &congestion_wqh[sync]; - enum wb_state bit; + enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; if (test_and_clear_bit(bit, &congested->state)) @@ -911,7 +911,7 @@ EXPORT_SYMBOL(clear_wb_congested); void set_wb_congested(struct bdi_writeback_congested *congested, int sync) { - enum wb_state bit; + enum wb_congested_state bit; bit = sync ? 
WB_sync_congested : WB_async_congested; if (!test_and_set_bit(bit, &congested->state)) @@ -1026,8 +1026,8 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write, if (copy_to_user(buffer, kbuf, sizeof(kbuf))) return -EFAULT; - printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", - table->procname); + pr_warn_once("%s exported in /proc is scheduled for removal\n", + table->procname); *lenp = 2; *ppos += *lenp; diff --git a/mm/bootmem.c b/mm/bootmem.c index 91e32bc8517f..0aa7dda52402 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -50,8 +50,7 @@ early_param("bootmem_debug", bootmem_debug_setup); #define bdebug(fmt, args...) ({ \ if (unlikely(bootmem_debug)) \ - printk(KERN_INFO \ - "bootmem::%s " fmt, \ + pr_info("bootmem::%s " fmt, \ __func__, ## args); \ }) @@ -680,7 +679,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, /* * Whoops, we cannot satisfy the allocation request. */ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } @@ -755,7 +754,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (ptr) return ptr; - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } diff --git a/mm/compaction.c b/mm/compaction.c index 93f71d968098..8fa254043801 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -7,6 +7,7 @@ * * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> */ +#include <linux/cpu.h> #include <linux/swap.h> #include <linux/migrate.h> #include <linux/compaction.h> @@ -17,6 +18,8 @@ #include <linux/balloon_compaction.h> #include <linux/page-isolation.h> #include <linux/kasan.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -849,16 +852,8 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, ISOLATE_UNEVICTABLE); - /* - * In case of fatal failure, release everything that might - * have been isolated in the previous iteration, and signal - * the failure back to caller. - */ - if (!pfn) { - putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; + if (!pfn) break; - } if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) break; @@ -1188,11 +1183,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, /* * Mark that the PG_migrate_skip information should be cleared - * by kswapd when it goes to sleep. kswapd does not set the + * by kswapd when it goes to sleep. kcompactd does not set the * flag itself as the decision to be clear should be directly * based on an allocation request. */ - if (!current_is_kswapd()) + if (cc->direct_compaction) zone->compact_blockskip_flush = true; return COMPACT_COMPLETE; @@ -1335,10 +1330,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) /* * Clear pageblock skip if there were failures recently and compaction - * is about to be retried after being deferred. kswapd does not do - * this reset as it'll reset the cached information when going to sleep. + * is about to be retried after being deferred. 
*/ - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + if (compaction_restarting(zone, cc->order)) __reset_isolation_suitable(zone); /* @@ -1474,6 +1468,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .mode = mode, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, + .direct_compaction = true, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1736,4 +1731,225 @@ void compaction_unregister_node(struct node *node) } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ +static inline bool kcompactd_work_requested(pg_data_t *pgdat) +{ + return pgdat->kcompactd_max_order > 0 || kthread_should_stop(); +} + +static bool kcompactd_node_suitable(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + + for (zoneid = 0; zoneid < classzone_idx; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + + if (!populated_zone(zone)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + classzone_idx) == COMPACT_CONTINUE) + return true; + } + + return false; +} + +static void kcompactd_do_work(pg_data_t *pgdat) +{ + /* + * With no special task, compact all zones so that a page of requested + * order is allocatable. + */ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .classzone_idx = pgdat->kcompactd_classzone_idx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + + }; + bool success = false; + + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, + cc.classzone_idx); + count_vm_event(KCOMPACTD_WAKE); + + for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) { + int status; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + if (compaction_deferred(zone, cc.order)) + continue; + + if (compaction_suitable(zone, cc.order, 0, zoneid) != + COMPACT_CONTINUE) + continue; + + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + if (kthread_should_stop()) + return; + status = compact_zone(zone, &cc); + + if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), + cc.classzone_idx, 0)) { + success = true; + compaction_defer_reset(zone, cc.order, false); + } else if (status == COMPACT_COMPLETE) { + /* + * We use sync migration mode here, so we defer like + * sync direct compaction does. + */ + defer_compaction(zone, cc.order); + } + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + /* + * Regardless of success, we are done until woken up next. 
But remember + * the requested order/classzone_idx in case it was higher/tighter than + * our current ones + */ + if (pgdat->kcompactd_max_order <= cc.order) + pgdat->kcompactd_max_order = 0; + if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; +} + +void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ + if (!order) + return; + + if (pgdat->kcompactd_max_order < order) + pgdat->kcompactd_max_order = order; + + if (pgdat->kcompactd_classzone_idx > classzone_idx) + pgdat->kcompactd_classzone_idx = classzone_idx; + + if (!waitqueue_active(&pgdat->kcompactd_wait)) + return; + + if (!kcompactd_node_suitable(pgdat)) + return; + + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, + classzone_idx); + wake_up_interruptible(&pgdat->kcompactd_wait); +} + +/* + * The background compaction daemon, started as a kernel thread + * from the init process. + */ +static int kcompactd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + pgdat->kcompactd_max_order = 0; + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + wait_event_freezable(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat)); + + kcompactd_do_work(pgdat); + } + + return 0; +} + +/* + * This kcompactd start function will be called by init and node-hot-add. + * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. + */ +int kcompactd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kcompactd) + return 0; + + pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + if (IS_ERR(pgdat->kcompactd)) { + pr_err("Failed to start kcompactd on node %d\n", nid); + ret = PTR_ERR(pgdat->kcompactd); + pgdat->kcompactd = NULL; + } + return ret; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void kcompactd_stop(int nid) +{ + struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; + + if (kcompactd) { + kthread_stop(kcompactd); + NODE_DATA(nid)->kcompactd = NULL; + } +} + +/* + * It's optimal to keep kcompactd on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes + * away, we get changed to run anywhere: as the first one comes back, + * restore their cpu bindings. 
+ */ +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int nid; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); + } + } + return NOTIFY_OK; +} + +static int __init kcompactd_init(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) + kcompactd_run(nid); + hotcpu_notifier(cpu_callback, 0); + return 0; +} +subsys_initcall(kcompactd_init) + #endif /* CONFIG_COMPACTION */ diff --git a/mm/debug.c b/mm/debug.c index df7247b0b532..8865bfb41b0b 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -43,7 +43,7 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", - page, atomic_read(&page->_count), page_mapcount(page), + page, page_ref_count(page), page_mapcount(page), page->mapping, page->index); if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c new file mode 100644 index 000000000000..1aef3d562e52 --- /dev/null +++ b/mm/debug_page_ref.c @@ -0,0 +1,54 @@ +#include <linux/mm_types.h> +#include <linux/tracepoint.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/page_ref.h> + +void __page_ref_set(struct page *page, int v) +{ + trace_page_ref_set(page, v); +} +EXPORT_SYMBOL(__page_ref_set); +EXPORT_TRACEPOINT_SYMBOL(page_ref_set); + +void __page_ref_mod(struct page *page, int v) +{ + trace_page_ref_mod(page, v); +} +EXPORT_SYMBOL(__page_ref_mod); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod); + +void __page_ref_mod_and_test(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_test(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_test); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test); + +void __page_ref_mod_and_return(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_return(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_return); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return); + +void __page_ref_mod_unless(struct page *page, int v, int u) +{ + trace_page_ref_mod_unless(page, v, u); +} +EXPORT_SYMBOL(__page_ref_mod_unless); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless); + +void __page_ref_freeze(struct page *page, int v, int ret) +{ + trace_page_ref_freeze(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_freeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze); + +void __page_ref_unfreeze(struct page *page, int v) +{ + trace_page_ref_unfreeze(page, v); +} +EXPORT_SYMBOL(__page_ref_unfreeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze); diff --git a/mm/dmapool.c b/mm/dmapool.c index 57312b5d6e12..abcbfe86c25a 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -294,8 +294,7 @@ void dma_pool_destroy(struct dma_pool *pool) "dma_pool_destroy %s, %p busy\n", pool->name, page->vaddr); else - printk(KERN_ERR - "dma_pool_destroy %s, %p busy\n", + pr_err("dma_pool_destroy %s, %p busy\n", pool->name, page->vaddr); /* leak the still-in-use consistent memory */ list_del(&page->page_list); @@ -424,7 +423,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) "dma_pool_free %s, %p/%lx (bad dma)\n", pool->name, vaddr, (unsigned long)dma); else - printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n", + 
pr_err("dma_pool_free %s, %p/%lx (bad dma)\n", pool->name, vaddr, (unsigned long)dma); return; } @@ -438,8 +437,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) "dma_pool_free %s, %p (bad vaddr)/%Lx\n", pool->name, vaddr, (unsigned long long)dma); else - printk(KERN_ERR - "dma_pool_free %s, %p (bad vaddr)/%Lx\n", + pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n", pool->name, vaddr, (unsigned long long)dma); return; } @@ -452,13 +450,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, "dma_pool_free %s, dma %Lx " - "already free\n", pool->name, - (unsigned long long)dma); + dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); else - printk(KERN_ERR "dma_pool_free %s, dma %Lx " - "already free\n", pool->name, - (unsigned long long)dma); + pr_err("dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); return; } } diff --git a/mm/fadvise.c b/mm/fadvise.c index b8a5bc66b0c0..b8024fa7101d 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -97,8 +97,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) break; case POSIX_FADV_WILLNEED: /* First and last PARTIAL page! */ - start_index = offset >> PAGE_CACHE_SHIFT; - end_index = endbyte >> PAGE_CACHE_SHIFT; + start_index = offset >> PAGE_SHIFT; + end_index = endbyte >> PAGE_SHIFT; /* Careful about overflow on the "+1" */ nrpages = end_index - start_index + 1; @@ -124,8 +124,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) * preserved on the expectation that it is better to preserve * needed memory than to discard unneeded memory. */ - start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; - end_index = (endbyte >> PAGE_CACHE_SHIFT); + start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT; + end_index = (endbyte >> PAGE_SHIFT); if (end_index >= start_index) { unsigned long count = invalidate_mapping_pages(mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 61b441b191ad..f2479af09da9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -265,7 +265,7 @@ void delete_from_page_cache(struct page *page) if (freepage) freepage(page); - page_cache_release(page); + put_page(page); } EXPORT_SYMBOL(delete_from_page_cache); @@ -352,8 +352,8 @@ EXPORT_SYMBOL(filemap_flush); static int __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; - pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; + pgoff_t index = start_byte >> PAGE_SHIFT; + pgoff_t end = end_byte >> PAGE_SHIFT; struct pagevec pvec; int nr_pages; int ret = 0; @@ -550,7 +550,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) pgoff_t offset = old->index; freepage = mapping->a_ops->freepage; - page_cache_get(new); + get_page(new); new->mapping = mapping; new->index = offset; @@ -572,7 +572,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) radix_tree_preload_end(); if (freepage) freepage(old); - page_cache_release(old); + put_page(old); } return error; @@ -586,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, + error = __radix_tree_create(&mapping->page_tree, page->index, 0, &node, &slot); if (error) return error; @@ -651,7 +651,7 @@ static int 
__add_to_page_cache_locked(struct page *page, return error; } - page_cache_get(page); + get_page(page); page->mapping = mapping; page->index = offset; @@ -675,7 +675,7 @@ err_insert: spin_unlock_irq(&mapping->tree_lock); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); - page_cache_release(page); + put_page(page); return error; } @@ -1083,7 +1083,7 @@ repeat: * include/linux/pagemap.h for details. */ if (unlikely(page != *pagep)) { - page_cache_release(page); + put_page(page); goto repeat; } } @@ -1121,7 +1121,7 @@ repeat: /* Has the page been truncated? */ if (unlikely(page->mapping != mapping)) { unlock_page(page); - page_cache_release(page); + put_page(page); goto repeat; } VM_BUG_ON_PAGE(page->index != offset, page); @@ -1168,7 +1168,7 @@ repeat: if (fgp_flags & FGP_LOCK) { if (fgp_flags & FGP_NOWAIT) { if (!trylock_page(page)) { - page_cache_release(page); + put_page(page); return NULL; } } else { @@ -1178,7 +1178,7 @@ repeat: /* Has the page been truncated? */ if (unlikely(page->mapping != mapping)) { unlock_page(page); - page_cache_release(page); + put_page(page); goto repeat; } VM_BUG_ON_PAGE(page->index != offset, page); @@ -1209,7 +1209,7 @@ no_page: err = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_RECLAIM_MASK); if (unlikely(err)) { - page_cache_release(page); + put_page(page); page = NULL; if (err == -EEXIST) goto repeat; @@ -1255,7 +1255,6 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: @@ -1263,8 +1262,10 @@ repeat: if (unlikely(!page)) continue; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it @@ -1277,7 +1278,7 @@ repeat: /* Has the page moved? */ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } export: @@ -1317,7 +1318,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: @@ -1327,13 +1327,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - WARN_ON(iter.index); - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page, @@ -1348,7 +1343,7 @@ repeat: /* Has the page moved? */ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } @@ -1384,7 +1379,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return 0; rcu_read_lock(); -restart: radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: @@ -1395,12 +1389,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page, @@ -1415,7 +1405,7 @@ repeat: /* Has the page moved? 
*/ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } @@ -1425,7 +1415,7 @@ repeat: * negatives, which is just confusing to the caller. */ if (page->mapping == NULL || page->index != iter.index) { - page_cache_release(page); + put_page(page); break; } @@ -1460,7 +1450,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *page; @@ -1471,12 +1460,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page. @@ -1497,7 +1482,7 @@ repeat: /* Has the page moved? */ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } @@ -1539,7 +1524,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, tag) { struct page *page; @@ -1549,12 +1533,8 @@ repeat: continue; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* @@ -1569,7 +1549,7 @@ repeat: /* Has the page moved? */ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } export: @@ -1630,11 +1610,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, unsigned int prev_offset; int error = 0; - index = *ppos >> PAGE_CACHE_SHIFT; - prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; - prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); - last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + index = *ppos >> PAGE_SHIFT; + prev_index = ra->prev_pos >> PAGE_SHIFT; + prev_offset = ra->prev_pos & (PAGE_SIZE-1); + last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; + offset = *ppos & ~PAGE_MASK; for (;;) { struct page *page; @@ -1668,7 +1648,7 @@ find_page: if (PageUptodate(page)) goto page_ok; - if (inode->i_blkbits == PAGE_CACHE_SHIFT || + if (inode->i_blkbits == PAGE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; if (!trylock_page(page)) @@ -1692,18 +1672,18 @@ page_ok: */ isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + end_index = (isize - 1) >> PAGE_SHIFT; if (unlikely(!isize || index > end_index)) { - page_cache_release(page); + put_page(page); goto out; } /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_CACHE_SIZE; + nr = PAGE_SIZE; if (index == end_index) { - nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + nr = ((isize - 1) & ~PAGE_MASK) + 1; if (nr <= offset) { - page_cache_release(page); + put_page(page); goto out; } } @@ -1731,11 +1711,11 @@ page_ok: ret = copy_page_to_iter(page, offset, nr, iter); offset += ret; - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; + index += offset >> PAGE_SHIFT; + offset &= ~PAGE_MASK; prev_offset = offset; - page_cache_release(page); + put_page(page); written += ret; if (!iov_iter_count(iter)) goto out; @@ -1755,7 +1735,7 
@@ page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); continue; } @@ -1777,7 +1757,7 @@ readpage: if (unlikely(error)) { if (error == AOP_TRUNCATED_PAGE) { - page_cache_release(page); + put_page(page); error = 0; goto find_page; } @@ -1794,7 +1774,7 @@ readpage: * invalidate_mapping_pages got it */ unlock_page(page); - page_cache_release(page); + put_page(page); goto find_page; } unlock_page(page); @@ -1809,7 +1789,7 @@ readpage: readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ - page_cache_release(page); + put_page(page); goto out; no_cached_page: @@ -1825,7 +1805,7 @@ no_cached_page: error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error) { - page_cache_release(page); + put_page(page); if (error == -EEXIST) { error = 0; goto find_page; @@ -1837,10 +1817,10 @@ no_cached_page: out: ra->prev_pos = prev_index; - ra->prev_pos <<= PAGE_CACHE_SHIFT; + ra->prev_pos <<= PAGE_SHIFT; ra->prev_pos |= prev_offset; - *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; + *ppos = ((loff_t)index << PAGE_SHIFT) + offset; file_accessed(filp); return written ? written : error; } @@ -1860,15 +1840,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t retval = 0; loff_t *ppos = &iocb->ki_pos; loff_t pos = *ppos; + size_t count = iov_iter_count(iter); + + if (!count) + goto out; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - size_t count = iov_iter_count(iter); loff_t size; - if (!count) - goto out; /* skip atime */ size = i_size_read(inode); retval = filemap_write_and_wait_range(mapping, pos, pos + count - 1); @@ -1931,7 +1912,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) else if (ret == -EEXIST) ret = 0; /* losing race to add is OK */ - page_cache_release(page); + put_page(page); } while (ret == AOP_TRUNCATED_PAGE); @@ -2041,8 +2022,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size; int ret = 0; - size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); - if (offset >= size >> PAGE_CACHE_SHIFT) + size = round_up(i_size_read(inode), PAGE_SIZE); + if (offset >= size >> PAGE_SHIFT) return VM_FAULT_SIGBUS; /* @@ -2068,7 +2049,7 @@ retry_find: } if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { - page_cache_release(page); + put_page(page); return ret | VM_FAULT_RETRY; } @@ -2091,10 +2072,10 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); - if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) { + size = round_up(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= size >> PAGE_SHIFT)) { unlock_page(page); - page_cache_release(page); + put_page(page); return VM_FAULT_SIGBUS; } @@ -2139,7 +2120,7 @@ page_not_uptodate: if (!PageUptodate(page)) error = -EIO; } - page_cache_release(page); + put_page(page); if (!error || error == AOP_TRUNCATED_PAGE) goto retry_find; @@ -2171,10 +2152,11 @@ repeat: if (unlikely(!page)) goto next; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - break; - else - goto next; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + goto next; } if (!page_cache_get_speculative(page)) @@ -2182,7 +2164,7 @@ repeat: /* Has the page moved? 
*/ if (unlikely(page != *slot)) { - page_cache_release(page); + put_page(page); goto repeat; } @@ -2196,8 +2178,8 @@ repeat: if (page->mapping != mapping || !PageUptodate(page)) goto unlock; - size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE); - if (page->index >= size >> PAGE_CACHE_SHIFT) + size = round_up(i_size_read(mapping->host), PAGE_SIZE); + if (page->index >= size >> PAGE_SHIFT) goto unlock; pte = vmf->pte + page->index - vmf->pgoff; @@ -2213,7 +2195,7 @@ repeat: unlock: unlock_page(page); skip: - page_cache_release(page); + put_page(page); next: if (iter.index == vmf->max_pgoff) break; @@ -2296,7 +2278,7 @@ static struct page *wait_on_page_read(struct page *page) if (!IS_ERR(page)) { wait_on_page_locked(page); if (!PageUptodate(page)) { - page_cache_release(page); + put_page(page); page = ERR_PTR(-EIO); } } @@ -2319,7 +2301,7 @@ repeat: return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, gfp); if (unlikely(err)) { - page_cache_release(page); + put_page(page); if (err == -EEXIST) goto repeat; /* Presumably ENOMEM for radix tree node */ @@ -2329,7 +2311,7 @@ repeat: filler: err = filler(data, page); if (err < 0) { - page_cache_release(page); + put_page(page); return ERR_PTR(err); } @@ -2382,7 +2364,7 @@ filler: /* Case c or d, restart the operation */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); + put_page(page); goto repeat; } @@ -2529,7 +2511,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) struct iov_iter data; write_len = iov_iter_count(from); - end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; + end = (pos + write_len - 1) >> PAGE_SHIFT; written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); if (written) @@ -2543,7 +2525,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) */ if (mapping->nrpages) { written = invalidate_inode_pages2_range(mapping, - pos >> PAGE_CACHE_SHIFT, end); + pos >> PAGE_SHIFT, end); /* * If a page can not be invalidated, return 0 to fall back * to buffered write. @@ -2568,7 +2550,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) */ if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, - pos >> PAGE_CACHE_SHIFT, end); + pos >> PAGE_SHIFT, end); } if (written > 0) { @@ -2629,8 +2611,8 @@ ssize_t generic_perform_write(struct file *file, size_t copied; /* Bytes copied from user */ void *fsdata; - offset = (pos & (PAGE_CACHE_SIZE - 1)); - bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, iov_iter_count(i)); again: @@ -2683,7 +2665,7 @@ again: * because not all segments in the iov can be copied at * once without a pagefault. 
*/ - bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + bytes = min_t(unsigned long, PAGE_SIZE - offset, iov_iter_single_seg_count(i)); goto again; } @@ -2770,8 +2752,8 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos = endbyte + 1; written += status; invalidate_mapping_pages(mapping, - pos >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); + pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); } else { /* * We don't know how much we wrote, so just return diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 7cf2b7163222..381bb07ed14f 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -58,7 +58,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; - ret = get_user_pages_locked(current, mm, start, nr_frames, + ret = get_user_pages_locked(start, nr_frames, write, force, (struct page **)(vec->ptrs), &locked); goto out; } @@ -14,6 +14,7 @@ #include <linux/rwsem.h> #include <linux/hugetlb.h> +#include <asm/mmu_context.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -363,6 +364,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, return -ENOENT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; + if (*flags & FOLL_REMOTE) + fault_flags |= FAULT_FLAG_REMOTE; if (nonblocking) fault_flags |= FAULT_FLAG_ALLOW_RETRY; if (*flags & FOLL_NOWAIT) @@ -413,11 +416,13 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { vm_flags_t vm_flags = vma->vm_flags; + int write = (gup_flags & FOLL_WRITE); + int foreign = (gup_flags & FOLL_REMOTE); if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; - if (gup_flags & FOLL_WRITE) { + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; @@ -443,6 +448,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } + /* + * gups are always data accesses, not instruction + * fetches, so execute=false here + */ + if (!arch_vma_access_permitted(vma, write, false, foreign)) + return -EFAULT; return 0; } @@ -609,6 +620,28 @@ next_page: } EXPORT_SYMBOL(__get_user_pages); +bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) +{ + bool write = !!(fault_flags & FAULT_FLAG_WRITE); + bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); + vm_flags_t vm_flags = write ? VM_WRITE : VM_READ; + + if (!(vm_flags & vma->vm_flags)) + return false; + + /* + * The architecture might have a hardware protection + * mechanism other than read/write that can deny access. + * + * gup always represents data access, not instruction + * fetches, so execute=false here: + */ + if (!arch_vma_access_permitted(vma, write, false, foreign)) + return false; + + return true; +} + /* * fixup_user_fault() - manually resolve a user page fault * @tsk: the task_struct to use for page fault accounting, or @@ -644,7 +677,6 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, bool *unlocked) { struct vm_area_struct *vma; - vm_flags_t vm_flags; int ret, major = 0; if (unlocked) @@ -655,8 +687,7 @@ retry: if (!vma || address < vma->vm_start) return -EFAULT; - vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? 
VM_WRITE : VM_READ; - if (!(vm_flags & vma->vm_flags)) + if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; ret = handle_mm_fault(mm, vma, address, fault_flags); @@ -807,13 +838,13 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * if (locked) * up_read(&mm->mmap_sem); */ -long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_locked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, int *locked) { - return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, locked, true, FOLL_TOUCH); + return __get_user_pages_locked(current, current->mm, start, nr_pages, + write, force, pages, NULL, locked, true, + FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_locked); @@ -860,17 +891,16 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * or if "force" shall be set to 1 (get_user_pages_fast misses the * "force" parameter). */ -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, - force, pages, FOLL_TOUCH); + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, + write, force, pages, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); /* - * get_user_pages() - pin user pages in memory + * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. * @mm: mm_struct of target mm @@ -924,12 +954,30 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * should use get_user_pages because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, int write, - int force, struct page **pages, struct vm_area_struct **vmas) +long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, vmas, NULL, false, FOLL_TOUCH); + pages, vmas, NULL, false, + FOLL_TOUCH | FOLL_REMOTE); +} +EXPORT_SYMBOL(get_user_pages_remote); + +/* + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's. We also + * obviously don't pass FOLL_REMOTE in here. + */ +long get_user_pages(unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + return __get_user_pages_locked(current, current->mm, start, nr_pages, + write, force, pages, vmas, NULL, false, + FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -1058,7 +1106,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) * @addr: user address * * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by page_cache_release() or put_page(). + * to be freed afterwards by put_page(). 
* * Returns NULL on any kind of failure - a hole must then be inserted into * the corefile, to preserve alignment with its headers; and also returns @@ -1144,6 +1192,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, pte_protnone(pte) || (write && !pte_write(pte))) goto pte_unmap; + if (!arch_pte_access_permitted(pte, write)) + goto pte_unmap; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); head = compound_head(page); @@ -1439,7 +1490,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - struct mm_struct *mm = current->mm; int nr, ret; start &= PAGE_MASK; @@ -1451,8 +1501,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(current, mm, start, - nr_pages - nr, write, 0, pages); + ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1ea21e203a70..b49ee126d4d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -78,12 +78,12 @@ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| #endif - (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)| (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); /* default scan 8*512 pte (or vmas) every 30 second */ -static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; +static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; @@ -98,7 +98,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * it would have happened if the vma was large enough during page * fault. */ -static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; +static unsigned int khugepaged_max_ptes_none __read_mostly; static int khugepaged(void *none); static int khugepaged_slab_init(void); @@ -168,8 +168,7 @@ static void set_recommended_min_free_kbytes(void) if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) - pr_info("raising min_free_kbytes from %d to %lu " - "to help transparent hugepage allocations\n", + pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); min_free_kbytes = recommended_min; @@ -233,7 +232,7 @@ retry: return READ_ONCE(huge_zero_page); } -static void put_huge_zero_page(void) +void put_huge_zero_page(void) { /* * Counter should never go to zero here. 
Only shrinker can put @@ -270,37 +269,35 @@ static struct shrinker huge_zero_page_shrinker = { #ifdef CONFIG_SYSFS -static ssize_t double_flag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf, - enum transparent_hugepage_flag enabled, - enum transparent_hugepage_flag req_madv) -{ - if (test_bit(enabled, &transparent_hugepage_flags)) { - VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); - return sprintf(buf, "[always] madvise never\n"); - } else if (test_bit(req_madv, &transparent_hugepage_flags)) - return sprintf(buf, "always [madvise] never\n"); - else - return sprintf(buf, "always madvise [never]\n"); -} -static ssize_t double_flag_store(struct kobject *kobj, +static ssize_t triple_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag deferred, enum transparent_hugepage_flag req_madv) { - if (!memcmp("always", buf, + if (!memcmp("defer", buf, + min(sizeof("defer")-1, count))) { + if (enabled == deferred) + return -EINVAL; + clear_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + set_bit(deferred, &transparent_hugepage_flags); + } else if (!memcmp("always", buf, min(sizeof("always")-1, count))) { - set_bit(enabled, &transparent_hugepage_flags); + clear_bit(deferred, &transparent_hugepage_flags); clear_bit(req_madv, &transparent_hugepage_flags); + set_bit(enabled, &transparent_hugepage_flags); } else if (!memcmp("madvise", buf, min(sizeof("madvise")-1, count))) { clear_bit(enabled, &transparent_hugepage_flags); + clear_bit(deferred, &transparent_hugepage_flags); set_bit(req_madv, &transparent_hugepage_flags); } else if (!memcmp("never", buf, min(sizeof("never")-1, count))) { clear_bit(enabled, &transparent_hugepage_flags); clear_bit(req_madv, &transparent_hugepage_flags); + clear_bit(deferred, &transparent_hugepage_flags); } else return -EINVAL; @@ -310,17 +307,22 @@ static ssize_t double_flag_store(struct kobject *kobj, static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return double_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "[always] madvise never\n"); + else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always [madvise] never\n"); + else + return sprintf(buf, "always madvise [never]\n"); } + static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { ssize_t ret; - ret = double_flag_store(kobj, attr, buf, count, + ret = triple_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); @@ -378,16 +380,23 @@ static ssize_t single_flag_store(struct kobject *kobj, static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return double_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "[always] defer madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always [defer] madvise never\n"); + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, 
&transparent_hugepage_flags)) + return sprintf(buf, "always defer [madvise] never\n"); + else + return sprintf(buf, "always defer madvise [never]\n"); + } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return double_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + return triple_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); } static struct kobj_attribute defrag_attr = @@ -660,6 +669,18 @@ static int __init hugepage_init(void) return -EINVAL; } + khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; + khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; + /* + * hugepages can't be allocated by the buddy allocator + */ + MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER); + /* + * we use page->mapping and page->index in second tail page + * as list_head: assuming THP order >= 2 + */ + MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); + err = hugepage_init_sysfs(&hugepage_kobj); if (err) goto err_sysfs; @@ -764,7 +785,6 @@ void prep_transhuge_page(struct page *page) * we use page->mapping and page->indexlru in second tail page * as list_head: assuming THP order >= 2 */ - BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); INIT_LIST_HEAD(page_deferred_list(page)); set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); @@ -843,9 +863,30 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return 0; } -static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) +/* + * If THP is set to always then directly reclaim/compact as necessary + * If set to defer then do no reclaim and defer to khugepaged + * If set to madvise and the VMA is flagged then directly reclaim/compact + */ +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +{ + gfp_t reclaim_flags = 0; + + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) && + (vma->vm_flags & VM_HUGEPAGE)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_KSWAPD_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + + return GFP_TRANSHUGE | reclaim_flags; +} + +/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ +static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; + return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); } /* Caller must hold page table lock. */ @@ -919,7 +960,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return ret; } - gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + gfp = alloc_hugepage_direct_gfpmask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); @@ -1257,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); /* * We can only reuse the page if nobody else maps the huge page or it's - * part. We can do it by checking page_mapcount() on each sub-page, but - * it's expensive. - * The cheaper way is to check page_count() to be equal 1: every - * mapcount takes page reference reference, so this way we can - * guarantee, that the PMD is the only mapping. 
- * This can give false negative if somebody pinned the page, but that's - * fine. + * part. */ - if (page_mapcount(page) == 1 && page_count(page) == 1) { + if (page_trans_huge_mapcount(page, NULL) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1279,7 +1314,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + huge_gfp = alloc_hugepage_direct_gfpmask(vma); new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -1643,12 +1678,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_dax(vma)) { spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) - put_huge_zero_page(); + tlb_remove_page(tlb, pmd_page(orig_pmd)); } else if (is_huge_zero_pmd(orig_pmd)) { pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(ptl); - put_huge_zero_page(); + tlb_remove_page(tlb, pmd_page(orig_pmd)); } else { struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); @@ -1919,10 +1954,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, * page fault if needed. */ return 0; - if (vma->vm_ops) + if (vma->vm_ops || (vm_flags & VM_NO_THP)) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -2039,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (pte_write(pteval)) { writable = true; } else { - if (PageSwapCache(page) && !reuse_swap_page(page)) { + if (PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { unlock_page(page); result = SCAN_SWAP_CACHE_PAGE; goto out; @@ -2249,11 +2284,12 @@ static int khugepaged_find_target_node(void) return 0; } -static inline struct page *alloc_hugepage(int defrag) +static inline struct page *alloc_khugepaged_hugepage(void) { struct page *page; - page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + HPAGE_PMD_ORDER); if (page) prep_transhuge_page(page); return page; @@ -2264,7 +2300,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) struct page *hpage; do { - hpage = alloc_hugepage(khugepaged_defrag()); + hpage = alloc_khugepaged_hugepage(); if (!hpage) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); if (!*wait) @@ -2310,8 +2346,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) return false; if (is_vma_temporary_stack(vma)) return false; - VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); - return true; + return !(vma->vm_flags & VM_NO_THP); } static void collapse_huge_page(struct mm_struct *mm, @@ -2335,8 +2370,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | - __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; /* release the mmap_sem read lock. 
*/ new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); @@ -2537,7 +2571,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } khugepaged_node_load[node]++; if (!PageLRU(page)) { - result = SCAN_SCAN_ABORT; + result = SCAN_PAGE_LRU; goto out_unmap; } if (PageLocked(page)) { @@ -2857,7 +2891,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); - atomic_add(HPAGE_PMD_NR - 1, &page->_count); + page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); @@ -2947,44 +2981,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address) + unsigned long address, bool freeze) { spinlock_t *ptl; struct mm_struct *mm = vma->vm_mm; - struct page *page = NULL; unsigned long haddr = address & HPAGE_PMD_MASK; mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); ptl = pmd_lock(mm, pmd); if (pmd_trans_huge(*pmd)) { - page = pmd_page(*pmd); + struct page *page = pmd_page(*pmd); if (PageMlocked(page)) - get_page(page); - else - page = NULL; + clear_page_mlock(page); } else if (!pmd_devmap(*pmd)) goto out; - __split_huge_pmd_locked(vma, pmd, haddr, false); + __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); - if (page) { - lock_page(page); - munlock_vma_page(page); - unlock_page(page); - put_page(page); - } } -static void split_huge_pmd_address(struct vm_area_struct *vma, - unsigned long address) +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return; @@ -2996,11 +3019,20 @@ static void split_huge_pmd_address(struct vm_area_struct *vma, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) return; + + /* + * If caller asks to setup a migration entries, we need a page to check + * pmd against. Otherwise we can end up replacing wrong page. + */ + VM_BUG_ON(freeze && !page); + if (page && page != pmd_page(*pmd)) + return; + /* * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. 
*/ - split_huge_pmd(vma, pmd, address); + __split_huge_pmd(vma, pmd, address, freeze); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3016,7 +3048,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (start & ~HPAGE_PMD_MASK && (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, start); + split_huge_pmd_address(vma, start, false, NULL); /* * If the new end address isn't hpage aligned and it could @@ -3026,7 +3058,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (end & ~HPAGE_PMD_MASK && (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, end); + split_huge_pmd_address(vma, end, false, NULL); /* * If we're also updating the vma->vm_next->vm_start, if the new @@ -3040,184 +3072,36 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_pmd_address(next, nstart); + split_huge_pmd_address(next, nstart, false, NULL); } } -static void freeze_page_vma(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static void freeze_page(struct page *page) { - unsigned long haddr = address & HPAGE_PMD_MASK; - spinlock_t *ptl; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i, nr = HPAGE_PMD_NR; - - /* Skip pages which doesn't belong to the VMA */ - if (address < vma->vm_start) { - int off = (vma->vm_start - address) >> PAGE_SHIFT; - page += off; - nr -= off; - address = vma->vm_start; - } - - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) - return; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return; - pmd = pmd_offset(pud, address); - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_present(*pmd)) { - spin_unlock(ptl); - return; - } - if (pmd_trans_huge(*pmd)) { - if (page == pmd_page(*pmd)) - __split_huge_pmd_locked(vma, pmd, haddr, true); - spin_unlock(ptl); - return; - } - spin_unlock(ptl); - - pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); - for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { - pte_t entry, swp_pte; - swp_entry_t swp_entry; - - /* - * We've just crossed page table boundary: need to map next one. - * It can happen if THP was mremaped to non PMD-aligned address. 
- */ - if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { - pte_unmap_unlock(pte - 1, ptl); - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - pte = pte_offset_map_lock(vma->vm_mm, pmd, - address, &ptl); - } - - if (!pte_present(*pte)) - continue; - if (page_to_pfn(page) != pte_pfn(*pte)) - continue; - flush_cache_page(vma, address, page_to_pfn(page)); - entry = ptep_clear_flush(vma, address, pte); - if (pte_dirty(entry)) - SetPageDirty(page); - swp_entry = make_migration_entry(page, pte_write(entry)); - swp_pte = swp_entry_to_pte(swp_entry); - if (pte_soft_dirty(entry)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(vma->vm_mm, address, pte, swp_pte); - page_remove_rmap(page, false); - put_page(page); - } - pte_unmap_unlock(pte - 1, ptl); -} - -static void freeze_page(struct anon_vma *anon_vma, struct page *page) -{ - struct anon_vma_chain *avc; - pgoff_t pgoff = page_to_pgoff(page); + enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | + TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; + int i, ret; VM_BUG_ON_PAGE(!PageHead(page), page); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, - pgoff + HPAGE_PMD_NR - 1) { - unsigned long address = __vma_address(page, avc->vma); - - mmu_notifier_invalidate_range_start(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - freeze_page_vma(avc->vma, page, address); - mmu_notifier_invalidate_range_end(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - } -} - -static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page, - unsigned long address) -{ - spinlock_t *ptl; - pmd_t *pmd; - pte_t *pte, entry; - swp_entry_t swp_entry; - unsigned long haddr = address & HPAGE_PMD_MASK; - int i, nr = HPAGE_PMD_NR; - - /* Skip pages which doesn't belong to the VMA */ - if (address < vma->vm_start) { - int off = (vma->vm_start - address) >> PAGE_SHIFT; - page += off; - nr -= off; - address = vma->vm_start; - } - - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); - for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { - /* - * We've just crossed page table boundary: need to map next one. - * It can happen if THP was mremaped to non-PMD aligned address. 
- */ - if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { - pte_unmap_unlock(pte - 1, ptl); - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - pte = pte_offset_map_lock(vma->vm_mm, pmd, - address, &ptl); - } - - if (!is_swap_pte(*pte)) - continue; - - swp_entry = pte_to_swp_entry(*pte); - if (!is_migration_entry(swp_entry)) - continue; - if (migration_entry_to_page(swp_entry) != page) - continue; - - get_page(page); - page_add_anon_rmap(page, vma, address, false); - - entry = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (PageDirty(page)) - entry = pte_mkdirty(entry); - if (is_write_migration_entry(swp_entry)) - entry = maybe_mkwrite(entry, vma); - - flush_dcache_page(page); - set_pte_at(vma->vm_mm, address, pte, entry); + /* We only need TTU_SPLIT_HUGE_PMD once */ + ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); + for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { + /* Cut short if the page is unmapped */ + if (page_count(page) == 1) + return; - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); + ret = try_to_unmap(page + i, ttu_flags); } - pte_unmap_unlock(pte - 1, ptl); + VM_BUG_ON(ret); } -static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) +static void unfreeze_page(struct page *page) { - struct anon_vma_chain *avc; - pgoff_t pgoff = page_to_pgoff(page); - - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, - pgoff, pgoff + HPAGE_PMD_NR - 1) { - unsigned long address = __vma_address(page, avc->vma); + int i; - mmu_notifier_invalidate_range_start(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - unfreeze_page_vma(avc->vma, page, address); - mmu_notifier_invalidate_range_end(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - } + for (i = 0; i < HPAGE_PMD_NR; i++) + remove_migration_ptes(page + i, page + i, true); } static void __split_huge_page_tail(struct page *head, int tail, @@ -3226,7 +3110,7 @@ static void __split_huge_page_tail(struct page *head, int tail, struct page *page_tail = head + tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); - VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); + VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* * tail_page->_count is zero and not changing from under us. But @@ -3239,7 +3123,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * atomic_set() here would be safe on all archs (and not only on x86), * it's safer to use atomic_inc(). */ - atomic_inc(&page_tail->_count); + page_ref_inc(page_tail); page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -3295,7 +3179,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) ClearPageCompound(head); spin_unlock_irq(&zone->lru_lock); - unfreeze_page(page_anon_vma(head), head); + unfreeze_page(head); for (i = 0; i < HPAGE_PMD_NR; i++) { struct page *subpage = head + i; @@ -3334,6 +3218,64 @@ int total_mapcount(struct page *page) } /* + * This calculates accurately how many mappings a transparent hugepage + * has (unlike page_mapcount() which isn't fully accurate). This full + * accuracy is primarily needed to know if copy-on-write faults can + * reuse the page and change the mapping to read-write instead of + * copying them. At the same time this returns the total_mapcount too. + * + * The function returns the highest mapcount any one of the subpages + * has. 
If the return value is one, even if different processes are + * mapping different subpages of the transparent hugepage, they can + * all reuse it, because each process is reusing a different subpage. + * + * The total_mapcount is instead counting all virtual mappings of the + * subpages. If the total_mapcount is equal to "one", it tells the + * caller all mappings belong to the same "mm" and in turn the + * anon_vma of the transparent hugepage can become the vma->anon_vma + * local one as no other process may be mapping any of the subpages. + * + * It would be more accurate to replace page_mapcount() with + * page_trans_huge_mapcount(), however we only use + * page_trans_huge_mapcount() in the copy-on-write faults where we + * need full accuracy to avoid breaking page pinning, because + * page_trans_huge_mapcount() is slower than page_mapcount(). + */ +int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +{ + int i, ret, _total_mapcount, mapcount; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + return mapcount; + } + + page = compound_head(page); + + _total_mapcount = ret = 0; + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + ret = max(ret, mapcount); + _total_mapcount += mapcount; + } + if (PageDoubleMap(page)) { + ret -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + ret += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + return ret; +} + +/* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. 
* @@ -3391,7 +3333,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } mlocked = PageMlocked(page); - freeze_page(anon_vma, head); + freeze_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); /* Make sure the page is not on per-CPU pagevec as it takes pin */ @@ -3420,7 +3362,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) BUG(); } else { spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - unfreeze_page(anon_vma, head); + unfreeze_page(head); ret = -EBUSY; } @@ -3455,6 +3397,7 @@ void deferred_split_huge_page(struct page *page) spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { + count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &pgdata->split_queue); pgdata->split_queue_len++; } @@ -3562,7 +3505,7 @@ next: } } - pr_info("%lu of %lu THP split", split, total); + pr_info("%lu of %lu THP split\n", split, total); return 0; } @@ -3573,7 +3516,7 @@ static int __init split_huge_pages_debugfs(void) { void *ret; - ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL, + ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, &split_huge_pages_fops); if (!ret) pr_warn("Failed to create split_huge_pages in debugfs"); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aefba5a9cc47..19d0d08b396f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2665,7 +2665,7 @@ void __init hugetlb_add_hstate(unsigned int order) unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { - pr_warning("hugepagesz= specified twice, ignoring\n"); + pr_warn("hugepagesz= specified twice, ignoring\n"); return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); @@ -2701,8 +2701,7 @@ static int __init hugetlb_nrpages_setup(char *s) mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { - pr_warning("hugepages= specified twice without " - "interleaving hugepagesz=, ignoring\n"); + pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); return 1; } @@ -3347,7 +3346,7 @@ retry_avoidcopy: old_page != pagecache_page) outside_reserve = 1; - page_cache_get(old_page); + get_page(old_page); /* * Drop page table lock as buddy allocator may be called. It will @@ -3365,7 +3364,7 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. 
*/ if (outside_reserve) { - page_cache_release(old_page); + put_page(old_page); BUG_ON(huge_pte_none(pte)); unmap_ref_private(mm, vma, old_page, address); BUG_ON(huge_pte_none(pte)); @@ -3426,9 +3425,9 @@ retry_avoidcopy: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out_release_all: - page_cache_release(new_page); + put_page(new_page); out_release_old: - page_cache_release(old_page); + put_page(old_page); spin_lock(ptl); /* Caller expects lock to be held */ return ret; diff --git a/mm/internal.h b/mm/internal.h index ad9400d759c8..b79abb6721cf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,10 +38,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -static inline void set_page_count(struct page *page, int v) -{ - atomic_set(&page->_count, v); -} +void unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details); extern int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, @@ -64,7 +64,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(atomic_read(&page->_count), page); + VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } @@ -148,9 +148,6 @@ extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); -#ifdef CONFIG_MEMORY_FAILURE -extern bool is_free_buddy_page(struct page *page); -#endif extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -175,6 +172,7 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool direct_compaction; /* False from kcompactd or /proc/... 
*/ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const int alloc_flags; /* alloc flags of a direct compactor */ @@ -393,7 +391,7 @@ extern int mminit_loglevel; do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ - printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ + pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index a61460d9f5b0..131daadf40e4 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,5 +1,6 @@ KASAN_SANITIZE := n UBSAN_SANITIZE_kasan.o := n +KCOV_INSTRUMENT := n CFLAGS_REMOVE_kasan.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 1ad20ade8c91..38f1dd79acdb 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -17,7 +17,9 @@ #define DISABLE_BRANCH_PROFILING #include <linux/export.h> +#include <linux/interrupt.h> #include <linux/init.h> +#include <linux/kasan.h> #include <linux/kernel.h> #include <linux/kmemleak.h> #include <linux/linkage.h> @@ -32,7 +34,6 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/vmalloc.h> -#include <linux/kasan.h> #include "kasan.h" #include "../slab.h" @@ -334,6 +335,59 @@ void kasan_free_pages(struct page *page, unsigned int order) KASAN_FREE_PAGE); } +#ifdef CONFIG_SLAB +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. + */ +static size_t optimal_redzone(size_t object_size) +{ + int rz = + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; + return rz; +} + +void kasan_cache_create(struct kmem_cache *cache, size_t *size, + unsigned long *flags) +{ + int redzone_adjust; + /* Make sure the adjusted size is still less than + * KMALLOC_MAX_CACHE_SIZE. + * TODO: this check is only useful for SLAB, but not SLUB. We'll need + * to skip it for SLUB when it starts using kasan_cache_create(). + */ + if (*size > KMALLOC_MAX_CACHE_SIZE - + sizeof(struct kasan_alloc_meta) - + sizeof(struct kasan_free_meta)) + return; + *flags |= SLAB_KASAN; + /* Add alloc meta. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* Add free meta. 
*/ + if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + } + redzone_adjust = optimal_redzone(cache->object_size) - + (*size - cache->object_size); + if (redzone_adjust > 0) + *size += redzone_adjust; + *size = min(KMALLOC_MAX_CACHE_SIZE, + max(*size, + cache->object_size + + optimal_redzone(cache->object_size))); +} +#endif + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -351,11 +405,81 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) kasan_poison_shadow(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + alloc_info->state = KASAN_STATE_INIT; + } +#endif } -void kasan_slab_alloc(struct kmem_cache *cache, void *object) +#ifdef CONFIG_SLAB +static inline int in_irqentry_text(unsigned long ptr) { - kasan_kmalloc(cache, object, cache->object_size); + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +static inline void filter_irq_stacks(struct stack_trace *trace) +{ + int i; + + if (!trace->nr_entries) + return; + for (i = 0; i < trace->nr_entries; i++) + if (in_irqentry_text(trace->entries[i])) { + /* Include the irqentry function into the stack. */ + trace->nr_entries = i + 1; + break; + } +} + +static inline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[KASAN_STACK_DEPTH]; + struct stack_trace trace = { + .nr_entries = 0, + .entries = entries, + .max_entries = KASAN_STACK_DEPTH, + .skip = 0 + }; + + save_stack_trace(&trace); + filter_irq_stacks(&trace); + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries-1] == ULONG_MAX) + trace.nr_entries--; + + return depot_save_stack(&trace, flags); +} + +static inline void set_track(struct kasan_track *track, gfp_t flags) +{ + track->pid = current->pid; + track->stack = save_stack(flags); +} + +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + return (void *)object + cache->kasan_info.free_meta_offset; +} +#endif + +void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) +{ + kasan_kmalloc(cache, object, cache->object_size, flags); } void kasan_slab_free(struct kmem_cache *cache, void *object) @@ -367,10 +491,22 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return; +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_free_meta *free_info = + get_free_info(cache, object); + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + alloc_info->state = KASAN_STATE_FREE; + set_track(&free_info->track, GFP_NOWAIT); + } +#endif + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } -void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) +void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, + gfp_t flags) 
{ unsigned long redzone_start; unsigned long redzone_end; @@ -386,10 +522,20 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) kasan_unpoison_shadow(object, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + + alloc_info->state = KASAN_STATE_ALLOC; + alloc_info->alloc_size = size; + set_track(&alloc_info->track, flags); + } +#endif } EXPORT_SYMBOL(kasan_kmalloc); -void kasan_kmalloc_large(const void *ptr, size_t size) +void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { struct page *page; unsigned long redzone_start; @@ -408,7 +554,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size) KASAN_PAGE_REDZONE); } -void kasan_krealloc(const void *object, size_t size) +void kasan_krealloc(const void *object, size_t size, gfp_t flags) { struct page *page; @@ -418,9 +564,9 @@ void kasan_krealloc(const void *object, size_t size) page = virt_to_head_page(object); if (unlikely(!PageSlab(page))) - kasan_kmalloc_large(object, size); + kasan_kmalloc_large(object, size, flags); else - kasan_kmalloc(page->slab_cache, object, size); + kasan_kmalloc(page->slab_cache, object, size, flags); } void kasan_kfree(void *ptr) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4f6c62e5c21e..30a2f0ba0e09 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -2,6 +2,7 @@ #define __MM_KASAN_KASAN_H #include <linux/kasan.h> +#include <linux/stackdepot.h> #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) @@ -54,6 +55,42 @@ struct kasan_global { #endif }; +/** + * Structures to keep alloc and free tracks * + */ + +enum kasan_state { + KASAN_STATE_INIT, + KASAN_STATE_ALLOC, + KASAN_STATE_FREE +}; + +#define KASAN_STACK_DEPTH 64 + +struct kasan_track { + u32 pid; + depot_stack_handle_t stack; +}; + +struct kasan_alloc_meta { + struct kasan_track track; + u32 state : 2; /* enum kasan_state */ + u32 alloc_size : 30; + u32 reserved; +}; + +struct kasan_free_meta { + /* Allocator freelist pointer, unused by KASAN. 
*/ + void **freelist; + struct kasan_track track; +}; + +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object); + + static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 12f222d0224b..60869a5a0124 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -18,6 +18,7 @@ #include <linux/printk.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/stackdepot.h> #include <linux/stacktrace.h> #include <linux/string.h> #include <linux/types.h> @@ -115,6 +116,53 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +#ifdef CONFIG_SLAB +static void print_track(struct kasan_track *track) +{ + pr_err("PID = %u\n", track->pid); + if (track->stack) { + struct stack_trace trace; + + depot_fetch_stack(track->stack, &trace); + print_stack_trace(&trace, 0); + } else { + pr_err("(stack is not available)\n"); + } +} + +static void object_err(struct kmem_cache *cache, struct page *page, + void *object, char *unused_reason) +{ + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + struct kasan_free_meta *free_info; + + dump_stack(); + pr_err("Object at %p, in cache %s\n", object, cache->name); + if (!(cache->flags & SLAB_KASAN)) + return; + switch (alloc_info->state) { + case KASAN_STATE_INIT: + pr_err("Object not allocated yet\n"); + break; + case KASAN_STATE_ALLOC: + pr_err("Object allocated with size %u bytes.\n", + alloc_info->alloc_size); + pr_err("Allocation:\n"); + print_track(&alloc_info->track); + break; + case KASAN_STATE_FREE: + pr_err("Object freed, allocated with size %u bytes\n", + alloc_info->alloc_size); + free_info = get_free_info(cache, object); + pr_err("Allocation:\n"); + print_track(&alloc_info->track); + pr_err("Deallocation:\n"); + print_track(&free_info->track); + break; + } +} +#endif + static void print_address_description(struct kasan_access_info *info) { const void *addr = info->access_addr; @@ -126,17 +174,10 @@ static void print_address_description(struct kasan_access_info *info) if (PageSlab(page)) { void *object; struct kmem_cache *cache = page->slab_cache; - void *last_object; - - object = virt_to_obj(cache, page_address(page), addr); - last_object = page_address(page) + - page->objects * cache->size; - - if (unlikely(object > last_object)) - object = last_object; /* we hit into padding */ - + object = nearest_obj(cache, page, + (void *)info->access_addr); object_err(cache, page, object, - "kasan: bad access detected"); + "kasan: bad access detected"); return; } dump_page(page, "kasan: bad access detected"); @@ -146,7 +187,6 @@ static void print_address_description(struct kasan_access_info *info) if (!init_task_stack_addr(addr)) pr_err("Address belongs to variable %pS\n", addr); } - dump_stack(); } @@ -214,8 +254,7 @@ static void kasan_report_error(struct kasan_access_info *info) */ kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); - pr_err("=================================" - "=================================\n"); + pr_err("==================================================================\n"); if (info->access_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { if ((unsigned long)info->access_addr < PAGE_SIZE) @@ -236,8 +275,7 @@ static void kasan_report_error(struct kasan_access_info *info) 
print_address_description(info); print_shadow_for_address(info->first_bad_addr); } - pr_err("=================================" - "=================================\n"); + pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, flags); kasan_enable_current(); diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 6f4f424037c0..5bf191756a4a 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -20,8 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); if (!shadow) { if (printk_ratelimit()) - printk(KERN_ERR "kmemcheck: failed to allocate " - "shadow bitmap\n"); + pr_err("kmemcheck: failed to allocate shadow bitmap\n"); return; } diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index dcdcadb69533..dd3c23a801b1 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -49,7 +49,7 @@ static int __init kmemleak_test_init(void) struct test_node *elem; int i; - printk(KERN_INFO "Kmemleak testing\n"); + pr_info("Kmemleak testing\n"); /* make some orphan objects */ pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 25c0ad36fe38..e6429926e957 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -276,7 +276,7 @@ static void kmemleak_disable(void); * Print a warning and dump the stack trace. */ #define kmemleak_warn(x...) do { \ - pr_warning(x); \ + pr_warn(x); \ dump_stack(); \ kmemleak_warning = 1; \ } while (0) @@ -543,7 +543,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - pr_warning("Cannot allocate a kmemleak_object structure\n"); + pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); return NULL; } @@ -596,8 +596,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, else if (parent->pointer + parent->size <= ptr) link = &parent->rb_node.rb_right; else { - kmemleak_stop("Cannot insert 0x%lx into the object " - "search tree (overlaps existing)\n", + kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", ptr); /* * No need for parent->lock here since "parent" cannot @@ -670,8 +669,8 @@ static void delete_object_part(unsigned long ptr, size_t size) object = find_and_remove_object(ptr, 1); if (!object) { #ifdef DEBUG - kmemleak_warn("Partially freeing unknown object at 0x%08lx " - "(size %zu)\n", ptr, size); + kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", + ptr, size); #endif return; } @@ -717,8 +716,8 @@ static void paint_ptr(unsigned long ptr, int color) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("Trying to color unknown object " - "at 0x%08lx as %s\n", ptr, + kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n", + ptr, (color == KMEMLEAK_GREY) ? "Grey" : (color == KMEMLEAK_BLACK) ? 
"Black" : "Unknown"); return; @@ -764,7 +763,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - pr_warning("Cannot allocate a scan area\n"); + pr_warn("Cannot allocate a scan area\n"); goto out; } @@ -1463,8 +1462,8 @@ static void kmemleak_scan(void) if (new_leaks) { kmemleak_found_leaks = true; - pr_info("%d new suspected memory leaks (see " - "/sys/kernel/debug/kmemleak)\n", new_leaks); + pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n", + new_leaks); } } @@ -1515,7 +1514,7 @@ static void start_scan_thread(void) return; scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); if (IS_ERR(scan_thread)) { - pr_warning("Failed to create the scan thread\n"); + pr_warn("Failed to create the scan thread\n"); scan_thread = NULL; } } @@ -1795,8 +1794,7 @@ static void kmemleak_do_cleanup(struct work_struct *work) if (!kmemleak_found_leaks) __kmemleak_do_cleanup(); else - pr_info("Kmemleak disabled without freeing internal data. " - "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); + pr_info("Kmemleak disabled without freeing internal data. Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n"); } static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); @@ -1874,8 +1872,8 @@ void __init kmemleak_init(void) scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); if (crt_early_log > ARRAY_SIZE(early_log)) - pr_warning("Early log buffer exceeded (%d), please increase " - "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); + pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", + crt_early_log); /* the kernel is still in UP mode, so disabling the IRQs is enough */ local_irq_save(flags); @@ -1960,7 +1958,7 @@ static int __init kmemleak_late_init(void) dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, &kmemleak_fops); if (!dentry) - pr_warning("Failed to create the debugfs kmemleak file\n"); + pr_warn("Failed to create the debugfs kmemleak file\n"); mutex_lock(&scan_mutex); start_scan_thread(); mutex_unlock(&scan_mutex); @@ -352,13 +352,17 @@ static inline bool ksm_test_exit(struct mm_struct *mm) /* * We use break_ksm to break COW on a ksm page: it's a stripped down * - * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) + * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) * put_page(page); * * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + * + * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context + * of the process that owns 'vma'. We also do not want to enforce + * protection keys here anyway. 
*/ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { @@ -367,12 +371,14 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); - page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); + page = follow_page(vma, addr, + FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma->vm_mm, vma, addr, - FAULT_FLAG_WRITE); + FAULT_FLAG_WRITE | + FAULT_FLAG_REMOTE); else ret = VM_FAULT_WRITE; put_page(page); @@ -777,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void) } remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + up_read(&mm->mmap_sem); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, @@ -788,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void) free_mm_slot(mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); - up_read(&mm->mmap_sem); mmdrop(mm); - } else { + } else spin_unlock(&ksm_mmlist_lock); - up_read(&mm->mmap_sem); - } } /* Clean up stable nodes, but don't worry if some are still busy */ @@ -1657,8 +1661,15 @@ next_mm: up_read(&mm->mmap_sem); mmdrop(mm); } else { - spin_unlock(&ksm_mmlist_lock); up_read(&mm->mmap_sem); + /* + * up_read(&mm->mmap_sem) first because after + * spin_unlock(&ksm_mmlist_lock) run, the "mm" may + * already have been freed under us by __ksm_exit() + * because the "mm_slot" is still hashed and + * ksm_scan.mm_slot doesn't point to it anymore. + */ + spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ diff --git a/mm/madvise.c b/mm/madvise.c index a01147359f3b..07427d3fcead 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -170,7 +170,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, vma, index); if (page) - page_cache_release(page); + put_page(page); } return 0; @@ -204,14 +204,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, page = find_get_entry(mapping, index); if (!radix_tree_exceptional_entry(page)) { if (page) - page_cache_release(page); + put_page(page); continue; } swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, NULL, 0); if (page) - page_cache_release(page); + put_page(page); } lru_add_drain(); /* Push any new pages onto the LRU now */ diff --git a/mm/memblock.c b/mm/memblock.c index fc7824fa1b42..b570dddb4cb9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -238,8 +238,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * so we use WARN_ONCE() here to see the stack trace if * fail happens. 
*/ - WARN_ONCE(1, "memblock: bottom-up allocation failed, " - "memory hotunplug may be affected\n"); + WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n"); } return __memblock_find_range_top_down(start, end, size, align, nid, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 42882c1e7fce..fe787f5c41bd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -207,6 +207,7 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { spinlock_t lock; /* for from, to */ + struct mm_struct *mm; struct mem_cgroup *from; struct mem_cgroup *to; unsigned long flags; @@ -638,9 +639,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, - unsigned int lru_mask) +unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) { unsigned long nr = 0; int zid; @@ -1151,12 +1151,9 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - /* oom_info_lock ensures that parallel ooms do not interleave */ - static DEFINE_MUTEX(oom_info_lock); struct mem_cgroup *iter; unsigned int i; - mutex_lock(&oom_info_lock); rcu_read_lock(); if (p) { @@ -1200,7 +1197,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont("\n"); } - mutex_unlock(&oom_info_lock); } /* @@ -1237,7 +1233,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) return limit; } -static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, +static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { struct oom_control oc = { @@ -1315,6 +1311,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } unlock: mutex_unlock(&oom_lock); + return chosen; } #if MAX_NUMNODES > 1 @@ -2325,9 +2322,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_online(memcg)) - return 0; - ret = try_charge(memcg, gfp, nr_pages); if (ret) return ret; @@ -2346,10 +2340,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; - int ret; + int ret = 0; memcg = get_mem_cgroup_from_mm(current->mm); - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (!mem_cgroup_is_root(memcg)) + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; } @@ -2719,39 +2714,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -static unsigned long tree_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); + memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_STAT; i++) + stat[i] += mem_cgroup_read_stat(iter, i); + } } -static unsigned long tree_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) +static void tree_events(struct mem_cgroup *memcg, 
unsigned long *events) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_events(iter, idx); + memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_EVENTS; i++) + events[i] += mem_cgroup_read_events(iter, i); + } } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - unsigned long val; + unsigned long val = 0; if (mem_cgroup_is_root(memcg)) { - val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); - if (swap) - val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) { + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_RSS); + if (swap) + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_SWAP); + } } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -2817,6 +2821,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) { int memcg_id; + if (cgroup_memory_nokmem) + return 0; + BUG_ON(memcg->kmemcg_id >= 0); BUG_ON(memcg->kmem_state); @@ -2837,24 +2844,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) return 0; } -static int memcg_propagate_kmem(struct mem_cgroup *parent, - struct mem_cgroup *memcg) -{ - int ret = 0; - - mutex_lock(&memcg_limit_mutex); - /* - * If the parent cgroup is not kmem-online now, it cannot be - * onlined after this point, because it has at least one child - * already. - */ - if (memcg_kmem_online(parent) || - (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem)) - ret = memcg_online_kmem(memcg); - mutex_unlock(&memcg_limit_mutex); - return ret; -} - static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; @@ -2913,10 +2902,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } } #else -static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg) -{ - return 0; -} static int memcg_online_kmem(struct mem_cgroup *memcg) { return 0; @@ -2932,22 +2917,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - int ret = 0; + int ret; mutex_lock(&memcg_limit_mutex); - /* Top-level cgroup doesn't propagate from root */ - if (!memcg_kmem_online(memcg)) { - if (cgroup_is_populated(memcg->css.cgroup) || - (memcg->use_hierarchy && memcg_has_children(memcg))) - ret = -EBUSY; - if (ret) - goto out; - ret = memcg_online_kmem(memcg); - if (ret) - goto out; - } ret = page_counter_limit(&memcg->kmem, limit); -out: mutex_unlock(&memcg_limit_mutex); return ret; } @@ -4198,7 +4171,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; } - error = memcg_propagate_kmem(parent, memcg); + error = memcg_online_kmem(memcg); if (error) goto fail; @@ -4282,9 +4255,11 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); - mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); - memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); memcg->low = 
0; memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -4693,6 +4668,8 @@ static void __mem_cgroup_clear_mc(void) static void mem_cgroup_clear_mc(void) { + struct mm_struct *mm = mc.mm; + /* * we must clear moving_task before waking up waiters at the end of * task migration. @@ -4702,7 +4679,10 @@ static void mem_cgroup_clear_mc(void) spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; + mc.mm = NULL; spin_unlock(&mc.lock); + + mmput(mm); } static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4759,6 +4739,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) VM_BUG_ON(mc.moved_swap); spin_lock(&mc.lock); + mc.mm = mm; mc.from = from; mc.to = memcg; mc.flags = move_flags; @@ -4768,8 +4749,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); + } else { + mmput(mm); } - mmput(mm); return ret; } @@ -4878,11 +4860,11 @@ put: /* get_mctgt_type() gets the page */ return ret; } -static void mem_cgroup_move_charge(struct mm_struct *mm) +static void mem_cgroup_move_charge(void) { struct mm_walk mem_cgroup_move_charge_walk = { .pmd_entry = mem_cgroup_move_charge_pte_range, - .mm = mm, + .mm = mc.mm, }; lru_add_drain_all(); @@ -4894,7 +4876,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) atomic_inc(&mc.from->moving_account); synchronize_rcu(); retry: - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { /* * Someone who are holding the mmap_sem might be waiting in * waitq. So we cancel all extra charges, wake up all waiters, @@ -4911,23 +4893,16 @@ retry: * additional charge, the page walk just aborts. */ walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); - up_read(&mm->mmap_sem); + up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { - struct cgroup_subsys_state *css; - struct task_struct *p = cgroup_taskset_first(tset, &css); - struct mm_struct *mm = get_task_mm(p); - - if (mm) { - if (mc.to) - mem_cgroup_move_charge(mm); - mmput(mm); - } - if (mc.to) + if (mc.to) { + mem_cgroup_move_charge(); mem_cgroup_clear_mc(); + } } #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4937,7 +4912,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { } #endif @@ -5015,6 +4990,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long nr_pages; unsigned long high; int err; @@ -5025,6 +5001,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + nr_pages = page_counter_read(&memcg->memory); + if (nr_pages > high) + try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, true); + memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -5046,6 +5027,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; + bool drained = false; unsigned long max; int err; @@ -5054,9 +5037,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) 
return err; - err = mem_cgroup_resize_limit(memcg, max); - if (err) - return err; + xchg(&memcg->memory.limit, max); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + + if (nr_pages <= max) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + if (nr_reclaims) { + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, + GFP_KERNEL, true)) + nr_reclaims--; + continue; + } + + mem_cgroup_events(memcg, MEMCG_OOM, 1); + if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) + break; + } memcg_wb_domain_size_changed(memcg); return nbytes; @@ -5077,6 +5087,8 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[MEMCG_NR_EVENTS]; int i; /* @@ -5090,22 +5102,27 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ + tree_stat(memcg, stat); + tree_events(memcg, events); + seq_printf(m, "anon %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "kernel_stack %llu\n", + (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); + seq_printf(m, "slab %llu\n", + (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); + (u64)stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) { struct mem_cgroup *mi; @@ -5117,12 +5134,17 @@ static int memory_stat_show(struct seq_file *m, void *v) mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); } + seq_printf(m, "slab_reclaimable %llu\n", + (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + seq_printf(m, "slab_unreclaimable %llu\n", + (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT)); + events[MEM_CGROUP_EVENTS_PGFAULT]); seq_printf(m, "pgmajfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT)); + events[MEM_CGROUP_EVENTS_PGMAJFAULT]); return 0; } @@ -5174,7 +5196,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, - .attach = mem_cgroup_move_task, + .post_attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, @@ -5395,6 +5417,10 @@ static void uncharge_list(struct list_head *page_list) struct list_head *next; struct page *page; + /* + * Note that the list can be a single page->lru; hence the + * do-while loop instead of a simple list_for_each_entry(). 
+ */ next = page_list->next; do { unsigned int nr_pages = 1; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 67c30eb993f0..ca5acee53b7a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -184,9 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, struct siginfo si; int ret; - printk(KERN_ERR - "MCE %#lx: Killing %s:%d due to hardware memory corruption\n", - pfn, t->comm, t->pid); + pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n", + pfn, t->comm, t->pid); si.si_signo = SIGBUS; si.si_errno = 0; si.si_addr = (void *)addr; @@ -209,8 +208,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ } if (ret < 0) - printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", - t->comm, t->pid, ret); + pr_info("MCE: Error sending signal to %s:%d: %d\n", + t->comm, t->pid, ret); return ret; } @@ -290,8 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } else { tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); if (!tk) { - printk(KERN_ERR - "MCE: Out of memory while machine check handling\n"); + pr_err("MCE: Out of memory while machine check handling\n"); return; } } @@ -336,9 +334,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, * signal and then access the memory. Just kill it. */ if (fail || tk->addr_valid == 0) { - printk(KERN_ERR - "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", - pfn, tk->tsk->comm, tk->tsk->pid); + pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pfn, tk->tsk->comm, tk->tsk->pid); force_sig(SIGKILL, tk->tsk); } @@ -350,9 +347,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, */ else if (kill_proc(tk->tsk, tk->addr, trapno, pfn, page, flags) < 0) - printk(KERN_ERR - "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", - pfn, tk->tsk->comm, tk->tsk->pid); + pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pfn, tk->tsk->comm, tk->tsk->pid); } put_task_struct(tk->tsk); kfree(tk); @@ -542,7 +538,7 @@ static int delete_from_lru_cache(struct page *p) /* * drop the page count elevated by isolate_lru_page() */ - page_cache_release(p); + put_page(p); return 0; } return -EIO; @@ -563,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn) */ static int me_unknown(struct page *p, unsigned long pfn) { - printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); + pr_err("MCE %#lx: Unknown page state\n", pfn); return MF_FAILED; } @@ -608,8 +604,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (mapping->a_ops->error_remove_page) { err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { - printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", - pfn, err); + pr_info("MCE %#lx: Failed to punch page: %d\n", + pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { pr_info("MCE %#lx: failed to release buffers\n", pfn); @@ -624,8 +620,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (invalidate_inode_page(p)) ret = MF_RECOVERED; else - printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", - pfn); + pr_info("MCE %#lx: Failed to invalidate\n", pfn); } return ret; } @@ -854,8 +849,7 @@ static int page_action(struct page_state *ps, struct page *p, if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; if (count != 0) { - printk(KERN_ERR 
- "MCE %#lx: %s still referenced by %d users\n", + pr_err("MCE %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); result = MF_FAILED; } @@ -894,7 +888,15 @@ int get_hwpoison_page(struct page *page) } } - return get_page_unless_zero(head); + if (get_page_unless_zero(head)) { + if (head == compound_head(page)) + return 1; + + pr_info("MCE: %#lx cannot catch tail\n", page_to_pfn(page)); + put_page(head); + } + + return 0; } EXPORT_SYMBOL_GPL(get_hwpoison_page); @@ -934,8 +936,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } if (PageSwapCache(p)) { - printk(KERN_ERR - "MCE %#lx: keeping poisoned page in swap cache\n", pfn); + pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn); ttu |= TTU_IGNORE_HWPOISON; } @@ -953,8 +954,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; - printk(KERN_INFO - "MCE %#lx: corrupted page was clean: dropped without side effects\n", + pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n", pfn); } } @@ -972,8 +972,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) - printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -1040,16 +1040,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags) panic("Memory failure from trap %d on page %lx", trapno, pfn); if (!pfn_valid(pfn)) { - printk(KERN_ERR - "MCE %#lx: memory outside kernel control\n", - pfn); + pr_err("MCE %#lx: memory outside kernel control\n", pfn); return -ENXIO; } p = pfn_to_page(pfn); orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { - printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); + pr_err("MCE %#lx: already hardware poisoned\n", pfn); return 0; } @@ -1180,7 +1178,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * unpoison always clear PG_hwpoison inside page lock */ if (!PageHWPoison(p)) { - printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + pr_err("MCE %#lx: just unpoisoned\n", pfn); num_poisoned_pages_sub(nr_pages); unlock_page(hpage); put_hwpoison_page(hpage); diff --git a/mm/memory.c b/mm/memory.c index 0e247642ed5b..07493e34ab7e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -65,6 +65,7 @@ #include <linux/userfaultfd_k.h> #include <asm/io.h> +#include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -562,8 +563,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, - pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); @@ -661,9 +661,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, return; } if (nr_unshown) { - printk(KERN_ALERT - "BUG: Bad page map: %lu messages suppressed\n", - nr_unshown); + pr_alert("BUG: Bad page map: %lu messages suppressed\n", + nr_unshown); nr_unshown = 0; } nr_shown = 0; @@ -674,15 +673,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, mapping = vma->vm_file ? 
vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - printk(KERN_ALERT - "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); - printk(KERN_ALERT - "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", - (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ @@ -792,6 +789,46 @@ out: return pfn_to_page(pfn); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + /* + * There is no pmd_special() but there may be special pmds, e.g. + * in a direct-access (dax) mapping, so let's just replicate the + * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + */ + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn)) + return NULL; + if (unlikely(pfn > highest_memmap_pfn)) + return NULL; + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} +#endif + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range @@ -1105,6 +1142,12 @@ again: if (!PageAnon(page)) { if (pte_dirty(ptent)) { + /* + * oom_reaper cannot tear down dirty + * pages + */ + if (unlikely(details && details->ignore_dirty)) + continue; force_flush = 1; set_page_dirty(page); } @@ -1123,8 +1166,8 @@ again: } continue; } - /* If details->check_mapping, we leave swap entries. */ - if (unlikely(details)) + /* only check swap_entries if explicitly asked for in details */ + if (unlikely(details && !details->check_swap_entries)) continue; entry = pte_to_swp_entry(ptent); @@ -1179,15 +1222,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { -#ifdef CONFIG_DEBUG_VM - if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { - pr_err("%s: mmap_sem is unlocked! 
addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", - __func__, addr, end, - vma->vm_start, - vma->vm_end); - BUG(); - } -#endif + VM_BUG_ON_VMA(vma_is_anonymous(vma) && + !rwsem_is_locked(&tlb->mm->mmap_sem), vma); split_huge_pmd(vma, pmd, addr); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; @@ -1229,7 +1265,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, return addr; } -static void unmap_page_range(struct mmu_gather *tlb, +void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) @@ -1237,9 +1273,6 @@ static void unmap_page_range(struct mmu_gather *tlb, pgd_t *pgd; unsigned long next; - if (details && !details->check_mapping) - details = NULL; - BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); @@ -2054,7 +2087,7 @@ static inline int wp_page_reuse(struct mm_struct *mm, VM_BUG_ON_PAGE(PageAnon(page), page); mapping = page->mapping; unlock_page(page); - page_cache_release(page); + put_page(page); if ((dirtied || page_mkwrite) && mapping) { /* @@ -2188,7 +2221,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, } if (new_page) - page_cache_release(new_page); + put_page(new_page); pte_unmap_unlock(page_table, ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -2203,14 +2236,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, munlock_vma_page(old_page); unlock_page(old_page); } - page_cache_release(old_page); + put_page(old_page); } return page_copied ? VM_FAULT_WRITE : 0; oom_free_new: - page_cache_release(new_page); + put_page(new_page); oom: if (old_page) - page_cache_release(old_page); + put_page(old_page); return VM_FAULT_OOM; } @@ -2258,7 +2291,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, { int page_mkwrite = 0; - page_cache_get(old_page); + get_page(old_page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; @@ -2267,7 +2300,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, tmp = do_page_mkwrite(vma, old_page, address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - page_cache_release(old_page); + put_page(old_page); return tmp; } /* @@ -2281,7 +2314,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); pte_unmap_unlock(page_table, ptl); - page_cache_release(old_page); + put_page(old_page); return 0; } page_mkwrite = 1; @@ -2340,8 +2373,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page) && !PageKsm(old_page)) { + int total_mapcount; if (!trylock_page(old_page)) { - page_cache_get(old_page); + get_page(old_page); pte_unmap_unlock(page_table, ptl); lock_page(old_page); page_table = pte_offset_map_lock(mm, pmd, address, @@ -2349,18 +2383,23 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); pte_unmap_unlock(page_table, ptl); - page_cache_release(old_page); + put_page(old_page); return 0; } - page_cache_release(old_page); + put_page(old_page); } - if (reuse_swap_page(old_page)) { - /* - * The page is all ours. Move it to our anon_vma so - * the rmap code will not search our parent or siblings. - * Protected against the rmap code by the page lock. 
- */ - page_move_anon_rmap(old_page, vma, address); + if (reuse_swap_page(old_page, &total_mapcount)) { + if (total_mapcount == 1) { + /* + * The page is all ours. Move it to + * our anon_vma so the rmap code will + * not search our parent or siblings. + * Protected against the rmap code by + * the page lock. + */ + page_move_anon_rmap(compound_head(old_page), + vma, address); + } unlock_page(old_page); return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page, 0, 0); @@ -2375,7 +2414,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Ok, we need to copy. Oh, well.. */ - page_cache_get(old_page); + get_page(old_page); pte_unmap_unlock(page_table, ptl); return wp_page_copy(mm, vma, address, page_table, pmd, @@ -2400,7 +2439,6 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; - /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ zba = details->first_index; if (zba < vba) zba = vba; @@ -2435,7 +2473,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - struct zap_details details; + struct zap_details details = { }; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -2585,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@ -2619,7 +2657,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * parallel locked swapcache. 
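reuse_swap_page() now takes an optional second argument through which it reports the page's total mapcount; callers that do not care, such as do_swap_page() further down, pass NULL. A condensed sketch of the write-fault logic this hunk switches to, with old_page locked and the other names as in the patch:

        int total_mapcount;

        if (reuse_swap_page(old_page, &total_mapcount)) {
                /*
                 * Only when we are the sole user is it safe to move the
                 * page to our anon_vma so rmap skips parents and siblings.
                 */
                if (total_mapcount == 1)
                        page_move_anon_rmap(compound_head(old_page), vma, address);
                unlock_page(old_page);
                /* ... then reuse the existing page for the write, as in the hunk above ... */
        }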
*/ unlock_page(swapcache); - page_cache_release(swapcache); + put_page(swapcache); } if (flags & FAULT_FLAG_WRITE) { @@ -2641,10 +2679,10 @@ out_nomap: out_page: unlock_page(page); out_release: - page_cache_release(page); + put_page(page); if (page != swapcache) { unlock_page(swapcache); - page_cache_release(swapcache); + put_page(swapcache); } return ret; } @@ -2752,7 +2790,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (userfaultfd_missing(vma)) { pte_unmap_unlock(page_table, ptl); mem_cgroup_cancel_charge(page, memcg, false); - page_cache_release(page); + put_page(page); return handle_userfault(vma, address, flags, VM_UFFD_MISSING); } @@ -2771,10 +2809,10 @@ unlock: return 0; release: mem_cgroup_cancel_charge(page, memcg, false); - page_cache_release(page); + put_page(page); goto unlock; oom_free_page: - page_cache_release(page); + put_page(page); oom: return VM_FAULT_OOM; } @@ -2807,7 +2845,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - page_cache_release(vmf.page); + put_page(vmf.page); return VM_FAULT_HWPOISON; } @@ -2996,7 +3034,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); unlock_page(fault_page); - page_cache_release(fault_page); + put_page(fault_page); return ret; } do_set_pte(vma, address, fault_page, pte, false, false); @@ -3024,7 +3062,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { - page_cache_release(new_page); + put_page(new_page); return VM_FAULT_OOM; } @@ -3041,7 +3079,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); if (fault_page) { unlock_page(fault_page); - page_cache_release(fault_page); + put_page(fault_page); } else { /* * The fault handler has no page to lock, so it holds @@ -3057,7 +3095,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); if (fault_page) { unlock_page(fault_page); - page_cache_release(fault_page); + put_page(fault_page); } else { /* * The fault handler has no page to lock, so it holds @@ -3068,7 +3106,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; uncharge_out: mem_cgroup_cancel_charge(new_page, memcg, false); - page_cache_release(new_page); + put_page(new_page); return ret; } @@ -3096,7 +3134,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, tmp = do_page_mkwrite(vma, fault_page, address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - page_cache_release(fault_page); + put_page(fault_page); return tmp; } } @@ -3105,7 +3143,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); unlock_page(fault_page); - page_cache_release(fault_page); + put_page(fault_page); return ret; } do_set_pte(vma, address, fault_page, pte, true, false); @@ -3379,6 +3417,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd; pte_t *pte; + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, + flags & FAULT_FLAG_REMOTE)) + return VM_FAULT_SIGSEGV; + if (unlikely(is_vm_hugetlb_page(vma))) return 
hugetlb_fault(mm, vma, address, flags); @@ -3419,12 +3462,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Use __pte_alloc instead of pte_alloc_map, because we can't + * Use pte_alloc() instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pte_alloc(mm, pmd, address))) return VM_FAULT_OOM; /* * If a huge pmd materialized under us just retry later. Use @@ -3696,7 +3738,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, void *maddr; struct page *page = NULL; - ret = get_user_pages(tsk, mm, addr, 1, + ret = get_user_pages_remote(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT @@ -3732,7 +3774,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, buf, maddr + offset, bytes); } kunmap(page); - page_cache_release(page); + put_page(page); } len -= bytes; buf += bytes; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 24ea06393816..aa34431c3f31 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,6 +33,7 @@ #include <linux/hugetlb.h> #include <linux/memblock.h> #include <linux/bootmem.h> +#include <linux/compaction.h> #include <asm/tlbflush.h> @@ -166,7 +167,7 @@ void get_page_bootmem(unsigned long info, struct page *page, page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); - atomic_inc(&page->_count); + page_ref_inc(page); } void put_page_bootmem(struct page *page) @@ -177,7 +178,7 @@ void put_page_bootmem(struct page *page) BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); - if (atomic_dec_return(&page->_count) == 1) { + if (page_ref_dec_return(page) == 1) { ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); @@ -1054,14 +1055,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = pfn_to_nid(pfn); + nid = zone_to_nid(zone); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); - if (ret) { - memory_notify(MEM_CANCEL_ONLINE, &arg); - return ret; - } + if (ret) + goto failed_addition; + /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. 
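The get_page_bootmem()/put_page_bootmem() hunks above are one instance of a conversion repeated throughout this series: open-coded atomic operations on page->_count give way to the page_ref_* accessors, so every reference-count manipulation goes through one narrow interface. A sketch of the pattern, reusing only calls shown in the hunk:

        page_ref_inc(page);                     /* was atomic_inc(&page->_count) */

        /* ... and on the release side ... */
        if (page_ref_dec_return(page) == 1) {   /* was atomic_dec_return(&page->_count) == 1 */
                ClearPagePrivate(page);
                set_page_private(page, 0);
        }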
@@ -1079,12 +1079,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (need_zonelists_rebuild) zone_pcp_reset(zone); mutex_unlock(&zonelists_mutex); - printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", - (unsigned long long) pfn << PAGE_SHIFT, - (((unsigned long long) pfn + nr_pages) - << PAGE_SHIFT) - 1); - memory_notify(MEM_CANCEL_ONLINE, &arg); - return ret; + goto failed_addition; } zone->present_pages += onlined_pages; @@ -1094,7 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ pgdat_resize_unlock(zone->zone_pgdat, &flags); if (onlined_pages) { - node_states_set_node(zone_to_nid(zone), &arg); + node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL, NULL); else @@ -1105,8 +1100,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ init_per_zone_wmark_min(); - if (onlined_pages) - kswapd_run(zone_to_nid(zone)); + if (onlined_pages) { + kswapd_run(nid); + kcompactd_run(nid); + } vm_total_pages = nr_free_pagecache_pages(); @@ -1115,6 +1112,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); return 0; + +failed_addition: + pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", + (unsigned long long) pfn << PAGE_SHIFT, + (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); + memory_notify(MEM_CANCEL_ONLINE, &arg); + return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1526,8 +1530,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } else { #ifdef CONFIG_DEBUG_VM - printk(KERN_ALERT "removing pfn %lx from LRU failed\n", - pfn); + pr_alert("removing pfn %lx from LRU failed\n", pfn); dump_page(page, "failed to remove from LRU"); #endif put_page(page); @@ -1855,7 +1858,7 @@ repeat: ret = -EBUSY; goto failed_removal; } - printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); + pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. 
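online_pages() now starts a per-node compaction daemon next to kswapd, and the offline path a little further down stops both again. A sketch of the pairing, with nid/node and the surrounding conditions exactly as in the two hunks:

        /* memory online */
        if (onlined_pages) {
                kswapd_run(nid);
                kcompactd_run(nid);
        }

        /* memory offline */
        if (arg.status_change_nid >= 0) {
                kswapd_stop(node);
                kcompactd_stop(node);
        }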
*/ offline_isolated_pages(start_pfn, end_pfn); @@ -1880,8 +1883,10 @@ repeat: zone_pcp_update(zone); node_states_clear_node(node, &arg); - if (arg.status_change_nid >= 0) + if (arg.status_change_nid >= 0) { kswapd_stop(node); + kcompactd_stop(node); + } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); @@ -1890,9 +1895,9 @@ repeat: return 0; failed_removal: - printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", - (unsigned long long) start_pfn << PAGE_SHIFT, - ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); + pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); @@ -1965,8 +1970,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; - pr_warn("removing memory fails, because memory " - "[%pa-%pa] is onlined\n", + pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8cbc74387df3..36cc01bc950a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -846,12 +846,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) } } -static int lookup_node(struct mm_struct *mm, unsigned long addr) +static int lookup_node(unsigned long addr) { struct page *p; int err; - err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); + err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL); if (err >= 0) { err = page_to_nid(p); put_page(p); @@ -906,7 +906,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { - err = lookup_node(mm, addr); + err = lookup_node(addr); if (err < 0) goto out; *policy = err; @@ -2559,9 +2559,7 @@ static void __init check_numabalancing_enable(void) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { - pr_info("%s automatic NUMA balancing. " - "Configure with numa_balancing= or the " - "kernel.numa_balancing sysctl", + pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? 
"Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } diff --git a/mm/mempool.c b/mm/mempool.c index 7924f4f58a6d..9b7a14a791cc 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element) kasan_free_pages(element, (unsigned long)pool->pool_data); } -static void kasan_unpoison_element(mempool_t *pool, void *element) +static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) { if (pool->alloc == mempool_alloc_slab) - kasan_slab_alloc(pool->pool_data, element); + kasan_slab_alloc(pool->pool_data, element, flags); if (pool->alloc == mempool_kmalloc) - kasan_krealloc(element, (size_t)pool->pool_data); + kasan_krealloc(element, (size_t)pool->pool_data, flags); if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } @@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool) +static void *remove_element(mempool_t *pool, gfp_t flags) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - kasan_unpoison_element(pool, element); + kasan_unpoison_element(pool, element, flags); check_element(pool, element); return element; } @@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool) return; while (pool->curr_nr) { - void *element = remove_element(pool); + void *element = remove_element(pool, GFP_KERNEL); pool->free(element, pool->pool_data); } kfree(pool->elements); @@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { - element = remove_element(pool); + element = remove_element(pool, GFP_KERNEL); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); @@ -310,25 +310,36 @@ EXPORT_SYMBOL(mempool_resize); * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) - * Note: using __GFP_ZERO is not supported. + * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported. */ -void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) +void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; wait_queue_t wait; gfp_t gfp_temp; + /* If oom killed, memory reserves are essential to prevent livelock */ + VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC); + /* No element size to zero on allocation */ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: + if (likely(pool->curr_nr)) { + /* + * Don't allocate from emergency reserves if there are + * elements available. This check is racy, but it will + * be rechecked each loop. 
+ */ + gfp_temp |= __GFP_NOMEMALLOC; + } element = pool->alloc(gfp_temp, pool->pool_data); if (likely(element != NULL)) @@ -336,7 +347,7 @@ repeat_alloc: spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { - element = remove_element(pool); + element = remove_element(pool, gfp_temp); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); @@ -352,11 +363,12 @@ repeat_alloc: * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ - if (gfp_temp != gfp_mask) { + if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) { spin_unlock_irqrestore(&pool->lock, flags); gfp_temp = gfp_mask; goto repeat_alloc; } + gfp_temp = gfp_mask; /* We must not sleep if !__GFP_DIRECT_RECLAIM */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { diff --git a/mm/migrate.c b/mm/migrate.c index 568284ec75d4..f9dfb18a4eba 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -172,7 +172,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_add_file_rmap(new); - if (vma->vm_flags & VM_LOCKED) + if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); /* No need to invalidate - it was non-present before */ @@ -187,14 +187,17 @@ out: * Get rid of all migration entries and replace them by * references to the indicated page. */ -static void remove_migration_ptes(struct page *old, struct page *new) +void remove_migration_ptes(struct page *old, struct page *new, bool locked) { struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, }; - rmap_walk(new, &rwc); + if (locked) + rmap_walk_locked(new, &rwc); + else + rmap_walk(new, &rwc); } /* @@ -349,7 +352,7 @@ int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - if (!page_freeze_refs(page, expected_count)) { + if (!page_ref_freeze(page, expected_count)) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -363,7 +366,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { - page_unfreeze_refs(page, expected_count); + page_ref_unfreeze(page, expected_count); spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -397,7 +400,7 @@ int migrate_page_move_mapping(struct address_space *mapping, * to one less reference. * We know this isn't the last reference. */ - page_unfreeze_refs(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - 1); spin_unlock(&mapping->tree_lock); /* Leave irq disabled to prevent preemption while updating stats */ @@ -451,7 +454,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - if (!page_freeze_refs(page, expected_count)) { + if (!page_ref_freeze(page, expected_count)) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -463,7 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); @@ -702,7 +705,7 @@ static int writeout(struct address_space *mapping, struct page *page) * At this point we know that the migration attempt cannot * be successful. 
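migrate_page_move_mapping() and its hugetlb variant above now use the renamed refcount-freezing helpers. A condensed sketch of the freeze/replace/unfreeze bracket, error paths trimmed, with mapping, page, newpage, pslot and expected_count as in the hunks:

        spin_lock_irq(&mapping->tree_lock);

        if (!page_ref_freeze(page, expected_count)) {   /* was page_freeze_refs() */
                spin_unlock_irq(&mapping->tree_lock);
                return -EAGAIN;
        }

        radix_tree_replace_slot(pslot, newpage);

        /* the old page loses the mapping's reference */
        page_ref_unfreeze(page, expected_count - 1);    /* was page_unfreeze_refs() */
        spin_unlock_irq(&mapping->tree_lock);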
*/ - remove_migration_ptes(page, page); + remove_migration_ptes(page, page, false); rc = mapping->a_ops->writepage(page, &wbc); @@ -900,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (page_was_mapped) remove_migration_ptes(page, - rc == MIGRATEPAGE_SUCCESS ? newpage : page); + rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); out_unlock_both: unlock_page(newpage); @@ -972,7 +975,13 @@ out: dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); /* Soft-offlined page shouldn't go through lru cache list */ - if (reason == MR_MEMORY_FAILURE) { + if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) { + /* + * With this release, we free successfully migrated + * page and set PG_HWPoison on just freed page + * intentionally. Although it's rather weird, it's how + * HWPoison flag works at the moment. + */ put_page(page); if (!test_set_page_hwpoison(page)) num_poisoned_pages_inc(); @@ -1070,7 +1079,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_was_mapped) remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage); + rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); unlock_page(new_hpage); @@ -1773,7 +1782,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, put_page(new_page); goto out_fail; } - + /* + * We are not sure a pending tlb flush here is for a huge page + * mapping or not. Hence use the tlb range variant + */ if (mm_tlb_flush_pending(mm)) flush_tlb_range(vma, mmun_start, mmun_end); @@ -1829,12 +1841,11 @@ fail_putback: page_add_anon_rmap(new_page, vma, mmun_start, true); pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); - flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); if (page_count(page) != 2) { set_pmd_at(mm, mmun_start, pmd, orig_entry); - flush_tlb_range(vma, mmun_start, mmun_end); + flush_pmd_tlb_range(vma, mmun_start, mmun_end); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(new_page, true); diff --git a/mm/mincore.c b/mm/mincore.c index 563f32045490..c0b5ba965200 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) #endif if (page) { present = PageUptodate(page); - page_cache_release(page); + put_page(page); } return present; @@ -211,7 +211,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v * return values: * zero - success * -EFAULT - vec points to an illegal address - * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE + * -EINVAL - addr is not a multiple of PAGE_SIZE * -ENOMEM - Addresses in the range [addr, addr + len] are * invalid for the address space of this process, or * specify one or more pages which are not currently @@ -226,14 +226,14 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, unsigned char *tmp; /* Check the start address: needs to be page-aligned.. 
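The mm/mincore.c comments above and below are being retargeted from PAGE_CACHE_SIZE/PAGE_CACHE_MASK to their PAGE_* equivalents; the user-visible contract is unchanged: the start address must be page aligned or the call fails with -EINVAL. A small self-contained user-space illustration (not from the patch):

        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>
        #include <unistd.h>

        int main(void)
        {
                long psize = sysconf(_SC_PAGESIZE);
                size_t len = 4 * psize;
                unsigned char vec[4];
                char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                if (buf == MAP_FAILED)
                        return 1;
                memset(buf, 0, len);                    /* fault all four pages in */

                if (mincore(buf, len, vec) == 0) {      /* buf is page aligned */
                        for (int i = 0; i < 4; i++)
                                printf("page %d resident: %d\n", i, vec[i] & 1);
                }

                /* an unaligned start address is rejected with EINVAL */
                if (mincore(buf + 1, len - 1, vec) != 0)
                        perror("mincore(unaligned)");

                return 0;
        }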
*/ - if (start & ~PAGE_CACHE_MASK) + if (start & ~PAGE_MASK) return -EINVAL; /* ..and we need to be passed a valid user-space range */ if (!access_ok(VERIFY_READ, (void __user *) start, len)) return -ENOMEM; - /* This also avoids any overflows on PAGE_CACHE_ALIGN */ + /* This also avoids any overflows on PAGE_ALIGN */ pages = len >> PAGE_SHIFT; pages += (offset_in_page(len)) != 0; diff --git a/mm/mm_init.c b/mm/mm_init.c index fdadf918de76..5b72266b4b03 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -55,13 +55,12 @@ void __init mminit_verify_zonelist(void) /* Iterate the zonelist */ for_each_zone_zonelist(zone, z, zonelist, zoneid) { #ifdef CONFIG_NUMA - printk(KERN_CONT "%d:%s ", - zone->node, zone->name); + pr_cont("%d:%s ", zone->node, zone->name); #else - printk(KERN_CONT "0:%s ", zone->name); + pr_cont("0:%s ", zone->name); #endif /* CONFIG_NUMA */ } - printk(KERN_CONT "\n"); + pr_cont("\n"); } } } diff --git a/mm/mmap.c b/mm/mmap.c index 90e3b869a8b9..bd2e1a533bc1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -37,12 +37,12 @@ #include <linux/khugepaged.h> #include <linux/uprobes.h> #include <linux/rbtree_augmented.h> -#include <linux/sched/sysctl.h> #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> +#include <linux/pkeys.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -123,130 +123,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) } } - -int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ -int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ -unsigned long sysctl_overcommit_kbytes __read_mostly; -int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ -unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -/* - * Make sure vm_committed_as in one cacheline and not cacheline shared with - * other variables. It can be updated by several CPUs frequently. - */ -struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; - -/* - * The global memory commitment made in the system can be a metric - * that can be used to drive ballooning decisions when Linux is hosted - * as a guest. On Hyper-V, the host implements a policy engine for dynamically - * balancing memory across competing virtual machines that are hosted. - * Several metrics drive this policy engine including the guest reported - * memory commitment. - */ -unsigned long vm_memory_committed(void) -{ - return percpu_counter_read_positive(&vm_committed_as); -} -EXPORT_SYMBOL_GPL(vm_memory_committed); - -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - * - * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. - * - * Note this is a helper function intended to be used by LSMs which - * wish to use this logic. 
- */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) -{ - long free, allowed, reserve; - - VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < - -(s64)vm_committed_as_batch * num_online_cpus(), - "memory commitment underflow"); - - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); - free += global_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Leave reserved pages. The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) - goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; - } - - allowed = vm_commit_limit(); - /* - * Reserve some for root - */ - if (!cap_sys_admin) - allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - /* - * Don't let a single process grow so big a user can't recover - */ - if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed -= min_t(long, mm->total_vm / 32, reserve); - } - - if (percpu_counter_read_positive(&vm_committed_as) < allowed) - return 0; -error: - vm_unacct_memory(pages); - - return -ENOMEM; -} - /* * Requires inode->i_mapping->i_mmap_rwsem */ @@ -1270,6 +1146,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long pgoff, unsigned long *populate) { struct mm_struct *mm = current->mm; + int pkey = 0; *populate = 0; @@ -1309,11 +1186,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (offset_in_page(addr)) return addr; + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(mm); + if (pkey < 0) + pkey = 0; + } + /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) @@ -2642,9 +2525,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); if (prot) return ret; @@ -3010,8 +2892,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { if (ignore_rlimit_data) - pr_warn_once("%s (%d): VmData %lu exceed data ulimit " - "%lu. 
Will be forbidden soon.\n", + pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA)); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5fbdd367bbed..f4259e496f83 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Qumranet, Inc. * Copyright (C) 2008 SGI - * Christoph Lameter <clameter@sgi.com> + * Christoph Lameter <cl@linux.com> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. diff --git a/mm/mprotect.c b/mm/mprotect.c index f7cb3d4d9c2e..b650c5412f58 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -24,6 +24,7 @@ #include <linux/migrate.h> #include <linux/perf_event.h> #include <linux/ksm.h> +#include <linux/pkeys.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -354,10 +355,13 @@ fail: SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, unsigned long, prot) { - unsigned long vm_flags, nstart, end, tmp, reqprot; + unsigned long nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; int error = -EINVAL; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + const bool rier = (current->personality & READ_IMPLIES_EXEC) && + (prot & PROT_READ); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ return -EINVAL; @@ -374,13 +378,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, return -EINVAL; reqprot = prot; - /* - * Does the application expect PROT_READ to imply PROT_EXEC: - */ - if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) - prot |= PROT_EXEC; - - vm_flags = calc_vm_prot_bits(prot); down_write(¤t->mm->mmap_sem); @@ -411,10 +408,15 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; + int pkey = arch_override_mprotect_pkey(vma, prot, -1); /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vm_flags; + /* Does the application expect PROT_READ to imply PROT_EXEC */ + if (rier && (vma->vm_flags & VM_MAYEXEC)) + prot |= PROT_EXEC; + + newflags = calc_vm_prot_bits(prot, pkey); newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ @@ -445,6 +447,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -ENOMEM; goto out; } + prot = reqprot; } out: up_write(¤t->mm->mmap_sem); diff --git a/mm/mremap.c b/mm/mremap.c index 8eeba02fc991..3fa0a467df66 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -20,7 +20,6 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/mmu_notifier.h> -#include <linux/sched/sysctl.h> #include <linux/uaccess.h> #include <linux/mm-arch-hooks.h> @@ -214,8 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, continue; VM_BUG_ON(pmd_trans_huge(*old_pmd)); } - if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, - new_pmd, new_addr)) + if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 99feb2b07fc5..bd05a70f44b9 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -288,7 +288,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, /* * Whoops, we cannot satisfy the allocation request. 
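The mprotect() rework above moves both the READ_IMPLIES_EXEC personality fixup and the protection-key selection into the per-VMA loop, since the effective pkey can differ from one VMA to the next. A hypothetical helper (not in the patch) collecting that per-iteration computation, using only calls that appear in the hunks:

        static unsigned long mprotect_vma_flags(struct vm_area_struct *vma,
                                                unsigned long prot, bool rier)
        {
                int pkey = arch_override_mprotect_pkey(vma, prot, -1);

                /* legacy personalities: PROT_READ implies PROT_EXEC, per VMA */
                if (rier && (vma->vm_flags & VM_MAYEXEC))
                        prot |= PROT_EXEC;

                return calc_vm_prot_bits(prot, pkey) |
                       (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
        }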
*/ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } @@ -360,7 +360,7 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (ptr) return ptr; - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } diff --git a/mm/nommu.c b/mm/nommu.c index fbf6f0f1d6c9..c8bd59a03c71 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -33,7 +33,6 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/audit.h> -#include <linux/sched/sysctl.h> #include <linux/printk.h> #include <asm/uaccess.h> @@ -48,33 +47,11 @@ struct page *mem_map; unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; -struct percpu_counter vm_committed_as; -int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ -int sysctl_overcommit_ratio = 50; /* default is 50% */ -unsigned long sysctl_overcommit_kbytes __read_mostly; -int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; -unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ -unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; -/* - * The global memory commitment made in the system can be a metric - * that can be used to drive ballooning decisions when Linux is hosted - * as a guest. On Hyper-V, the host implements a policy engine for dynamically - * balancing memory across competing virtual machines that are hosted. - * Several metrics drive this policy engine including the guest reported - * memory commitment. 
- */ -unsigned long vm_memory_committed(void) -{ - return percpu_counter_read_positive(&vm_committed_as); -} - -EXPORT_SYMBOL_GPL(vm_memory_committed); - EXPORT_SYMBOL(mem_map); /* list of mapped, potentially shareable regions */ @@ -162,7 +139,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (pages) { pages[i] = virt_to_page(start); if (pages[i]) - page_cache_get(pages[i]); + get_page(pages[i]); } if (vmas) vmas[i] = vma; @@ -182,8 +159,7 @@ finish_or_fault: * slab page or a secondary page from a compound page * - don't permit access to VMAs that don't support it, such as I/O mappings */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { @@ -194,18 +170,16 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); + return __get_user_pages(current, current->mm, start, nr_pages, flags, + pages, vmas, NULL); } EXPORT_SYMBOL(get_user_pages); -long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - int *locked) +long get_user_pages_locked(unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) { - return get_user_pages(tsk, mm, start, nr_pages, write, force, - pages, NULL); + return get_user_pages(start, nr_pages, write, force, pages, NULL); } EXPORT_SYMBOL(get_user_pages_locked); @@ -216,19 +190,18 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, { long ret; down_read(&mm->mmap_sem); - ret = get_user_pages(tsk, mm, start, nr_pages, write, force, - pages, NULL); + ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages, + NULL, NULL); up_read(&mm->mmap_sem); return ret; } EXPORT_SYMBOL(__get_user_pages_unlocked); -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, - force, pages, 0); + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, + write, force, pages, 0); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -1084,7 +1057,7 @@ static unsigned long determine_vm_flags(struct file *file, { unsigned long vm_flags; - vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); + vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags); /* vm_flags |= mm->def_flags; */ if (!(capabilities & NOMMU_MAP_DIRECT)) { @@ -1829,100 +1802,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - * - * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 
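Both the nommu hunks above and the MMU-side hunks earlier (mm/memory.c, mm/mempolicy.c) narrow the get_user_pages() family: the task and mm arguments are gone and current->mm is implied, while pinning pages in another process' address space now goes through get_user_pages_remote(). A sketch of the two calling conventions; addr, tsk, mm and a read-locked mmap_sem are assumed from the caller:

        struct page *page;
        struct vm_area_struct *vma;
        long got;

        /* current task: current->mm is implicit */
        got = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &page, NULL);
        if (got == 1)
                put_page(page);

        /* another task's mm: the explicit remote variant keeps the old form */
        got = get_user_pages_remote(tsk, mm, addr & PAGE_MASK, 1, 0, 0,
                                    &page, &vma);
        if (got == 1)
                put_page(page);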
- * - * Note this is a helper function intended to be used by LSMs which - * wish to use this logic. - */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) -{ - long free, allowed, reserve; - - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); - free += global_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Leave reserved pages. The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) - goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; - } - - allowed = vm_commit_limit(); - /* - * Reserve some 3% for root - */ - if (!cap_sys_admin) - allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - /* - * Don't let a single process grow so big a user can't recover - */ - if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed -= min_t(long, mm->total_vm / 32, reserve); - } - - if (percpu_counter_read_positive(&vm_committed_as) < allowed) - return 0; - -error: - vm_unacct_memory(pages); - - return -ENOMEM; -} - int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e97a05d9621f..86349586eacb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -35,6 +35,11 @@ #include <linux/freezer.h> #include <linux/ftrace.h> #include <linux/ratelimit.h> +#include <linux/kthread.h> +#include <linux/init.h> + +#include <asm/tlb.h> +#include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/oom.h> @@ -287,9 +292,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && !is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - return OOM_SCAN_OK; } @@ -386,8 +388,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, " - "oom_score_adj=%hd\n", + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); @@ -409,6 +410,176 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); bool oom_killer_disabled __read_mostly; +#define K(x) ((x) << (PAGE_SHIFT-10)) + +#ifdef CONFIG_MMU +/* + * OOM Reaper kernel thread which tries to reap the memory used by the OOM + * victim (if that is possible) to help the OOM killer to move on. 
+ */ +static struct task_struct *oom_reaper_th; +static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +static struct task_struct *oom_reaper_list; +static DEFINE_SPINLOCK(oom_reaper_lock); + + +static bool __oom_reap_task(struct task_struct *tsk) +{ + struct mmu_gather tlb; + struct vm_area_struct *vma; + struct mm_struct *mm; + struct task_struct *p; + struct zap_details details = {.check_swap_entries = true, + .ignore_dirty = true}; + bool ret = true; + + /* + * Make sure we find the associated mm_struct even when the particular + * thread has already terminated and cleared its mm. + * We might have race with exit path so consider our work done if there + * is no mm. + */ + p = find_lock_task_mm(tsk); + if (!p) + return true; + + mm = p->mm; + if (!atomic_inc_not_zero(&mm->mm_users)) { + task_unlock(p); + return true; + } + + task_unlock(p); + + if (!down_read_trylock(&mm->mmap_sem)) { + ret = false; + goto out; + } + + tlb_gather_mmu(&tlb, mm, 0, -1); + for (vma = mm->mmap ; vma; vma = vma->vm_next) { + if (is_vm_hugetlb_page(vma)) + continue; + + /* + * mlocked VMAs require explicit munlocking before unmap. + * Let's keep it simple here and skip such VMAs. + */ + if (vma->vm_flags & VM_LOCKED) + continue; + + /* + * Only anonymous pages have a good chance to be dropped + * without additional steps which we cannot afford as we + * are OOM already. + * + * We do not even care about fs backed pages because all + * which are reclaimable have already been reclaimed and + * we do not want to block exit_mmap by keeping mm ref + * count elevated without a good reason. + */ + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) + unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, + &details); + } + tlb_finish_mmu(&tlb, 0, -1); + pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", + task_pid_nr(tsk), tsk->comm, + K(get_mm_counter(mm, MM_ANONPAGES)), + K(get_mm_counter(mm, MM_FILEPAGES)), + K(get_mm_counter(mm, MM_SHMEMPAGES))); + up_read(&mm->mmap_sem); + + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore. OOM killer can continue + * by selecting other victim if unmapping hasn't led to any + * improvements. This also means that selecting this task doesn't + * make any sense. + */ + tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; + exit_oom_victim(tsk); +out: + mmput(mm); + return ret; +} + +#define MAX_OOM_REAP_RETRIES 10 +static void oom_reap_task(struct task_struct *tsk) +{ + int attempts = 0; + + /* Retry the down_read_trylock(mmap_sem) a few times */ + while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) + schedule_timeout_idle(HZ/10); + + if (attempts > MAX_OOM_REAP_RETRIES) { + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + debug_show_all_locks(); + } + + /* Drop a reference taken by wake_oom_reaper */ + put_task_struct(tsk); +} + +static int oom_reaper(void *unused) +{ + set_freezable(); + + while (true) { + struct task_struct *tsk = NULL; + + wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); + spin_lock(&oom_reaper_lock); + if (oom_reaper_list != NULL) { + tsk = oom_reaper_list; + oom_reaper_list = tsk->oom_reaper_list; + } + spin_unlock(&oom_reaper_lock); + + if (tsk) + oom_reap_task(tsk); + } + + return 0; +} + +static void wake_oom_reaper(struct task_struct *tsk) +{ + if (!oom_reaper_th) + return; + + /* tsk is already queued? 
*/ + if (tsk == oom_reaper_list || tsk->oom_reaper_list) + return; + + get_task_struct(tsk); + + spin_lock(&oom_reaper_lock); + tsk->oom_reaper_list = oom_reaper_list; + oom_reaper_list = tsk; + spin_unlock(&oom_reaper_lock); + wake_up(&oom_reaper_wait); +} + +static int __init oom_init(void) +{ + oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); + if (IS_ERR(oom_reaper_th)) { + pr_err("Unable to start OOM reaper %ld. Continuing regardless\n", + PTR_ERR(oom_reaper_th)); + oom_reaper_th = NULL; + } + return 0; +} +subsys_initcall(oom_init) +#else +static void wake_oom_reaper(struct task_struct *tsk) +{ +} +#endif + /** * mark_oom_victim - mark the given task as OOM victim * @tsk: task to mark @@ -435,9 +606,10 @@ void mark_oom_victim(struct task_struct *tsk) /** * exit_oom_victim - note the exit of an OOM victim */ -void exit_oom_victim(void) +void exit_oom_victim(struct task_struct *tsk) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -459,15 +631,11 @@ void exit_oom_victim(void) bool oom_killer_disable(void) { /* - * Make sure to not race with an ongoing OOM killer - * and that the current is not the victim. + * Make sure to not race with an ongoing OOM killer. Check that the + * current is not killed (possibly due to sharing the victim's memory). */ - mutex_lock(&oom_lock); - if (test_thread_flag(TIF_MEMDIE)) { - mutex_unlock(&oom_lock); + if (mutex_lock_killable(&oom_lock)) return false; - } - oom_killer_disabled = true; mutex_unlock(&oom_lock); @@ -502,7 +670,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) return false; } -#define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon * returning. @@ -518,6 +685,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + bool can_oom_reap = true; /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -608,17 +776,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD)) - continue; - if (is_global_init(p)) - continue; - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || + p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + /* + * We cannot use oom_reaper for the mm shared by this + * process because it wouldn't get killed and so the + * memory might be still used. 
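The reaper's unmap pass above is what the two zap_details fields added in the mm/memory.c hunks earlier exist for: a dying victim's address space is torn down without writing back dirty file pages and without skipping swap entries. A condensed sketch of that call (the real loop also skips hugetlb and mlocked VMAs; tlb, mm and vma as in the patch):

        struct zap_details details = {
                .check_swap_entries = true,     /* process swap entries instead of leaving them */
                .ignore_dirty = true,           /* dirty file pages cannot be torn down here */
        };

        tlb_gather_mmu(&tlb, mm, 0, -1);
        for (vma = mm->mmap; vma; vma = vma->vm_next)
                if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
                        unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
                                         &details);
        tlb_finish_mmu(&tlb, 0, -1);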
+ */ + can_oom_reap = false; continue; - + } do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); + if (can_oom_reap) + wake_oom_reaper(victim); + mmdrop(mm); put_task_struct(victim); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 11ff8f758631..bc5149d5ec38 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1910,7 +1910,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (gdtc->dirty > gdtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc)) + if (wb_stat(wb, WB_RECLAIMABLE) > + wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) return true; if (mdtc) { @@ -1924,7 +1925,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (mdtc->dirty > mdtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc)) + if (wb_stat(wb, WB_RECLAIMABLE) > + wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) return true; } @@ -2176,8 +2178,8 @@ int write_cache_pages(struct address_space *mapping, cycled = 0; end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ @@ -2382,14 +2384,14 @@ int write_one_page(struct page *page, int wait) wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - page_cache_get(page); + get_page(page); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; } - page_cache_release(page); + put_page(page); } else { unlock_page(page); } @@ -2431,7 +2433,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) __inc_zone_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); __inc_wb_stat(wb, WB_DIRTIED); - task_io_account_write(PAGE_CACHE_SIZE); + task_io_account_write(PAGE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); } @@ -2450,7 +2452,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); - task_io_account_cancelled_write(PAGE_CACHE_SIZE); + task_io_account_cancelled_write(PAGE_SIZE); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c46b75d14b6f..c1069efcc4d7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_scale_factor = 10; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -307,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { + unsigned long max_initialise; + /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. 
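The deferred-initialisation bound being introduced here is easy to sanity-check: node_spanned_pages >> 8 is 1/256 of the node, matching the roughly 1 GB of system hash tables per 0.25 TB of memory mentioned in the comment, while 2UL << (30 - PAGE_SHIFT) is 2 GB worth of pages. A 1 TB node therefore gets max(2 GB, 4 GB) = 4 GB initialised early instead of the previous flat 2 GB; the expression, as in the hunk that follows:

        max_initialise = max(2UL << (30 - PAGE_SHIFT),          /* 2 GB of pages */
                             pgdat->node_spanned_pages >> 8);   /* 1/256 of node */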
+ */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); - /* Initialise at least 2G of the highest zone */ (*nr_initialised)++; - if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + if ((*nr_initialised > max_initialise) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -498,6 +506,7 @@ void prep_compound_page(struct page *page, unsigned int order) unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) @@ -542,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf) unsigned long res; if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { - printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + pr_err("Bad debug_guardpage_minorder value\n"); return 0; } _debug_guardpage_minorder = res; - printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); @@ -683,34 +692,28 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; - unsigned int max_order = MAX_ORDER; + unsigned int max_order; + + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (is_migrate_isolate(migratetype)) { - /* - * We restrict max order of merging to prevent merge - * between freepages on isolate pageblock and normal - * pageblock. Without this, pageblock isolation - * could cause incorrect freepage accounting. - */ - max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); - } else { + if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - } - page_idx = pfn & ((1 << max_order) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); +continue_merging: while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) - break; + goto done_merging; /* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. @@ -727,6 +730,32 @@ static inline void __free_one_page(struct page *page, page_idx = combined_idx; order++; } + if (max_order < MAX_ORDER) { + /* If we are here, it means order is >= pageblock_order. + * We want to prevent merge between freepages on isolate + * pageblock and normal pageblock. Without this, pageblock + * isolation could cause incorrect freepage or CMA accounting. + * + * We don't want to hit this code for the more frequent + * low-order merging. 
+ */ + if (unlikely(has_isolate_pageblock(zone))) { + int buddy_mt; + + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + buddy_mt = get_pageblock_migratetype(buddy); + + if (migratetype != buddy_mt + && (is_migrate_isolate(migratetype) || + is_migrate_isolate(buddy_mt))) + goto done_merging; + } + max_order++; + goto continue_merging; + } + +done_merging: set_page_order(page, order); /* @@ -764,7 +793,7 @@ static inline int free_pages_check(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; @@ -1460,7 +1489,7 @@ static inline int check_new_page(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; @@ -2348,19 +2377,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, list_del(&page->lru); pcp->count--; } else { - if (unlikely(gfp_flags & __GFP_NOFAIL)) { - /* - * __GFP_NOFAIL is not to be used in new code. - * - * All __GFP_NOFAIL callers should be fixed so that they - * properly detect and handle allocation failures. - * - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with - * __GFP_NOFAIL. - */ - WARN_ON_ONCE(order > 1); - } + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); spin_lock_irqsave(&zone->lock, flags); page = NULL; @@ -2857,8 +2878,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * XXX: Page reclaim didn't yield anything, * and the OOM killer can't be invoked, but * keep looping as per tradition. + * + * But do not keep looping if oom_killer_disable() + * was already called, for the system is trying to + * enter a quiescent state during suspend. */ - *did_some_progress = 1; + *did_some_progress = !oom_killer_disabled; goto out; } if (pm_suspended_storage()) @@ -3117,14 +3142,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; - /* - * If this allocation cannot block and it is for a specific node, then - * fail early. There's no need to wakeup kswapd or retry for a - * speculative node-specific allocation. - */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) - goto nopage; - retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); @@ -3481,7 +3498,7 @@ refill: /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. 
*/ - atomic_add(size - 1, &page->_count); + page_ref_add(page, size - 1); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); @@ -3493,7 +3510,7 @@ refill: if (unlikely(offset < 0)) { page = virt_to_page(nc->va); - if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) goto refill; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) @@ -3501,7 +3518,7 @@ refill: size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - atomic_set(&page->_count, size); + set_page_count(page, size); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = size; @@ -3712,6 +3729,49 @@ static inline void show_node(struct zone *zone) printk("Node %d ", zone_to_nid(zone)); } +long si_mem_available(void) +{ + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + */ + available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + return available; +} +EXPORT_SYMBOL_GPL(si_mem_available); + void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; @@ -4044,9 +4104,7 @@ static int __parse_numa_zonelist_order(char *s) } else if (*s == 'z' || *s == 'Z') { user_zonelist_order = ZONELIST_ORDER_ZONE; } else { - printk(KERN_WARNING - "Ignoring invalid numa_zonelist_order value: " - "%s\n", s); + pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4510,12 +4568,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. " - "Total pages: %ld\n", - nr_online_nodes, - zonelist_order_name[current_zonelist_order], - page_group_by_mobility_disabled ? "off" : "on", - vm_total_pages); + pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + nr_online_nodes, + zonelist_order_name[current_zonelist_order], + page_group_by_mobility_disabled ? 
"off" : "on", + vm_total_pages); #ifdef CONFIG_NUMA pr_info("Policy zone: %s\n", zone_names[policy_zone]); #endif @@ -5404,6 +5461,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_COMPACTION + init_waitqueue_head(&pgdat->kcompactd_wait); +#endif pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { @@ -5428,8 +5488,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else - printk(KERN_WARNING - " %s zone: %lu pages exceeds freesize %lu\n", + pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); } @@ -5637,8 +5696,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) min_pfn = min(min_pfn, start_pfn); if (min_pfn == ULONG_MAX) { - printk(KERN_WARNING - "Could not find start_pfn for node %d\n", nid); + pr_warn("Could not find start_pfn for node %d\n", nid); return 0; } @@ -6110,22 +6168,21 @@ void __init mem_init_print_info(const char *str) #undef adj_init_size - pr_info("Memory: %luK/%luK available " - "(%luK kernel code, %luK rwdata, %luK rodata, " - "%luK init, %luK bss, %luK reserved, %luK cma-reserved" + pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" #ifdef CONFIG_HIGHMEM - ", %luK highmem" + ", %luK highmem" #endif - "%s%s)\n", - nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), - totalcma_pages << (PAGE_SHIFT-10), + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT - 10), + physpages << (PAGE_SHIFT - 10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT-10), + totalhigh_pages << (PAGE_SHIFT - 10), #endif - str ? ", " : "", str ? str : ""); + str ? ", " : "", str ? str : ""); } /** @@ -6300,8 +6357,17 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_MIN] = tmp; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + /* + * Set the kswapd watermarks distance according to the + * scale factor in proportion to available memory, but + * ensure a minimum size on small systems. 
+ */ + tmp = max_t(u64, tmp >> 2, + mult_frac(zone->managed_pages, + watermark_scale_factor, 10000)); + + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6419,7 +6485,7 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_wmark_min) +core_initcall(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -6442,6 +6508,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + setup_per_zone_wmarks(); + + return 0; +} + #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -6633,11 +6714,8 @@ void *__init alloc_large_system_hash(const char *tablename, if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", - tablename, - (1UL << log2qty), - ilog2(size) - PAGE_SHIFT, - size); + pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", + tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); if (_hash_shift) *_hash_shift = log2qty; @@ -6788,7 +6866,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * This check already skips compound tails of THP * because their page->_count is zero at all time. 
*/ - if (!atomic_read(&page->_count)) { + if (!page_ref_count(page)) { if (PageBuddy(page)) iter += (1 << page_order(page)) - 1; continue; @@ -7138,8 +7216,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); #ifdef CONFIG_DEBUG_VM - printk(KERN_INFO "remove from free list %lx %d %lx\n", - pfn, 1 << order, end_pfn); + pr_info("remove from free list %lx %d %lx\n", + pfn, 1 << order, end_pfn); #endif list_del(&page->lru); rmv_page_order(page); @@ -7152,7 +7230,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif -#ifdef CONFIG_MEMORY_FAILURE bool is_free_buddy_page(struct page *page) { struct zone *zone = page_zone(page); @@ -7171,4 +7248,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } -#endif diff --git a/mm/page_io.c b/mm/page_io.c index b995a5ba5e8f..985f23cfa79b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -56,31 +56,20 @@ void end_swap_bio_write(struct bio *bio) * Also clear PG_reclaim to avoid rotate_reclaimable_page() */ set_page_dirty(page); - printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert("Write-error on swap-device (%u:%u:%llu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } end_page_writeback(page); bio_put(bio); } -static void end_swap_bio_read(struct bio *bio) +static void swap_slot_free_notify(struct page *page) { - struct page *page = bio->bi_io_vec[0].bv_page; - - if (bio->bi_error) { - SetPageError(page); - ClearPageUptodate(page); - printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_iter.bi_sector); - goto out; - } - - SetPageUptodate(page); + struct swap_info_struct *sis; + struct gendisk *disk; /* * There is no guarantee that the page is in swap cache - the software @@ -88,42 +77,59 @@ static void end_swap_bio_read(struct bio *bio) * swapcache page. So we must check PG_swapcache before proceeding with * this optimization. */ - if (likely(PageSwapCache(page))) { - struct swap_info_struct *sis; + if (unlikely(!PageSwapCache(page))) + return; - sis = page_swap_info(page); - if (sis->flags & SWP_BLKDEV) { - /* - * The swap subsystem performs lazy swap slot freeing, - * expecting that the page will be swapped out again. - * So we can avoid an unnecessary write if the page - * isn't redirtied. - * This is good for real swap storage because we can - * reduce unnecessary I/O and enhance wear-leveling - * if an SSD is used as the as swap device. - * But if in-memory swap device (eg zram) is used, - * this causes a duplicated copy between uncompressed - * data in VM-owned memory and compressed data in - * zram-owned memory. So let's free zram-owned memory - * and make the VM-owned decompressed page *dirty*, - * so the page should be swapped out somewhere again if - * we again wish to reclaim it. - */ - struct gendisk *disk = sis->bdev->bd_disk; - if (disk->fops->swap_slot_free_notify) { - swp_entry_t entry; - unsigned long offset; + sis = page_swap_info(page); + if (!(sis->flags & SWP_BLKDEV)) + return; - entry.val = page_private(page); - offset = swp_offset(entry); + /* + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. 
+ * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if an SSD is used as the as swap device. + * But if in-memory swap device (eg zram) is used, + * this causes a duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it. + */ + disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; - SetPageDirty(page); - disk->fops->swap_slot_free_notify(sis->bdev, - offset); - } - } + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); } +} + +static void end_swap_bio_read(struct bio *bio) +{ + struct page *page = bio->bi_io_vec[0].bv_page; + if (bio->bi_error) { + SetPageError(page); + ClearPageUptodate(page); + pr_alert("Read-error on swap-device (%u:%u:%llu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); + goto out; + } + + SetPageUptodate(page); + swap_slot_free_notify(page); out: unlock_page(page); bio_put(bio); @@ -216,7 +222,7 @@ reprobe: out: return ret; bad_bmap: - printk(KERN_ERR "swapon: swapfile has holes\n"); + pr_err("swapon: swapfile has holes\n"); ret = -EINVAL; goto out; } @@ -246,7 +252,7 @@ out: static sector_t swap_page_sector(struct page *page) { - return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9); + return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9); } int __swap_writepage(struct page *page, struct writeback_control *wbc, @@ -290,8 +296,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, */ set_page_dirty(page); ClearPageReclaim(page); - pr_err_ratelimited("Write error on dio swapfile (%Lu)\n", - page_file_offset(page)); + pr_err_ratelimited("Write error on dio swapfile (%llu)\n", + page_file_offset(page)); } end_page_writeback(page); return ret; @@ -347,6 +353,11 @@ int swap_readpage(struct page *page) ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); if (!ret) { + if (trylock_page(page)) { + swap_slot_free_notify(page); + unlock_page(page); + } + count_vm_event(PSWPIN); return 0; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 92c4c36501e7..c4f568206544 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -215,7 +215,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 1 if all pages in the range are isolated. + * Returns the last tested pfn. */ static unsigned long __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, @@ -289,11 +289,11 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, * now as a simple work-around, we use the next node for destination. 
*/ if (PageHuge(page)) { - nodemask_t src = nodemask_of_node(page_to_nid(page)); - nodemask_t dst; - nodes_complement(dst, src); + int node = next_online_node(page_to_nid(page)); + if (node == MAX_NUMNODES) + node = first_online_node; return alloc_huge_page_node(page_hstate(compound_head(page)), - next_node(page_to_nid(page), dst)); + node); } if (PageHighMem(page)) diff --git a/mm/page_owner.c b/mm/page_owner.c index 44ad1f00c4e1..ac3d8d129974 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -198,9 +198,8 @@ void __dump_page_owner(struct page *page) return; } - pr_alert("page allocated via order %u, migratetype %s, " - "gfp_mask %#x(%pGg)\n", page_ext->order, - migratetype_names[mt], gfp_mask, &gfp_mask); + pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", + page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); print_stack_trace(&trace, 0); if (page_ext->last_migrate_reason != -1) diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 10e3d0b8a86d..d66911ff42d9 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -95,7 +95,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) /* all units must be in a single group */ if (ai->nr_groups != 1) { - printk(KERN_CRIT "percpu: can't handle more than one groups\n"); + pr_crit("can't handle more than one group\n"); return -EINVAL; } @@ -103,8 +103,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) alloc_pages = roundup_pow_of_two(nr_pages); if (alloc_pages > nr_pages) - printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n", - alloc_pages - nr_pages); + pr_warn("wasting %zu pages per chunk\n", + alloc_pages - nr_pages); return 0; } diff --git a/mm/percpu.c b/mm/percpu.c index 998607adf6eb..0c59684f1ff2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -53,6 +53,8 @@ * setup the first chunk containing the kernel static percpu area */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/bitmap.h> #include <linux/bootmem.h> #include <linux/err.h> @@ -888,8 +890,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, size = ALIGN(size, 2); if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { - WARN(true, "illegal size (%zu) or align (%zu) for " - "percpu allocation\n", size, align); + WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n", + size, align); return NULL; } @@ -1033,11 +1035,11 @@ fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: if (!is_atomic && warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", - size, align, is_atomic, err); + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); dump_stack(); if (!--warn_limit) - pr_info("PERCPU: limit reached, disable warning\n"); + pr_info("limit reached, disable warning\n"); } if (is_atomic) { /* see the flag handling in pcpu_blance_workfn() */ @@ -1449,20 +1451,20 @@ static void pcpu_dump_alloc_info(const char *lvl, for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { - printk(KERN_CONT "\n"); + pr_cont("\n"); printk("%spcpu-alloc: ", lvl); } - printk(KERN_CONT "[%0*d] ", group_width, group); + pr_cont("[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) - printk(KERN_CONT "%0*d ", cpu_width, - gi->cpu_map[unit]); + pr_cont("%0*d ", + cpu_width, gi->cpu_map[unit]); else - printk(KERN_CONT "%s ", empty_str); + pr_cont("%s ", empty_str); } } - 
printk(KERN_CONT "\n"); + pr_cont("\n"); } /** @@ -1538,8 +1540,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ - pr_emerg("PERCPU: failed to initialize, %s", #cond); \ - pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \ + pr_emerg("failed to initialize, %s\n", #cond); \ + pr_emerg("cpu_possible_mask=%*pb\n", \ cpumask_pr_args(cpu_possible_mask)); \ pcpu_dump_alloc_info(KERN_EMERG, ai); \ BUG(); \ @@ -1723,7 +1725,7 @@ static int __init percpu_alloc_setup(char *str) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else - pr_warning("PERCPU: unknown allocator %s specified\n", str); + pr_warn("unknown allocator %s specified\n", str); return 0; } @@ -2016,9 +2018,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { - pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " - "space 0x%lx\n", max_distance, - VMALLOC_TOTAL); + pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n", + max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; @@ -2026,7 +2027,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, #endif } - pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", + pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); @@ -2100,8 +2101,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { - pr_warning("PERCPU: failed to allocate %s page " - "for cpu%u\n", psize_str, cpu); + pr_warn("failed to allocate %s page for cpu%u\n", + psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ @@ -2140,7 +2141,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", + pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n", unit_pages, psize_str, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 06a005b979a7..71c5f9109f2a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -84,20 +84,6 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE - -/* - * ARCHes with special requirements for evicting THP backing TLB entries can - * implement this. Otherwise also, it can help optimize normal TLB flush in - * THP regime. stock flush_tlb_range() typically has optimization to nuke the - * entire TLB if flush span is greater than a threshold, which will - * likely be true for a single huge page. Thus a single thp flush will - * invalidate the entire TLB which is not desirable. - * e.g. 
see arch/arc: flush_pmd_tlb_range - */ -#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -#endif - #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 5d453e58ddbf..07514d41ebcc 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -98,9 +98,14 @@ static int process_vm_rw_single_vec(unsigned long addr, int pages = min(nr_pages, max_pages_per_loop); size_t bytes; - /* Get the pages we're interested in */ - pages = get_user_pages_unlocked(task, mm, pa, pages, - vm_write, 0, process_pages); + /* + * Get the pages we're interested in. We must + * add FOLL_REMOTE because task/mm might not + * current/current->mm + */ + pages = __get_user_pages_unlocked(task, mm, pa, pages, + vm_write, 0, process_pages, + FOLL_REMOTE); if (pages <= 0) return -EFAULT; diff --git a/mm/quicklist.c b/mm/quicklist.c index 942212970529..daf6ff6e199a 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -8,7 +8,7 @@ * improved on it. * * Copyright (C) 2007 SGI, - * Christoph Lameter <clameter@sgi.com> + * Christoph Lameter <cl@linux.com> * Generalized, added support for multiple lists and * constructors / destructors. */ diff --git a/mm/readahead.c b/mm/readahead.c index 20e58e820e44..40be3ae0afe3 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -47,11 +47,11 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping, if (!trylock_page(page)) BUG(); page->mapping = mapping; - do_invalidatepage(page, 0, PAGE_CACHE_SIZE); + do_invalidatepage(page, 0, PAGE_SIZE); page->mapping = NULL; unlock_page(page); } - page_cache_release(page); + put_page(page); } /* @@ -93,14 +93,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, read_cache_pages_invalidate_page(mapping, page); continue; } - page_cache_release(page); + put_page(page); ret = filler(data, page); if (unlikely(ret)) { read_cache_pages_invalidate_pages(mapping, pages); break; } - task_io_account_read(PAGE_CACHE_SIZE); + task_io_account_read(PAGE_SIZE); } return ret; } @@ -130,7 +130,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, mapping_gfp_constraint(mapping, GFP_KERNEL))) { mapping->a_ops->readpage(filp, page); } - page_cache_release(page); + put_page(page); } ret = 0; @@ -163,7 +163,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, if (isize == 0) goto out; - end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + end_index = ((isize - 1) >> PAGE_SHIFT); /* * Preallocate as many pages as we will need. 
@@ -216,7 +216,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, while (nr_to_read) { int err; - unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE; + unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; @@ -425,7 +425,7 @@ ondemand_readahead(struct address_space *mapping, * trivial case: (offset - prev_offset) == 1 * unaligned reads: (offset - prev_offset) == 0 */ - prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; + prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT; if (offset - prev_offset <= 1UL) goto initial_readahead; @@ -558,8 +558,8 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) if (f.file) { if (f.file->f_mode & FMODE_READ) { struct address_space *mapping = f.file->f_mapping; - pgoff_t start = offset >> PAGE_CACHE_SHIFT; - pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + count - 1) >> PAGE_SHIFT; unsigned long len = end - start + 1; ret = do_readahead(mapping, f.file, start, len); } diff --git a/mm/rmap.c b/mm/rmap.c index 02f0bfc3c80a..307b555024ef 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -569,19 +569,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) } #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -static void percpu_flush_tlb_batch_pages(void *data) -{ - /* - * All TLB entries are flushed on the assumption that it is - * cheaper to flush all TLBs and let them be refilled than - * flushing individual PFNs. Note that we do not track mm's - * to flush as that might simply be multiple full TLB flushes - * for no gain. - */ - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); - flush_tlb_local(); -} - /* * Flush TLB entries for recently unmapped pages from remote CPUs. 
It is * important if a PTE was dirty when it was unmapped that it's flushed @@ -598,15 +585,14 @@ void try_to_unmap_flush(void) cpu = get_cpu(); - trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL); - - if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) - percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask); - - if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) { - smp_call_function_many(&tlb_ubc->cpumask, - percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true); + if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); } + + if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) + flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL); cpumask_clear(&tlb_ubc->cpumask); tlb_ubc->flush_required = false; tlb_ubc->writable = false; @@ -1431,6 +1417,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) goto out; + if (flags & TTU_SPLIT_HUGE_PMD) { + split_huge_pmd_address(vma, address, + flags & TTU_MIGRATION, page); + /* check if we have anything to do after split */ + if (page_mapcount(page) == 0) + goto out; + } + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -1547,7 +1541,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, discard: page_remove_rmap(page, PageHuge(page)); - page_cache_release(page); + put_page(page); out_unmap: pte_unmap_unlock(pte, ptl); @@ -1576,10 +1570,10 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return is_vma_temporary_stack(vma); } -static int page_not_mapped(struct page *page) +static int page_mapcount_is_zero(struct page *page) { - return !page_mapped(page); -}; + return !page_mapcount(page); +} /** * try_to_unmap - try to remove all page table mappings to a page @@ -1606,12 +1600,10 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = &rp, - .done = page_not_mapped, + .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; - VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); - /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the @@ -1623,9 +1615,12 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; - ret = rmap_walk(page, &rwc); + if (flags & TTU_RMAP_LOCKED) + ret = rmap_walk_locked(page, &rwc); + else + ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapped(page)) { + if (ret != SWAP_MLOCK && !page_mapcount(page)) { ret = SWAP_SUCCESS; if (rp.lazyfreed && !PageDirty(page)) ret = SWAP_LZFREE; @@ -1633,6 +1628,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) return ret; } +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked @@ -1715,14 +1715,21 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. 
*/ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) +static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, + bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = rmap_walk_anon_lock(page, rwc); + if (locked) { + anon_vma = page_anon_vma(page); + /* anon_vma disappear under us? */ + VM_BUG_ON_PAGE(!anon_vma, page); + } else { + anon_vma = rmap_walk_anon_lock(page, rwc); + } if (!anon_vma) return ret; @@ -1742,7 +1749,9 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) if (rwc->done && rwc->done(page)) break; } - anon_vma_unlock_read(anon_vma); + + if (!locked) + anon_vma_unlock_read(anon_vma); return ret; } @@ -1759,9 +1768,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) +static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, + bool locked) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); pgoff_t pgoff; struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -1778,7 +1788,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) return ret; pgoff = page_to_pgoff(page); - i_mmap_lock_read(mapping); + if (!locked) + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); @@ -1795,7 +1806,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) } done: - i_mmap_unlock_read(mapping); + if (!locked) + i_mmap_unlock_read(mapping); return ret; } @@ -1804,9 +1816,20 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc) if (unlikely(PageKsm(page))) return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc); + return rmap_walk_anon(page, rwc, false); + else + return rmap_walk_file(page, rwc, false); +} + +/* Like rmap_walk, but caller holds relevant rmap lock */ +int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +{ + /* no ksm support for now */ + VM_BUG_ON_PAGE(PageKsm(page), page); + if (PageAnon(page)) + return rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc); + return rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/shmem.c b/mm/shmem.c index 1acfdbc4bd9e..719bd6b88d98 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -75,8 +75,8 @@ static struct vfsmount *shm_mnt; #include "internal.h" -#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) -#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) +#define BLOCKS_PER_PAGE (PAGE_SIZE/512) +#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 @@ -176,13 +176,13 @@ static inline int shmem_reacct_size(unsigned long flags, static inline int shmem_acct_block(unsigned long flags) { return (flags & VM_NORESERVE) ? 
- security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0; + security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0; } static inline void shmem_unacct_blocks(unsigned long flags, long pages) { if (flags & VM_NORESERVE) - vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE)); + vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } static const struct super_operations shmem_ops; @@ -300,7 +300,7 @@ static int shmem_add_to_page_cache(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - page_cache_get(page); + get_page(page); page->mapping = mapping; page->index = index; @@ -318,7 +318,7 @@ static int shmem_add_to_page_cache(struct page *page, } else { page->mapping = NULL; spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); + put_page(page); } return error; } @@ -338,7 +338,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) __dec_zone_page_state(page, NR_FILE_PAGES); __dec_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); + put_page(page); BUG_ON(error); } @@ -376,28 +376,23 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { if (iter.index >= end) break; page = radix_tree_deref_slot(slot); - /* - * This should only be possible to happen at index 0, so we - * don't need to reset the counter, nor do we risk infinite - * restarts. - */ - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } if (radix_tree_exceptional_entry(page)) swapped++; if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } @@ -479,10 +474,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; - unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); - unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); + pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; + pgoff_t end = (lend + 1) >> PAGE_SHIFT; + unsigned int partial_start = lstart & (PAGE_SIZE - 1); + unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); struct pagevec pvec; pgoff_t indices[PAGEVEC_SIZE]; long nr_swaps_freed = 0; @@ -535,7 +530,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct page *page = NULL; shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); if (page) { - unsigned int top = PAGE_CACHE_SIZE; + unsigned int top = PAGE_SIZE; if (start > end) { top = partial_end; partial_end = 0; @@ -543,7 +538,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, zero_user_segment(page, partial_start, top); set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); } } if (partial_end) { @@ -553,7 +548,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, zero_user_segment(page, 0, partial_end); set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); } } if (start >= end) @@ -838,7 +833,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) mem_cgroup_commit_charge(page, memcg, true, false); out: 
unlock_page(page); - page_cache_release(page); + put_page(page); return error; } @@ -1085,7 +1080,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, if (!newpage) return -ENOMEM; - page_cache_get(newpage); + get_page(newpage); copy_highpage(newpage, oldpage); flush_dcache_page(newpage); @@ -1125,8 +1120,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, set_page_private(oldpage, 0); unlock_page(oldpage); - page_cache_release(oldpage); - page_cache_release(oldpage); + put_page(oldpage); + put_page(oldpage); return error; } @@ -1150,7 +1145,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, int once = 0; int alloced = 0; - if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) + if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: swap.val = 0; @@ -1161,7 +1156,7 @@ repeat: } if (sgp != SGP_WRITE && sgp != SGP_FALLOC && - ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; } @@ -1174,7 +1169,7 @@ repeat: if (sgp != SGP_READ) goto clear; unlock_page(page); - page_cache_release(page); + put_page(page); page = NULL; } if (page || (sgp == SGP_READ && !swap.val)) { @@ -1332,7 +1327,7 @@ clear: /* Perhaps the file has been truncated since we checked */ if (sgp != SGP_WRITE && sgp != SGP_FALLOC && - ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { if (alloced) { ClearPageDirty(page); delete_from_page_cache(page); @@ -1360,7 +1355,7 @@ failed: unlock: if (page) { unlock_page(page); - page_cache_release(page); + put_page(page); } if (error == -ENOSPC && !once++) { info = SHMEM_I(inode); @@ -1582,7 +1577,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); - pgoff_t index = pos >> PAGE_CACHE_SHIFT; + pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ if (unlikely(info->seals)) { @@ -1606,16 +1601,16 @@ shmem_write_end(struct file *file, struct address_space *mapping, i_size_write(inode, pos + copied); if (!PageUptodate(page)) { - if (copied < PAGE_CACHE_SIZE) { - unsigned from = pos & (PAGE_CACHE_SIZE - 1); + if (copied < PAGE_SIZE) { + unsigned from = pos & (PAGE_SIZE - 1); zero_user_segments(page, 0, from, - from + copied, PAGE_CACHE_SIZE); + from + copied, PAGE_SIZE); } SetPageUptodate(page); } set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); return copied; } @@ -1640,8 +1635,8 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!iter_is_iovec(to)) sgp = SGP_DIRTY; - index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + index = *ppos >> PAGE_SHIFT; + offset = *ppos & ~PAGE_MASK; for (;;) { struct page *page = NULL; @@ -1649,11 +1644,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) unsigned long nr, ret; loff_t i_size = i_size_read(inode); - end_index = i_size >> PAGE_CACHE_SHIFT; + end_index = i_size >> PAGE_SHIFT; if (index > end_index) break; if (index == end_index) { - nr = i_size & ~PAGE_CACHE_MASK; + nr = i_size & ~PAGE_MASK; if (nr <= offset) break; } @@ -1671,14 +1666,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * We must evaluate after, since reads (unlike writes) * are called without i_mutex protection against truncate */ - nr = PAGE_CACHE_SIZE; + nr = PAGE_SIZE; i_size = 
i_size_read(inode); - end_index = i_size >> PAGE_CACHE_SHIFT; + end_index = i_size >> PAGE_SHIFT; if (index == end_index) { - nr = i_size & ~PAGE_CACHE_MASK; + nr = i_size & ~PAGE_MASK; if (nr <= offset) { if (page) - page_cache_release(page); + put_page(page); break; } } @@ -1699,7 +1694,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) mark_page_accessed(page); } else { page = ZERO_PAGE(0); - page_cache_get(page); + get_page(page); } /* @@ -1709,10 +1704,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) ret = copy_page_to_iter(page, offset, nr, to); retval += ret; offset += ret; - index += offset >> PAGE_CACHE_SHIFT; - offset &= ~PAGE_CACHE_MASK; + index += offset >> PAGE_SHIFT; + offset &= ~PAGE_MASK; - page_cache_release(page); + put_page(page); if (!iov_iter_count(to)) break; if (ret < nr) { @@ -1722,7 +1717,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) cond_resched(); } - *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + *ppos = ((loff_t) index << PAGE_SHIFT) + offset; file_accessed(file); return retval ? retval : error; } @@ -1760,9 +1755,9 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (splice_grow_spd(pipe, &spd)) return -ENOMEM; - index = *ppos >> PAGE_CACHE_SHIFT; - loff = *ppos & ~PAGE_CACHE_MASK; - req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + index = *ppos >> PAGE_SHIFT; + loff = *ppos & ~PAGE_MASK; + req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT; nr_pages = min(req_pages, spd.nr_pages_max); spd.nr_pages = find_get_pages_contig(mapping, index, @@ -1779,7 +1774,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, index++; } - index = *ppos >> PAGE_CACHE_SHIFT; + index = *ppos >> PAGE_SHIFT; nr_pages = spd.nr_pages; spd.nr_pages = 0; @@ -1789,7 +1784,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (!len) break; - this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + this_len = min_t(unsigned long, len, PAGE_SIZE - loff); page = spd.pages[page_nr]; if (!PageUptodate(page) || page->mapping != mapping) { @@ -1798,19 +1793,19 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (error) break; unlock_page(page); - page_cache_release(spd.pages[page_nr]); + put_page(spd.pages[page_nr]); spd.pages[page_nr] = page; } isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + end_index = (isize - 1) >> PAGE_SHIFT; if (unlikely(!isize || index > end_index)) break; if (end_index == index) { unsigned int plen; - plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + plen = ((isize - 1) & ~PAGE_MASK) + 1; if (plen <= loff) break; @@ -1827,7 +1822,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, } while (page_nr < nr_pages) - page_cache_release(spd.pages[page_nr++]); + put_page(spd.pages[page_nr++]); if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); @@ -1909,10 +1904,10 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) else if (offset >= inode->i_size) offset = -ENXIO; else { - start = offset >> PAGE_CACHE_SHIFT; - end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + start = offset >> PAGE_SHIFT; + end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; new_offset = shmem_seek_hole_data(mapping, start, end, whence); - new_offset <<= PAGE_CACHE_SHIFT; + new_offset <<= PAGE_SHIFT; if (new_offset > offset) { if (new_offset < inode->i_size) offset = new_offset; @@ 
-1947,12 +1942,13 @@ static void shmem_tag_pins(struct address_space *mapping) start = 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } } else if (page_count(page) - page_mapcount(page) > 1) { spin_lock_irq(&mapping->tree_lock); radix_tree_tag_set(&mapping->page_tree, iter.index, @@ -1962,8 +1958,7 @@ restart: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2000,14 +1995,15 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, SHMEM_TAG_PINNED) { page = radix_tree_deref_slot(slot); if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } page = NULL; } @@ -2032,8 +2028,7 @@ restart: continue_resched: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2208,8 +2203,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto out; } - start = offset >> PAGE_CACHE_SHIFT; - end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + start = offset >> PAGE_SHIFT; + end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Try to avoid a swapstorm if len is impossible to satisfy */ if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { error = -ENOSPC; @@ -2242,8 +2237,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (error) { /* Remove the !PageUptodate pages we added */ shmem_undo_range(inode, - (loff_t)start << PAGE_CACHE_SHIFT, - (loff_t)index << PAGE_CACHE_SHIFT, true); + (loff_t)start << PAGE_SHIFT, + (loff_t)index << PAGE_SHIFT, true); goto undone; } @@ -2264,7 +2259,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, */ set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); cond_resched(); } @@ -2285,7 +2280,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; - buf->f_bsize = PAGE_CACHE_SIZE; + buf->f_bsize = PAGE_SIZE; buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; @@ -2528,7 +2523,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s struct shmem_inode_info *info; len = strlen(symname) + 1; - if (len > PAGE_CACHE_SIZE) + if (len > PAGE_SIZE) return -ENAMETOOLONG; inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); @@ -2567,7 +2562,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s SetPageUptodate(page); set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; @@ -2823,9 +2818,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if ((value = strchr(this_char,'=')) != NULL) { *value++ = 0; } else { - printk(KERN_ERR - "tmpfs: No value for mount option '%s'\n", - this_char); + pr_err("tmpfs: No value for 
mount option '%s'\n", + this_char); goto error; } @@ -2841,7 +2835,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (*rest) goto bad_val; sbinfo->max_blocks = - DIV_ROUND_UP(size, PAGE_CACHE_SIZE); + DIV_ROUND_UP(size, PAGE_SIZE); } else if (!strcmp(this_char,"nr_blocks")) { sbinfo->max_blocks = memparse(value, &rest); if (*rest) @@ -2880,8 +2874,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (mpol_parse_str(value, &mpol)) goto bad_val; } else { - printk(KERN_ERR "tmpfs: Bad mount option %s\n", - this_char); + pr_err("tmpfs: Bad mount option %s\n", this_char); goto error; } } @@ -2889,7 +2882,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, return 0; bad_val: - printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", + pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", value, this_char); error: mpol_put(mpol); @@ -2947,7 +2940,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (sbinfo->max_blocks != shmem_default_max_blocks()) seq_printf(seq, ",size=%luk", - sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10)); + sbinfo->max_blocks << (PAGE_SHIFT - 10)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) @@ -3089,8 +3082,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) sbinfo->free_inodes = sbinfo->max_inodes; sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; @@ -3286,14 +3279,14 @@ int __init shmem_init(void) error = register_filesystem(&shmem_fs_type); if (error) { - printk(KERN_ERR "Could not register tmpfs\n"); + pr_err("Could not register tmpfs\n"); goto out2; } shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); - printk(KERN_ERR "Could not kern_mount tmpfs\n"); + pr_err("Could not kern_mount tmpfs\n"); goto out1; } return 0; diff --git a/mm/slab.c b/mm/slab.c index 852fc5c79829..17e2848979c5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -474,7 +474,7 @@ static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) { - printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", + pr_err("slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -670,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static inline gfp_t gfp_exact_node(gfp_t flags) { - return flags; + return flags & ~__GFP_NOFAIL; } #else /* CONFIG_NUMA */ @@ -841,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not direct reclaim - * or warn about failures. kswapd may still wake to reclaim in the background. + * Construct gfp mask to allocate from a specific node but do not reclaim or + * warn about failures. 
*/ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); } #endif @@ -1442,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, */ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - const unsigned long nr_freed = (1 << cachep->gfporder); + int order = cachep->gfporder; + unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, cachep->gfporder); + kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), @@ -1461,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_kmem_pages(page, cachep->gfporder); + memcg_uncharge_slab(page, order, cachep); + __free_pages(page, order); } static void kmem_rcu_free(struct rcu_head *head) @@ -1551,7 +1553,7 @@ static void dump_line(char *data, int offset, int limit) unsigned char error = 0; int bad_count = 0; - printk(KERN_ERR "%03x: ", offset); + pr_err("%03x: ", offset); for (i = 0; i < limit; i++) { if (data[offset + i] != POISON_FREE) { error = data[offset + i]; @@ -1564,13 +1566,11 @@ static void dump_line(char *data, int offset, int limit) if (bad_count == 1) { error ^= POISON_FREE; if (!(error & (error - 1))) { - printk(KERN_ERR "Single bit error detected. Probably " - "bad RAM.\n"); + pr_err("Single bit error detected. Probably bad RAM.\n"); #ifdef CONFIG_X86 - printk(KERN_ERR "Run memtest86+ or a similar memory " - "test tool.\n"); + pr_err("Run memtest86+ or a similar memory test tool.\n"); #else - printk(KERN_ERR "Run a memory test tool.\n"); + pr_err("Run a memory test tool.\n"); #endif } } @@ -1585,13 +1585,13 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) char *realobj; if (cachep->flags & SLAB_RED_ZONE) { - printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + pr_err("Redzone: 0x%llx/0x%llx\n", + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { - printk(KERN_ERR "Last user: [<%p>](%pSR)\n", + pr_err("Last user: [<%p>](%pSR)\n", *dbg_userword(cachep, objp), *dbg_userword(cachep, objp)); } @@ -1627,9 +1627,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Mismatch ! 
*/ /* Print header */ if (lines == 0) { - printk(KERN_ERR - "Slab corruption (%s): %s start=%p, len=%d\n", - print_tainted(), cachep->name, realobj, size); + pr_err("Slab corruption (%s): %s start=%p, len=%d\n", + print_tainted(), cachep->name, + realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1656,15 +1656,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) if (objnr) { objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); - printk(KERN_ERR "Prev obj: start=%p, len=%d\n", - realobj, size); + pr_err("Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); - printk(KERN_ERR "Next obj: start=%p, len=%d\n", - realobj, size); + pr_err("Next obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } } @@ -1691,11 +1689,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "start of a freed object " - "was overwritten"); + slab_error(cachep, "start of a freed object was overwritten"); if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "end of a freed object " - "was overwritten"); + slab_error(cachep, "end of a freed object was overwritten"); } } } @@ -2090,6 +2086,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) } #endif + kasan_cache_create(cachep, &size, &flags); + size = ALIGN(size, cachep->align); /* * We should restrict the number of objects in a slab to implement @@ -2391,16 +2389,19 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) * cache which they are a constructor for. Otherwise, deadlock. * They must also be threaded. */ - if (cachep->ctor && !(cachep->flags & SLAB_POISON)) + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { + kasan_unpoison_object_data(cachep, + objp + obj_offset(cachep)); cachep->ctor(objp + obj_offset(cachep)); + kasan_poison_object_data( + cachep, objp + obj_offset(cachep)); + } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " end of an object"); + slab_error(cachep, "constructor overwrote the end of an object"); if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " start of an object"); + slab_error(cachep, "constructor overwrote the start of an object"); } /* need to poison the objs? 
*/ if (cachep->flags & SLAB_POISON) { @@ -2415,6 +2416,7 @@ static void cache_init_objs(struct kmem_cache *cachep, struct page *page) { int i; + void *objp; cache_init_objs_debug(cachep, page); @@ -2425,8 +2427,12 @@ static void cache_init_objs(struct kmem_cache *cachep, for (i = 0; i < cachep->num; i++) { /* constructor could break poison info */ - if (DEBUG == 0 && cachep->ctor) - cachep->ctor(index_to_obj(cachep, page, i)); + if (DEBUG == 0 && cachep->ctor) { + objp = index_to_obj(cachep, page, i); + kasan_unpoison_object_data(cachep, objp); + cachep->ctor(objp); + kasan_poison_object_data(cachep, objp); + } set_free_obj(page, i, i); } @@ -2467,8 +2473,8 @@ static void slab_put_obj(struct kmem_cache *cachep, /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + pr_err("slab: double free detected in cache '%s', objp %p\n", + cachep->name, objp); BUG(); } } @@ -2556,6 +2562,7 @@ static int cache_grow(struct kmem_cache *cachep, slab_map_pages(cachep, page, freelist); + kasan_poison_slab(page); cache_init_objs(cachep, page); if (gfpflags_allow_blocking(local_flags)) @@ -2587,7 +2594,7 @@ failed: static void kfree_debugcheck(const void *objp) { if (!virt_addr_valid(objp)) { - printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", + pr_err("kfree_debugcheck: out of range ptr %lxh\n", (unsigned long)objp); BUG(); } @@ -2611,8 +2618,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", - obj, redzone1, redzone2); + pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + obj, redzone1, redzone2); } static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, @@ -2899,12 +2906,10 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was overwritten"); - printk(KERN_ERR - "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + slab_error(cachep, "double free, or memory outside object was overwritten"); + pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; @@ -2915,7 +2920,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, cachep->ctor(objp); if (ARCH_SLAB_MINALIGN && ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } return objp; @@ -3324,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, { struct array_cache *ac = cpu_cache_get(cachep); + kasan_slab_free(cachep, objp); + check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); @@ -3371,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ 
-3436,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); return ret; @@ -3459,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -3477,6 +3487,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3489,11 +3500,15 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; + void *ret; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(cachep, flags, node, size); + ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); + kasan_kmalloc(cachep, ret, size, flags); + + return ret; } void *__kmalloc_node(size_t size, gfp_t flags, int node) @@ -3527,6 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = slab_alloc(cachep, flags, caller); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, size, cachep->size, flags); @@ -3842,7 +3858,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) - printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", + pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); return err; } @@ -3998,7 +4014,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) name = cachep->name; if (error) - printk(KERN_ERR "slab: cache %s error: %s\n", name, error); + pr_err("slab: cache %s error: %s\n", name, error); sinfo->active_objs = active_objs; sinfo->num_objs = num_objs; @@ -4026,8 +4042,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) unsigned long node_frees = cachep->node_frees; unsigned long overflows = cachep->node_overflow; - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " - "%4lu %4lu %4lu %4lu %4lu", + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees, overflows); @@ -4299,10 +4314,18 @@ module_init(slab_proc_init); */ size_t ksize(const void *objp) { + size_t size; + BUG_ON(!objp); if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - return virt_to_cache(objp)->object_size; + size = virt_to_cache(objp)->object_size; + /* We assume that ksize callers could use the whole allocated area, + * so we need to unpoison this area. 
+ */ + kasan_krealloc(objp, size, GFP_NOWAIT); + + return size; } EXPORT_SYMBOL(ksize); diff --git a/mm/slab.h b/mm/slab.h index b7934361f026..5969769fbee6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -246,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + int ret; + if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return __memcg_kmem_charge_memcg(page, gfp, order, - s->memcg_params.memcg); + + ret = __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); + if (ret) + return ret; + + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + 1 << order); + return 0; +} + +static __always_inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + -(1 << order)); + memcg_kmem_uncharge(page, order); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -294,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, return 0; } +static inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ +} + static inline void slab_init_memcg_params(struct kmem_cache *s) { } @@ -379,7 +405,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); - kasan_slab_alloc(s, object); + kasan_slab_alloc(s, object, flags); } memcg_kmem_put_cache(s); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 6afb2263a5c5..3239bfd758e6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache; */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB) + SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_NOTRACK | SLAB_ACCOUNT) @@ -442,7 +442,7 @@ out_unlock: panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", name, err); else { - printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", + pr_warn("kmem_cache_create(%s) failed with error %d\n", name, err); dump_stack(); } @@ -510,7 +510,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, * The memory cgroup could have been offlined while the cache * creation work was pending. */ - if (!memcg_kmem_online(memcg)) + if (memcg->kmem_state != KMEM_ONLINE) goto out_unlock; idx = memcg_cache_id(memcg); @@ -726,8 +726,8 @@ void kmem_cache_destroy(struct kmem_cache *s) err = shutdown_cache(s, &release, &need_rcu_barrier); if (err) { - pr_err("kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); + pr_err("kmem_cache_destroy %s: Slab cache still has objects\n", + s->name); dump_stack(); } out_unlock: @@ -1013,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_kmem_pages(flags, order); ret = page ? 
page_address(page) : NULL; kmemleak_alloc(ret, size, 1, flags); - kasan_kmalloc_large(ret, size); + kasan_kmalloc_large(ret, size, flags); return ret; } EXPORT_SYMBOL(kmalloc_order); @@ -1047,13 +1047,11 @@ static void print_slabinfo_header(struct seq_file *m) #else seq_puts(m, "slabinfo - version: 2.1\n"); #endif - seq_puts(m, "# name <active_objs> <num_objs> <objsize> " - "<objperslab> <pagesperslab>"); + seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); #ifdef CONFIG_DEBUG_SLAB - seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " - "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); + seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); #endif seq_putc(m, '\n'); @@ -1194,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, ks = ksize(p); if (ks >= new_size) { - kasan_krealloc((void *)p, new_size); + kasan_krealloc((void *)p, new_size, flags); return (void *)p; } diff --git a/mm/slub.c b/mm/slub.c index 6c91324f9370..4dbb109eb8cd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -254,11 +254,10 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { void *p; -#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return get_freepointer(s, object); + probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); -#else - p = get_freepointer(s, object); -#endif return p; } @@ -951,14 +950,14 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) max_objects = MAX_OBJS_PER_PAGE; if (page->objects != max_objects) { - slab_err(s, page, "Wrong number of objects. Found %d but " - "should be %d", page->objects, max_objects); + slab_err(s, page, "Wrong number of objects. Found %d but should be %d", + page->objects, max_objects); page->objects = max_objects; slab_fix(s, "Number of objects adjusted."); } if (page->inuse != page->objects - nr) { - slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, page->objects - nr); + slab_err(s, page, "Wrong object count. 
Counter is %d but counted were %d", + page->inuse, page->objects - nr); page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } @@ -1118,8 +1117,8 @@ static inline int free_consistency_checks(struct kmem_cache *s, if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { - slab_err(s, page, "Attempt to free object(0x%p) " - "outside of slab", object); + slab_err(s, page, "Attempt to free object(0x%p) outside of slab", + object); } else if (!page->slab_cache) { pr_err("SLUB <none>: no slab for object 0x%p.\n", object); @@ -1314,7 +1313,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); - kasan_kmalloc_large(ptr, size); + kasan_kmalloc_large(ptr, size, flags); } static inline void kfree_hook(const void *x) @@ -1427,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1540,7 +1539,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_kmem_pages(page, order); + memcg_uncharge_slab(page, order, s); + __free_pages(page, order); } #define need_reserve_slab_rcu \ @@ -2596,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -2624,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -3182,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); @@ -3439,10 +3440,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%lu realsize=%u " - "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)s->size, s->size, - oo_order(s->oo), s->offset, flags); + panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", + s->name, (unsigned long)s->size, s->size, + oo_order(s->oo), s->offset, flags); return -EINVAL; } @@ -3562,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags) trace_kmalloc(_RET_IP_, ret, size, s->size, flags); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3607,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); - kasan_kmalloc(s, ret, size); + 
kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3636,7 +3636,7 @@ size_t ksize(const void *object) size_t size = __ksize(object); /* We assume that ksize callers could use whole allocated area, so we need unpoison this area. */ - kasan_krealloc(object, size); + kasan_krealloc(object, size, GFP_NOWAIT); return size; } EXPORT_SYMBOL(ksize); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index b60802b3e5ea..68885dcbaf40 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -166,8 +166,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node, int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) - printk(KERN_WARNING "[%lx-%lx] potential offnode " - "page_structs\n", start, end - 1); + pr_warn("[%lx-%lx] potential offnode page_structs\n", + start, end - 1); } pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) @@ -292,8 +292,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", + __func__); ms->section_mem_map = 0; } diff --git a/mm/sparse.c b/mm/sparse.c index 3717ceed4177..5d0cf4540364 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -313,9 +313,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); if (usemap_nid != nid) { - printk(KERN_INFO - "node %d must be removed before remove section %ld\n", - nid, usemap_snr); + pr_info("node %d must be removed before remove section %ld\n", + nid, usemap_snr); return; } /* @@ -324,10 +323,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) * gather other removable sections for dynamic partitioning. * Just notify un-removable section's number here. 
*/ - printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr, - pgdat_snr, nid); - printk(KERN_CONT - " have a circular dependency on usemap and pgdat allocations\n"); + pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n", + usemap_snr, pgdat_snr, nid); } #else static unsigned long * __init @@ -355,7 +352,7 @@ static void __init sparse_early_usemaps_alloc_node(void *data, usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), size * usemap_count); if (!usemap) { - printk(KERN_WARNING "%s: allocation failed\n", __func__); + pr_warn("%s: allocation failed\n", __func__); return; } @@ -428,8 +425,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", + __func__); ms->section_mem_map = 0; } } @@ -456,8 +453,8 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) if (map) return map; - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", + __func__); ms->section_mem_map = 0; return NULL; } diff --git a/mm/swap.c b/mm/swap.c index 09fe5e97714a..03aacbcb013f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -114,7 +114,7 @@ void put_pages_list(struct list_head *pages) victim = list_entry(pages->prev, struct page, lru); list_del(&victim->lru); - page_cache_release(victim); + put_page(victim); } } EXPORT_SYMBOL(put_pages_list); @@ -142,7 +142,7 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, return seg; pages[seg] = kmap_to_page(kiov[seg].iov_base); - page_cache_get(pages[seg]); + get_page(pages[seg]); } return seg; @@ -236,7 +236,7 @@ void rotate_reclaimable_page(struct page *page) struct pagevec *pvec; unsigned long flags; - page_cache_get(page); + get_page(page); local_irq_save(flags); pvec = this_cpu_ptr(&lru_rotate_pvecs); if (!pagevec_add(pvec, page)) @@ -294,7 +294,7 @@ void activate_page(struct page *page) if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); - page_cache_get(page); + get_page(page); if (!pagevec_add(pvec, page)) pagevec_lru_move_fn(pvec, __activate_page, NULL); put_cpu_var(activate_page_pvecs); @@ -389,7 +389,7 @@ static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); - page_cache_get(page); + get_page(page); if (!pagevec_space(pvec)) __pagevec_lru_add(pvec); pagevec_add(pvec, page); @@ -646,7 +646,7 @@ void deactivate_page(struct page *page) if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); - page_cache_get(page); + get_page(page); if (!pagevec_add(pvec, page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); put_cpu_var(lru_deactivate_pvecs); @@ -698,7 +698,7 @@ void lru_add_drain_all(void) } /** - * release_pages - batched page_cache_release() + * release_pages - batched put_page() * @pages: array of pages to release * @nr: number of pages * @cold: whether the pages are cache cold @@ -728,6 +728,11 @@ void release_pages(struct page **pages, int nr, bool cold) zone = NULL; } + if (is_huge_zero_page(page)) { + put_huge_zero_page(); + continue; + } + page = 
compound_head(page); if (!put_page_testzero(page)) continue; diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index b5f7f24b8dd1..310ac0b8f974 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -174,9 +174,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) return 0; nomem: - printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); - printk(KERN_INFO - "swap_cgroup can be disabled by swapaccount=0 boot option\n"); + pr_info("couldn't allocate enough memory for swap_cgroup\n"); + pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 69cb2464e7dc..366ce3518703 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -85,7 +85,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - page_cache_get(page); + get_page(page); SetPageSwapCache(page); set_page_private(page, entry.val); @@ -109,7 +109,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) VM_BUG_ON(error == -EEXIST); set_page_private(page, 0UL); ClearPageSwapCache(page); - page_cache_release(page); + put_page(page); } return error; @@ -226,7 +226,7 @@ void delete_from_swap_cache(struct page *page) spin_unlock_irq(&address_space->tree_lock); swapcache_free(entry); - page_cache_release(page); + put_page(page); } /* @@ -252,7 +252,7 @@ static inline void free_swap_cache(struct page *page) void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); - page_cache_release(page); + put_page(page); } /* @@ -380,7 +380,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } while (err != -ENOMEM); if (new_page) - page_cache_release(new_page); + put_page(new_page); return found_page; } @@ -495,7 +495,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, continue; if (offset != entry_offset) SetPageReadahead(page); - page_cache_release(page); + put_page(page); } blk_finish_plug(&plug); diff --git a/mm/swapfile.c b/mm/swapfile.c index d2c37365e2d6..031713ab40ce 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -48,6 +48,12 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**); DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; +/* + * Some modules use swappable objects and may try to swap them out under + * memory pressure (via the shrinker). Before doing so, they may wish to + * check to see if any swap space is available. + */ +EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; @@ -113,7 +119,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) ret = try_to_free_swap(page); unlock_page(page); } - page_cache_release(page); + put_page(page); return ret; } @@ -916,18 +922,19 @@ out: * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. + * + * NOTE: total_mapcount should not be relied upon by the caller if + * reuse_swap_page() returns false, but it may be always overwritten + * (see the other implementation for CONFIG_SWAP=n). 
*/ -int reuse_swap_page(struct page *page) +bool reuse_swap_page(struct page *page, int *total_mapcount) { int count; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) - return 0; - /* The page is part of THP and cannot be reused */ - if (PageTransCompound(page)) - return 0; - count = page_mapcount(page); + return false; + count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); if (count == 1 && !PageWriteback(page)) { @@ -994,7 +1001,7 @@ int free_swap_and_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), entry.val); if (page && !trylock_page(page)) { - page_cache_release(page); + put_page(page); page = NULL; } } @@ -1011,7 +1018,7 @@ int free_swap_and_cache(swp_entry_t entry) SetPageDirty(page); } unlock_page(page); - page_cache_release(page); + put_page(page); } return p != NULL; } @@ -1512,7 +1519,7 @@ int try_to_unuse(unsigned int type, bool frontswap, } if (retval) { unlock_page(page); - page_cache_release(page); + put_page(page); break; } @@ -1564,7 +1571,7 @@ int try_to_unuse(unsigned int type, bool frontswap, */ SetPageDirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); /* * Make sure that we aren't completely killing @@ -2526,8 +2533,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); - pr_info("Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", @@ -2569,7 +2575,7 @@ bad_swap: out: if (page && !IS_ERR(page)) { kunmap(page); - page_cache_release(page); + put_page(page); } if (name) putname(name); diff --git a/mm/truncate.c b/mm/truncate.c index 7598b552ae03..b00272810871 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -118,7 +118,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) return -EIO; if (page_has_private(page)) - do_invalidatepage(page, 0, PAGE_CACHE_SIZE); + do_invalidatepage(page, 0, PAGE_SIZE); /* * Some filesystems seem to re-dirty the page even after @@ -159,8 +159,8 @@ int truncate_inode_page(struct address_space *mapping, struct page *page) { if (page_mapped(page)) { unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); + (loff_t)page->index << PAGE_SHIFT, + PAGE_SIZE, 0); } return truncate_complete_page(mapping, page); } @@ -241,8 +241,8 @@ void truncate_inode_pages_range(struct address_space *mapping, return; /* Offsets within partial pages */ - partial_start = lstart & (PAGE_CACHE_SIZE - 1); - partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); + partial_start = lstart & (PAGE_SIZE - 1); + partial_end = (lend + 1) & (PAGE_SIZE - 1); /* * 'start' and 'end' always covers the range of pages to be fully @@ -250,7 +250,7 @@ void truncate_inode_pages_range(struct address_space *mapping, * start of the range and 'partial_end' at the end of the range. * Note that 'end' is exclusive while 'lend' is inclusive. 
*/ - start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; if (lend == -1) /* * lend == -1 indicates end-of-file so we have to set 'end' @@ -259,7 +259,7 @@ void truncate_inode_pages_range(struct address_space *mapping, */ end = -1; else - end = (lend + 1) >> PAGE_CACHE_SHIFT; + end = (lend + 1) >> PAGE_SHIFT; pagevec_init(&pvec, 0); index = start; @@ -298,7 +298,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { - unsigned int top = PAGE_CACHE_SIZE; + unsigned int top = PAGE_SIZE; if (start > end) { /* Truncation within a single page */ top = partial_end; @@ -311,7 +311,7 @@ void truncate_inode_pages_range(struct address_space *mapping, do_invalidatepage(page, partial_start, top - partial_start); unlock_page(page); - page_cache_release(page); + put_page(page); } } if (partial_end) { @@ -324,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping, do_invalidatepage(page, 0, partial_end); unlock_page(page); - page_cache_release(page); + put_page(page); } } /* @@ -538,7 +538,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); - page_cache_release(page); /* pagecache ref */ + put_page(page); /* pagecache ref */ return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); @@ -608,18 +608,18 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Zap the rest of the file in one hit. */ unmap_mapping_range(mapping, - (loff_t)index << PAGE_CACHE_SHIFT, + (loff_t)index << PAGE_SHIFT, (loff_t)(1 + end - index) - << PAGE_CACHE_SHIFT, - 0); + << PAGE_SHIFT, + 0); did_range_unmap = 1; } else { /* * Just zap this page */ unmap_mapping_range(mapping, - (loff_t)index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); + (loff_t)index << PAGE_SHIFT, + PAGE_SIZE, 0); } } BUG_ON(page_mapped(page)); @@ -744,14 +744,14 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) WARN_ON(to > inode->i_size); - if (from >= to || bsize == PAGE_CACHE_SIZE) + if (from >= to || bsize == PAGE_SIZE) return; /* Page straddling @from will not have any hole block created? */ rounded_from = round_up(from, bsize); - if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1))) return; - index = from >> PAGE_CACHE_SHIFT; + index = from >> PAGE_SHIFT; page = find_lock_page(inode->i_mapping, index); /* Page not cached? 
Nothing to do */ if (!page) @@ -763,7 +763,7 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) if (page_mkclean(page)) set_page_dirty(page); unlock_page(page); - page_cache_release(page); + put_page(page); } EXPORT_SYMBOL(pagecache_isize_extended); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 806b0c758c5b..af817e5060fb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -93,7 +93,7 @@ out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); mem_cgroup_cancel_charge(page, memcg, false); out_release: - page_cache_release(page); + put_page(page); goto out; } @@ -230,8 +230,7 @@ retry: break; } if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, - dst_addr))) { + unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { err = -ENOMEM; break; } @@ -288,7 +287,7 @@ out_unlock: up_read(&dst_mm->mmap_sem); out: if (page) - page_cache_release(page); + put_page(page); BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); diff --git a/mm/util.c b/mm/util.c index 4fb14ca5a419..6cc81e7b8705 100644 --- a/mm/util.c +++ b/mm/util.c @@ -283,9 +283,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - struct mm_struct *mm = current->mm; - return get_user_pages_unlocked(current, mm, start, nr_pages, - write, 0, pages); + return get_user_pages_unlocked(start, nr_pages, write, 0, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); @@ -396,6 +394,13 @@ int __page_mapcount(struct page *page) } EXPORT_SYMBOL_GPL(__page_mapcount); +int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; +int sysctl_overcommit_ratio __read_mostly = 50; +unsigned long sysctl_overcommit_kbytes __read_mostly; +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ + int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -437,6 +442,123 @@ unsigned long vm_commit_limit(void) return allowed; } +/* + * Make sure vm_committed_as in one cacheline and not cacheline shared with + * other variables. It can be updated by several CPUs frequently. + */ +struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; + +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. + */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} +EXPORT_SYMBOL_GPL(vm_memory_committed); + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 
+ * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. + */ +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +{ + long free, allowed, reserve; + + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); + + free += get_nr_swap_pages(); + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += global_page_state(NR_SLAB_RECLAIMABLE); + + /* + * Leave reserved pages. The pages are not for anonymous pages. + */ + if (free <= totalreserve_pages) + goto error; + else + free -= totalreserve_pages; + + /* + * Reserve some for root + */ + if (!cap_sys_admin) + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + if (free > pages) + return 0; + + goto error; + } + + allowed = vm_commit_limit(); + /* + * Reserve some for root + */ + if (!cap_sys_admin) + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min_t(long, mm->total_vm / 32, reserve); + } + + if (percpu_counter_read_positive(&vm_committed_as) < allowed) + return 0; +error: + vm_unacct_memory(pages); + + return -ENOMEM; +} + /** * get_cmdline() - copy the cmdline value to a buffer. * @task: the task whose cmdline value to copy. diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fb42a5bffe47..ae7d20b447ff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -469,8 +469,8 @@ overflow: goto retry; } if (printk_ratelimit()) - pr_warn("vmap allocation for size %lu failed: " - "use vmalloc=<size> to increase size.\n", size); + pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", + size); kfree(va); return ERR_PTR(-EBUSY); } @@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va) static void vmap_debug_free_range(unsigned long start, unsigned long end) { /* - * Unmap page tables and force a TLB flush immediately if - * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free - * bugs similarly to those in linear kernel virtual address - * space after a page has been freed. + * Unmap page tables and force a TLB flush immediately if pagealloc + * debugging is enabled. This catches use after free bugs similarly to + * those in linear kernel virtual address space after a page has been + * freed. * - * All the lazy freeing logic is still retained, in order to - * minimise intrusiveness of this debugging feature. + * All the lazy freeing logic is still retained, in order to minimise + * intrusiveness of this debugging feature. * - * This is going to be *slow* (linear kernel virtual address - * debugging doesn't do a broadcast TLB flush so it is a lot - * faster). 
+ * This is going to be *slow* (linear kernel virtual address debugging + * doesn't do a broadcast TLB flush so it is a lot faster). */ -#ifdef CONFIG_DEBUG_PAGEALLOC - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); -#endif + if (debug_pagealloc_enabled()) { + vunmap_page_range(start, end); + flush_tlb_kernel_range(start, end); + } } /* @@ -1086,7 +1085,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); - BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE)); + BUG_ON(!PAGE_ALIGNED(addr)); debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); diff --git a/mm/vmscan.c b/mm/vmscan.c index dd984470248f..142cb61f4822 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -382,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * * @memcg specifies the memory cgroup to target. If it is not NULL, * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan - * objects from the memory cgroup specified. Otherwise all shrinkers - * are called, and memcg aware shrinkers are supposed to scan the - * global list then. + * objects from the memory cgroup specified. Otherwise, only unaware + * shrinkers are called. * * @nr_scanned and @nr_eligible form a ratio that indicate how much of * the available objects should be scanned. Page reclaim for example @@ -404,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && !memcg_kmem_online(memcg)) + if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; if (nr_scanned == 0) @@ -428,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; - if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) + /* + * If kernel memory accounting is disabled, we ignore + * SHRINKER_MEMCG_AWARE flag and call all shrinkers + * passing NULL for memcg. + */ + if (memcg_kmem_enabled() && + !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) continue; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) @@ -633,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (!page_freeze_refs(page, 2)) + if (!page_ref_freeze(page, 2)) goto cannot_free; /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ if (unlikely(PageDirty(page))) { - page_unfreeze_refs(page, 2); + page_ref_unfreeze(page, 2); goto cannot_free; } @@ -699,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) * drops the pagecache ref for us without requiring another * atomic operation. 
*/ - page_unfreeze_refs(page, 1); + page_ref_unfreeze(page, 1); return 1; } return 0; @@ -2548,7 +2553,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask |= __GFP_HIGHMEM; for_each_zone_zonelist_nodemask(zone, z, zonelist, - requested_highidx, sc->nodemask) { + gfp_zone(sc->gfp_mask), sc->nodemask) { enum zone_type classzone_idx; if (!populated_zone(zone)) @@ -2968,18 +2973,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, - unsigned long balance_gap, int classzone_idx) +static bool zone_balanced(struct zone *zone, int order, bool highorder, + unsigned long balance_gap, int classzone_idx) { - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx)) - return false; + unsigned long mark = high_wmark_pages(zone) + balance_gap; - if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, - order, 0, classzone_idx) == COMPACT_SKIPPED) - return false; + /* + * When checking from pgdat_balanced(), kswapd should stop and sleep + * when it reaches the high order-0 watermark and let kcompactd take + * over. Other callers such as wakeup_kswapd() want to determine the + * true high-order watermark. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { + mark += (1UL << order); + order = 0; + } - return true; + return zone_watermark_ok_safe(zone, order, mark, classzone_idx); } /* @@ -3029,7 +3039,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) continue; } - if (zone_balanced(zone, order, 0, i)) + if (zone_balanced(zone, order, false, 0, i)) balanced_pages += zone->managed_pages; else if (!order) return false; @@ -3083,10 +3093,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, */ static bool kswapd_shrink_zone(struct zone *zone, int classzone_idx, - struct scan_control *sc, - unsigned long *nr_attempted) + struct scan_control *sc) { - int testorder = sc->order; unsigned long balance_gap; bool lowmem_pressure; @@ -3094,17 +3102,6 @@ static bool kswapd_shrink_zone(struct zone *zone, sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); /* - * Kswapd reclaims only single pages with compaction enabled. Trying - * too hard to reclaim until contiguous free pages have become - * available can hurt performance by evicting too much useful data - * from memory. Do not reclaim more than needed for compaction. - */ - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && - compaction_suitable(zone, sc->order, 0, classzone_idx) - != COMPACT_SKIPPED) - testorder = 0; - - /* * We put equal pressure on every zone, unless one zone has way too * many pages free already. The "too many pages" is defined as the * high wmark plus a "gap" where the gap is either the low @@ -3118,15 +3115,12 @@ static bool kswapd_shrink_zone(struct zone *zone, * reclaim is necessary */ lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); - if (!lowmem_pressure && zone_balanced(zone, testorder, + if (!lowmem_pressure && zone_balanced(zone, sc->order, false, balance_gap, classzone_idx)) return true; shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); - /* Account for the number of pages attempted to reclaim */ - *nr_attempted += sc->nr_to_reclaim; - clear_bit(ZONE_WRITEBACK, &zone->flags); /* @@ -3136,7 +3130,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * waits. 
*/ if (zone_reclaimable(zone) && - zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_balanced(zone, sc->order, false, 0, classzone_idx)) { clear_bit(ZONE_CONGESTED, &zone->flags); clear_bit(ZONE_DIRTY, &zone->flags); } @@ -3148,7 +3142,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the final order kswapd was reclaiming at + * Returns the highest zone idx kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -3165,8 +3159,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order, - int *classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -3183,9 +3176,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, count_vm_event(PAGEOUTRUN); do { - unsigned long nr_attempted = 0; bool raise_priority = true; - bool pgdat_needs_compaction = (order > 0); sc.nr_reclaimed = 0; @@ -3220,7 +3211,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, break; } - if (!zone_balanced(zone, order, 0, 0)) { + if (!zone_balanced(zone, order, false, 0, 0)) { end_zone = i; break; } else { @@ -3236,24 +3227,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (i < 0) goto out; - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* - * If any zone is currently balanced then kswapd will - * not call compaction as it is expected that the - * necessary pages are already available. - */ - if (pgdat_needs_compaction && - zone_watermark_ok(zone, order, - low_wmark_pages(zone), - *classzone_idx, 0)) - pgdat_needs_compaction = false; - } - /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. @@ -3297,8 +3270,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * that that high watermark would be met at 100% * efficiency. */ - if (kswapd_shrink_zone(zone, end_zone, - &sc, &nr_attempted)) + if (kswapd_shrink_zone(zone, end_zone, &sc)) raise_priority = false; } @@ -3311,49 +3283,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, pfmemalloc_watermark_ok(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); - /* - * Fragmentation may mean that the system cannot be rebalanced - * for high-order allocations in all zones. If twice the - * allocation size has been reclaimed and the zones are still - * not balanced then recheck the watermarks at order-0 to - * prevent kswapd reclaiming excessively. Assume that a - * process requested a high-order can direct reclaim/compact. 
- */ - if (order && sc.nr_reclaimed >= 2UL << order) - order = sc.order = 0; - /* Check if kswapd should be suspending */ if (try_to_freeze() || kthread_should_stop()) break; /* - * Compact if necessary and kswapd is reclaiming at least the - * high watermark number of pages as requsted - */ - if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) - compact_pgdat(pgdat, order); - - /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ if (raise_priority || !sc.nr_reclaimed) sc.priority--; } while (sc.priority >= 1 && - !pgdat_balanced(pgdat, order, *classzone_idx)); + !pgdat_balanced(pgdat, order, classzone_idx)); out: /* - * Return the order we were reclaiming at so prepare_kswapd_sleep() - * makes a decision on the order we were last reclaiming at. However, - * if another caller entered the allocator slow path while kswapd - * was awake, order will remain at the higher level + * Return the highest zone idx we were reclaiming at so + * prepare_kswapd_sleep() makes the same decisions as here. */ - *classzone_idx = end_zone; - return order; + return end_zone; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, + int classzone_idx, int balanced_classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3364,7 +3316,22 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { + /* + * Compaction records what page blocks it recently failed to + * isolate pages from and skips them in the future scanning. + * When kswapd is going to sleep, it is reasonable to assume + * that pages and compaction may succeed so reset the cache. + */ + reset_isolation_suitable(pgdat); + + /* + * We have freed the memory, now we should compact it to make + * allocation of the requested order possible. + */ + wakeup_kcompactd(pgdat, order, classzone_idx); + remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -3374,7 +3341,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3387,14 +3355,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); - /* - * Compaction records what page blocks it recently failed to - * isolate pages from and skips them in the future scanning. - * When kswapd is going to sleep, it is reasonable to assume - * that pages and compaction may succeed so reset the cache. 
- */ - reset_isolation_suitable(pgdat); - if (!kthread_should_stop()) schedule(); @@ -3424,7 +3384,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) static int kswapd(void *p) { unsigned long order, new_order; - unsigned balanced_order; int classzone_idx, new_classzone_idx; int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; @@ -3457,24 +3416,19 @@ static int kswapd(void *p) set_freezable(); order = new_order = 0; - balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; balanced_classzone_idx = classzone_idx; for ( ; ; ) { bool ret; /* - * If the last balance_pgdat was unsuccessful it's unlikely a - * new request of a similar or harder type will succeed soon - * so consider going to sleep on the basis we reclaimed at + * While we were reclaiming, there might have been another + * wakeup, so check the values. */ - if (balanced_classzone_idx >= new_classzone_idx && - balanced_order == new_order) { - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; - } + new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; if (order < new_order || classzone_idx > new_classzone_idx) { /* @@ -3484,7 +3438,7 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, balanced_order, + kswapd_try_to_sleep(pgdat, order, classzone_idx, balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; @@ -3504,9 +3458,8 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balanced_classzone_idx = classzone_idx; - balanced_order = balance_pgdat(pgdat, order, - &balanced_classzone_idx); + balanced_classzone_idx = balance_pgdat(pgdat, order, + classzone_idx); } } @@ -3536,7 +3489,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_balanced(zone, order, 0, 0)) + if (zone_balanced(zone, order, true, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); diff --git a/mm/vmstat.c b/mm/vmstat.c index 69ce64f7b8d7..5e4300482897 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -826,6 +826,7 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", + "compact_daemon_wake", #endif #ifdef CONFIG_HUGETLB_PAGE @@ -847,6 +848,7 @@ const char * const vmstat_text[] = { "thp_collapse_alloc_failed", "thp_split_page", "thp_split_page_failed", + "thp_deferred_split_page", "thp_split_pmd", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", diff --git a/mm/workingset.c b/mm/workingset.c index 6130ba0b2641..8a75f8d2916a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -349,7 +349,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); - pages = node_present_pages(sc->nid); + if (memcg_kmem_enabled()) + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL_FILE); + else + pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + + node_page_state(sc->nid, NR_INACTIVE_FILE); + /* * Active cache pages are limited to 50% of memory, and shadow * entries that represent a refault distance bigger than that @@ -458,7 +464,7 @@ static struct shrinker 
workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; /* diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2d7c4c11fc63..fe47fbba995a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -281,7 +281,6 @@ struct mapping_area { #endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ - bool huge; }; static int create_handle_cache(struct zs_pool *pool) @@ -495,6 +494,8 @@ static void __exit zs_stat_exit(void) debugfs_remove_recursive(zs_stat_root); } +static unsigned long zs_can_compact(struct size_class *class); + static int zs_stats_size_show(struct seq_file *s, void *v) { int i; @@ -502,14 +503,15 @@ static int zs_stats_size_show(struct seq_file *s, void *v) struct size_class *class; int objs_per_zspage; unsigned long class_almost_full, class_almost_empty; - unsigned long obj_allocated, obj_used, pages_used; + unsigned long obj_allocated, obj_used, pages_used, freeable; unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + unsigned long total_freeable = 0; - seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n", "class", "size", "almost_full", "almost_empty", "obj_allocated", "obj_used", "pages_used", - "pages_per_zspage"); + "pages_per_zspage", "freeable"); for (i = 0; i < zs_size_classes; i++) { class = pool->size_class[i]; @@ -522,6 +524,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); obj_used = zs_stat_get(class, OBJ_USED); + freeable = zs_can_compact(class); spin_unlock(&class->lock); objs_per_zspage = get_maxobj_per_zspage(class->size, @@ -529,23 +532,25 @@ static int zs_stats_size_show(struct seq_file *s, void *v) pages_used = obj_allocated / objs_per_zspage * class->pages_per_zspage; - seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", + seq_printf(s, " %5u %5u %11lu %12lu %13lu" + " %10lu %10lu %16d %8lu\n", i, class->size, class_almost_full, class_almost_empty, obj_allocated, obj_used, pages_used, - class->pages_per_zspage); + class->pages_per_zspage, freeable); total_class_almost_full += class_almost_full; total_class_almost_empty += class_almost_empty; total_objs += obj_allocated; total_used_objs += obj_used; total_pages += pages_used; + total_freeable += freeable; } seq_puts(s, "\n"); - seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n", "Total", "", total_class_almost_full, total_class_almost_empty, total_objs, - total_used_objs, total_pages); + total_used_objs, total_pages, "", total_freeable); return 0; } @@ -1127,11 +1132,9 @@ static void __zs_unmap_object(struct mapping_area *area, goto out; buf = area->vm_buf; - if (!area->huge) { - buf = buf + ZS_HANDLE_SIZE; - size -= ZS_HANDLE_SIZE; - off += ZS_HANDLE_SIZE; - } + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; @@ -1732,10 +1735,13 @@ static struct page *isolate_source_page(struct size_class *class) static unsigned long zs_can_compact(struct size_class *class) { unsigned long obj_wasted; + unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + unsigned 
long obj_used = zs_stat_get(class, OBJ_USED); - obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - - zs_stat_get(class, OBJ_USED); + if (obj_allocated <= obj_used) + return 0; + obj_wasted = obj_allocated - obj_used; obj_wasted /= get_maxobj_per_zspage(class->size, class->pages_per_zspage); diff --git a/mm/zswap.c b/mm/zswap.c index bf14508afd64..de0f119b1780 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -170,6 +170,8 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; static LIST_HEAD(zswap_pools); /* protects zswap_pools list modification */ static DEFINE_SPINLOCK(zswap_pools_lock); +/* pool counter to provide unique names to zpool */ +static atomic_t zswap_pools_count = ATOMIC_INIT(0); /* used by param callback function */ static bool zswap_init_started; @@ -565,6 +567,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { struct zswap_pool *pool; + char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; pool = kzalloc(sizeof(*pool), GFP_KERNEL); @@ -573,7 +576,10 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) return NULL; } - pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops); + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); + + pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); if (!pool->zpool) { pr_err("%s zpool not available\n", type); goto error; @@ -869,7 +875,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_EXIST: /* page is already in the swap cache, ignore for now */ - page_cache_release(page); + put_page(page); ret = -EEXIST; goto fail; @@ -897,7 +903,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) /* start writeback */ __swap_writepage(page, &wbc, end_swap_bio_write); - page_cache_release(page); + put_page(page); zswap_written_back_pages++; spin_lock(&tree->lock);