summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig9
-rw-r--r--mm/Kconfig.debug13
-rw-r--r--mm/Makefile17
-rw-r--r--mm/backing-dev.c8
-rw-r--r--mm/bootmem.c7
-rw-r--r--mm/compaction.c244
-rw-r--r--mm/debug.c2
-rw-r--r--mm/debug_page_ref.c54
-rw-r--r--mm/dmapool.c18
-rw-r--r--mm/fadvise.c8
-rw-r--r--mm/filemap.c188
-rw-r--r--mm/frame_vector.c2
-rw-r--r--mm/gup.c91
-rw-r--r--mm/huge_memory.c443
-rw-r--r--mm/hugetlb.c13
-rw-r--r--mm/internal.h16
-rw-r--r--mm/kasan/Makefile1
-rw-r--r--mm/kasan/kasan.c162
-rw-r--r--mm/kasan/kasan.h37
-rw-r--r--mm/kasan/report.c68
-rw-r--r--mm/kmemcheck.c3
-rw-r--r--mm/kmemleak-test.c2
-rw-r--r--mm/kmemleak.c32
-rw-r--r--mm/ksm.c27
-rw-r--r--mm/madvise.c6
-rw-r--r--mm/memblock.c3
-rw-r--r--mm/memcontrol.c228
-rw-r--r--mm/memory-failure.c64
-rw-r--r--mm/memory.c178
-rw-r--r--mm/memory_hotplug.c54
-rw-r--r--mm/mempolicy.c10
-rw-r--r--mm/mempool.c36
-rw-r--r--mm/migrate.c41
-rw-r--r--mm/mincore.c8
-rw-r--r--mm/mm_init.c7
-rw-r--r--mm/mmap.c143
-rw-r--r--mm/mmu_notifier.c2
-rw-r--r--mm/mprotect.c21
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/nobootmem.c4
-rw-r--r--mm/nommu.c149
-rw-r--r--mm/oom_kill.c216
-rw-r--r--mm/page-writeback.c18
-rw-r--r--mm/page_alloc.c240
-rw-r--r--mm/page_io.c117
-rw-r--r--mm/page_isolation.c10
-rw-r--r--mm/page_owner.c5
-rw-r--r--mm/percpu-km.c6
-rw-r--r--mm/percpu.c43
-rw-r--r--mm/pgtable-generic.c14
-rw-r--r--mm/process_vm_access.c11
-rw-r--r--mm/quicklist.c2
-rw-r--r--mm/readahead.c20
-rw-r--r--mm/rmap.c101
-rw-r--r--mm/shmem.c179
-rw-r--r--mm/slab.c131
-rw-r--r--mm/slab.h32
-rw-r--r--mm/slab_common.c20
-rw-r--r--mm/slub.c46
-rw-r--r--mm/sparse-vmemmap.c8
-rw-r--r--mm/sparse.c21
-rw-r--r--mm/swap.c19
-rw-r--r--mm/swap_cgroup.c5
-rw-r--r--mm/swap_state.c12
-rw-r--r--mm/swapfile.c34
-rw-r--r--mm/truncate.c40
-rw-r--r--mm/userfaultfd.c7
-rw-r--r--mm/util.c128
-rw-r--r--mm/vmalloc.c31
-rw-r--r--mm/vmscan.c187
-rw-r--r--mm/vmstat.c2
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/zsmalloc.c36
-rw-r--r--mm/zswap.c12
74 files changed, 2476 insertions, 1710 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 03cbfa072f42..989f8f3d77e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -187,7 +187,6 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
config MEMORY_HOTPLUG_SPARSE
def_bool y
@@ -652,10 +651,9 @@ config IDLE_PAGE_TRACKING
config ZONE_DEVICE
bool "Device memory (pmem, etc...) hotplug support" if EXPERT
- default !ZONE_DMA
- depends on !ZONE_DMA
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
depends on X86_64 #arch_add_memory() comprehends device memory
help
@@ -669,3 +667,8 @@ config ZONE_DEVICE
config FRAME_VECTOR
bool
+
+config ARCH_USES_HIGH_VMA_FLAGS
+ bool
+config ARCH_HAS_PKEYS
+ bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5c50b238b770..22f4cd96acb0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -79,3 +79,16 @@ config PAGE_POISONING_ZERO
Enabling page poisoning with this option will disable hibernation
If unsure, say N
+ bool
+
+config DEBUG_PAGE_REF
+ bool "Enable tracepoint to track down page reference manipulation"
+ depends on DEBUG_KERNEL
+ depends on TRACEPOINTS
+ ---help---
+ This is a feature to add tracepoint for tracking down page reference
+ manipulation. This tracking is useful to diagnose functional failure
+ due to migration failures caused by page reference mismatches. Be
+ careful when enabling this feature because it adds about 30 KB to the
+ kernel code. However the runtime performance overhead is virtually
+ nil until the tracepoints are actually enabled.
diff --git a/mm/Makefile b/mm/Makefile
index cfdd481d27a5..deb467edca2d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,8 +3,24 @@
#
KASAN_SANITIZE_slab_common.o := n
+KASAN_SANITIZE_slab.o := n
KASAN_SANITIZE_slub.o := n
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -81,3 +97,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
+obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c554d173a65f..0c6317b7db38 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -898,7 +898,7 @@ static atomic_t nr_wb_congested[2];
void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
{
wait_queue_head_t *wqh = &congestion_wqh[sync];
- enum wb_state bit;
+ enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
if (test_and_clear_bit(bit, &congested->state))
@@ -911,7 +911,7 @@ EXPORT_SYMBOL(clear_wb_congested);
void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
{
- enum wb_state bit;
+ enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
if (!test_and_set_bit(bit, &congested->state))
@@ -1026,8 +1026,8 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write,
if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
return -EFAULT;
- printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
- table->procname);
+ pr_warn_once("%s exported in /proc is scheduled for removal\n",
+ table->procname);
*lenp = 2;
*ppos += *lenp;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 91e32bc8517f..0aa7dda52402 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -50,8 +50,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
#define bdebug(fmt, args...) ({ \
if (unlikely(bootmem_debug)) \
- printk(KERN_INFO \
- "bootmem::%s " fmt, \
+ pr_info("bootmem::%s " fmt, \
__func__, ## args); \
})
@@ -680,7 +679,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
/*
* Whoops, we cannot satisfy the allocation request.
*/
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
@@ -755,7 +754,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
if (ptr)
return ptr;
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 93f71d968098..8fa254043801 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -7,6 +7,7 @@
*
* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
*/
+#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
@@ -17,6 +18,8 @@
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -849,16 +852,8 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
ISOLATE_UNEVICTABLE);
- /*
- * In case of fatal failure, release everything that might
- * have been isolated in the previous iteration, and signal
- * the failure back to caller.
- */
- if (!pfn) {
- putback_movable_pages(&cc->migratepages);
- cc->nr_migratepages = 0;
+ if (!pfn)
break;
- }
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
break;
@@ -1188,11 +1183,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/*
* Mark that the PG_migrate_skip information should be cleared
- * by kswapd when it goes to sleep. kswapd does not set the
+ * by kswapd when it goes to sleep. kcompactd does not set the
* flag itself as the decision to be clear should be directly
* based on an allocation request.
*/
- if (!current_is_kswapd())
+ if (cc->direct_compaction)
zone->compact_blockskip_flush = true;
return COMPACT_COMPLETE;
@@ -1335,10 +1330,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
/*
* Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
+ * is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ if (compaction_restarting(zone, cc->order))
__reset_isolation_suitable(zone);
/*
@@ -1474,6 +1468,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
.mode = mode,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
+ .direct_compaction = true,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1736,4 +1731,225 @@ void compaction_unregister_node(struct node *node)
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+static inline bool kcompactd_work_requested(pg_data_t *pgdat)
+{
+ return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
+}
+
+static bool kcompactd_node_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+
+ for (zoneid = 0; zoneid < classzone_idx; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
+ classzone_idx) == COMPACT_CONTINUE)
+ return true;
+ }
+
+ return false;
+}
+
+static void kcompactd_do_work(pg_data_t *pgdat)
+{
+ /*
+ * With no special task, compact all zones so that a page of requested
+ * order is allocatable.
+ */
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = pgdat->kcompactd_max_order,
+ .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+
+ };
+ bool success = false;
+
+ trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
+ cc.classzone_idx);
+ count_vm_event(KCOMPACTD_WAKE);
+
+ for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) {
+ int status;
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_deferred(zone, cc.order))
+ continue;
+
+ if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+ COMPACT_CONTINUE)
+ continue;
+
+ cc.nr_freepages = 0;
+ cc.nr_migratepages = 0;
+ cc.zone = zone;
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ if (kthread_should_stop())
+ return;
+ status = compact_zone(zone, &cc);
+
+ if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
+ cc.classzone_idx, 0)) {
+ success = true;
+ compaction_defer_reset(zone, cc.order, false);
+ } else if (status == COMPACT_COMPLETE) {
+ /*
+ * We use sync migration mode here, so we defer like
+ * sync direct compaction does.
+ */
+ defer_compaction(zone, cc.order);
+ }
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+
+ /*
+ * Regardless of success, we are done until woken up next. But remember
+ * the requested order/classzone_idx in case it was higher/tighter than
+ * our current ones
+ */
+ if (pgdat->kcompactd_max_order <= cc.order)
+ pgdat->kcompactd_max_order = 0;
+ if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+}
+
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ if (!order)
+ return;
+
+ if (pgdat->kcompactd_max_order < order)
+ pgdat->kcompactd_max_order = order;
+
+ if (pgdat->kcompactd_classzone_idx > classzone_idx)
+ pgdat->kcompactd_classzone_idx = classzone_idx;
+
+ if (!waitqueue_active(&pgdat->kcompactd_wait))
+ return;
+
+ if (!kcompactd_node_suitable(pgdat))
+ return;
+
+ trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
+ classzone_idx);
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+}
+
+/*
+ * The background compaction daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kcompactd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ set_freezable();
+
+ pgdat->kcompactd_max_order = 0;
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+
+ while (!kthread_should_stop()) {
+ trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+ wait_event_freezable(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat));
+
+ kcompactd_do_work(pgdat);
+ }
+
+ return 0;
+}
+
+/*
+ * This kcompactd start function will be called by init and node-hot-add.
+ * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
+ */
+int kcompactd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kcompactd)
+ return 0;
+
+ pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ if (IS_ERR(pgdat->kcompactd)) {
+ pr_err("Failed to start kcompactd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kcompactd);
+ pgdat->kcompactd = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kcompactd_stop(int nid)
+{
+ struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
+
+ if (kcompactd) {
+ kthread_stop(kcompactd);
+ NODE_DATA(nid)->kcompactd = NULL;
+ }
+}
+
+/*
+ * It's optimal to keep kcompactd on the same CPUs as their memory, but
+ * not required for correctness. So if the last cpu in a node goes
+ * away, we get changed to run anywhere: as the first one comes back,
+ * restore their cpu bindings.
+ */
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ int nid;
+
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
+
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ }
+ }
+ return NOTIFY_OK;
+}
+
+static int __init kcompactd_init(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ kcompactd_run(nid);
+ hotcpu_notifier(cpu_callback, 0);
+ return 0;
+}
+subsys_initcall(kcompactd_init)
+
#endif /* CONFIG_COMPACTION */
diff --git a/mm/debug.c b/mm/debug.c
index df7247b0b532..8865bfb41b0b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -43,7 +43,7 @@ const struct trace_print_flags vmaflag_names[] = {
void __dump_page(struct page *page, const char *reason)
{
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
- page, atomic_read(&page->_count), page_mapcount(page),
+ page, page_ref_count(page), page_mapcount(page),
page->mapping, page->index);
if (PageCompound(page))
pr_cont(" compound_mapcount: %d", compound_mapcount(page));
diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c
new file mode 100644
index 000000000000..1aef3d562e52
--- /dev/null
+++ b/mm/debug_page_ref.c
@@ -0,0 +1,54 @@
+#include <linux/mm_types.h>
+#include <linux/tracepoint.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_ref.h>
+
+void __page_ref_set(struct page *page, int v)
+{
+ trace_page_ref_set(page, v);
+}
+EXPORT_SYMBOL(__page_ref_set);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_set);
+
+void __page_ref_mod(struct page *page, int v)
+{
+ trace_page_ref_mod(page, v);
+}
+EXPORT_SYMBOL(__page_ref_mod);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod);
+
+void __page_ref_mod_and_test(struct page *page, int v, int ret)
+{
+ trace_page_ref_mod_and_test(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_mod_and_test);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test);
+
+void __page_ref_mod_and_return(struct page *page, int v, int ret)
+{
+ trace_page_ref_mod_and_return(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_mod_and_return);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return);
+
+void __page_ref_mod_unless(struct page *page, int v, int u)
+{
+ trace_page_ref_mod_unless(page, v, u);
+}
+EXPORT_SYMBOL(__page_ref_mod_unless);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless);
+
+void __page_ref_freeze(struct page *page, int v, int ret)
+{
+ trace_page_ref_freeze(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_freeze);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze);
+
+void __page_ref_unfreeze(struct page *page, int v)
+{
+ trace_page_ref_unfreeze(page, v);
+}
+EXPORT_SYMBOL(__page_ref_unfreeze);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 57312b5d6e12..abcbfe86c25a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -294,8 +294,7 @@ void dma_pool_destroy(struct dma_pool *pool)
"dma_pool_destroy %s, %p busy\n",
pool->name, page->vaddr);
else
- printk(KERN_ERR
- "dma_pool_destroy %s, %p busy\n",
+ pr_err("dma_pool_destroy %s, %p busy\n",
pool->name, page->vaddr);
/* leak the still-in-use consistent memory */
list_del(&page->page_list);
@@ -424,7 +423,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
"dma_pool_free %s, %p/%lx (bad dma)\n",
pool->name, vaddr, (unsigned long)dma);
else
- printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
+ pr_err("dma_pool_free %s, %p/%lx (bad dma)\n",
pool->name, vaddr, (unsigned long)dma);
return;
}
@@ -438,8 +437,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
pool->name, vaddr, (unsigned long long)dma);
else
- printk(KERN_ERR
- "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+ pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n",
pool->name, vaddr, (unsigned long long)dma);
return;
}
@@ -452,13 +450,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
- "already free\n", pool->name,
- (unsigned long long)dma);
+ dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n",
+ pool->name, (unsigned long long)dma);
else
- printk(KERN_ERR "dma_pool_free %s, dma %Lx "
- "already free\n", pool->name,
- (unsigned long long)dma);
+ pr_err("dma_pool_free %s, dma %Lx already free\n",
+ pool->name, (unsigned long long)dma);
return;
}
}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index b8a5bc66b0c0..b8024fa7101d 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -97,8 +97,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
break;
case POSIX_FADV_WILLNEED:
/* First and last PARTIAL page! */
- start_index = offset >> PAGE_CACHE_SHIFT;
- end_index = endbyte >> PAGE_CACHE_SHIFT;
+ start_index = offset >> PAGE_SHIFT;
+ end_index = endbyte >> PAGE_SHIFT;
/* Careful about overflow on the "+1" */
nrpages = end_index - start_index + 1;
@@ -124,8 +124,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
* preserved on the expectation that it is better to preserve
* needed memory than to discard unneeded memory.
*/
- start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
- end_index = (endbyte >> PAGE_CACHE_SHIFT);
+ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
+ end_index = (endbyte >> PAGE_SHIFT);
if (end_index >= start_index) {
unsigned long count = invalidate_mapping_pages(mapping,
diff --git a/mm/filemap.c b/mm/filemap.c
index 61b441b191ad..f2479af09da9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -265,7 +265,7 @@ void delete_from_page_cache(struct page *page)
if (freepage)
freepage(page);
- page_cache_release(page);
+ put_page(page);
}
EXPORT_SYMBOL(delete_from_page_cache);
@@ -352,8 +352,8 @@ EXPORT_SYMBOL(filemap_flush);
static int __filemap_fdatawait_range(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
- pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
- pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
+ pgoff_t index = start_byte >> PAGE_SHIFT;
+ pgoff_t end = end_byte >> PAGE_SHIFT;
struct pagevec pvec;
int nr_pages;
int ret = 0;
@@ -550,7 +550,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
pgoff_t offset = old->index;
freepage = mapping->a_ops->freepage;
- page_cache_get(new);
+ get_page(new);
new->mapping = mapping;
new->index = offset;
@@ -572,7 +572,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
radix_tree_preload_end();
if (freepage)
freepage(old);
- page_cache_release(old);
+ put_page(old);
}
return error;
@@ -586,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
void **slot;
int error;
- error = __radix_tree_create(&mapping->page_tree, page->index,
+ error = __radix_tree_create(&mapping->page_tree, page->index, 0,
&node, &slot);
if (error)
return error;
@@ -651,7 +651,7 @@ static int __add_to_page_cache_locked(struct page *page,
return error;
}
- page_cache_get(page);
+ get_page(page);
page->mapping = mapping;
page->index = offset;
@@ -675,7 +675,7 @@ err_insert:
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
mem_cgroup_cancel_charge(page, memcg, false);
- page_cache_release(page);
+ put_page(page);
return error;
}
@@ -1083,7 +1083,7 @@ repeat:
* include/linux/pagemap.h for details.
*/
if (unlikely(page != *pagep)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
}
@@ -1121,7 +1121,7 @@ repeat:
/* Has the page been truncated? */
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
VM_BUG_ON_PAGE(page->index != offset, page);
@@ -1168,7 +1168,7 @@ repeat:
if (fgp_flags & FGP_LOCK) {
if (fgp_flags & FGP_NOWAIT) {
if (!trylock_page(page)) {
- page_cache_release(page);
+ put_page(page);
return NULL;
}
} else {
@@ -1178,7 +1178,7 @@ repeat:
/* Has the page been truncated? */
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
VM_BUG_ON_PAGE(page->index != offset, page);
@@ -1209,7 +1209,7 @@ no_page:
err = add_to_page_cache_lru(page, mapping, offset,
gfp_mask & GFP_RECLAIM_MASK);
if (unlikely(err)) {
- page_cache_release(page);
+ put_page(page);
page = NULL;
if (err == -EEXIST)
goto repeat;
@@ -1255,7 +1255,6 @@ unsigned find_get_entries(struct address_space *mapping,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1263,8 +1262,10 @@ repeat:
if (unlikely(!page))
continue;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
/*
* A shadow entry of a recently evicted page, a swap
* entry from shmem/tmpfs or a DAX entry. Return it
@@ -1277,7 +1278,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
export:
@@ -1317,7 +1318,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1327,13 +1327,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- WARN_ON(iter.index);
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1348,7 +1343,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
@@ -1384,7 +1379,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
struct page *page;
repeat:
@@ -1395,12 +1389,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1415,7 +1405,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
@@ -1425,7 +1415,7 @@ repeat:
* negatives, which is just confusing to the caller.
*/
if (page->mapping == NULL || page->index != iter.index) {
- page_cache_release(page);
+ put_page(page);
break;
}
@@ -1460,7 +1450,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *page;
@@ -1471,12 +1460,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page.
@@ -1497,7 +1482,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
@@ -1539,7 +1524,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, start, tag) {
struct page *page;
@@ -1549,12 +1533,8 @@ repeat:
continue;
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
@@ -1569,7 +1549,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
export:
@@ -1630,11 +1610,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
unsigned int prev_offset;
int error = 0;
- index = *ppos >> PAGE_CACHE_SHIFT;
- prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
- prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
- last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
- offset = *ppos & ~PAGE_CACHE_MASK;
+ index = *ppos >> PAGE_SHIFT;
+ prev_index = ra->prev_pos >> PAGE_SHIFT;
+ prev_offset = ra->prev_pos & (PAGE_SIZE-1);
+ last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
+ offset = *ppos & ~PAGE_MASK;
for (;;) {
struct page *page;
@@ -1668,7 +1648,7 @@ find_page:
if (PageUptodate(page))
goto page_ok;
- if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+ if (inode->i_blkbits == PAGE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
if (!trylock_page(page))
@@ -1692,18 +1672,18 @@ page_ok:
*/
isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (isize - 1) >> PAGE_SHIFT;
if (unlikely(!isize || index > end_index)) {
- page_cache_release(page);
+ put_page(page);
goto out;
}
/* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_CACHE_SIZE;
+ nr = PAGE_SIZE;
if (index == end_index) {
- nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ nr = ((isize - 1) & ~PAGE_MASK) + 1;
if (nr <= offset) {
- page_cache_release(page);
+ put_page(page);
goto out;
}
}
@@ -1731,11 +1711,11 @@ page_ok:
ret = copy_page_to_iter(page, offset, nr, iter);
offset += ret;
- index += offset >> PAGE_CACHE_SHIFT;
- offset &= ~PAGE_CACHE_MASK;
+ index += offset >> PAGE_SHIFT;
+ offset &= ~PAGE_MASK;
prev_offset = offset;
- page_cache_release(page);
+ put_page(page);
written += ret;
if (!iov_iter_count(iter))
goto out;
@@ -1755,7 +1735,7 @@ page_not_up_to_date_locked:
/* Did it get truncated before we got the lock? */
if (!page->mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
continue;
}
@@ -1777,7 +1757,7 @@ readpage:
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
- page_cache_release(page);
+ put_page(page);
error = 0;
goto find_page;
}
@@ -1794,7 +1774,7 @@ readpage:
* invalidate_mapping_pages got it
*/
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto find_page;
}
unlock_page(page);
@@ -1809,7 +1789,7 @@ readpage:
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
- page_cache_release(page);
+ put_page(page);
goto out;
no_cached_page:
@@ -1825,7 +1805,7 @@ no_cached_page:
error = add_to_page_cache_lru(page, mapping, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error) {
- page_cache_release(page);
+ put_page(page);
if (error == -EEXIST) {
error = 0;
goto find_page;
@@ -1837,10 +1817,10 @@ no_cached_page:
out:
ra->prev_pos = prev_index;
- ra->prev_pos <<= PAGE_CACHE_SHIFT;
+ ra->prev_pos <<= PAGE_SHIFT;
ra->prev_pos |= prev_offset;
- *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
+ *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
file_accessed(filp);
return written ? written : error;
}
@@ -1860,15 +1840,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t retval = 0;
loff_t *ppos = &iocb->ki_pos;
loff_t pos = *ppos;
+ size_t count = iov_iter_count(iter);
+
+ if (!count)
+ goto out; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- size_t count = iov_iter_count(iter);
loff_t size;
- if (!count)
- goto out; /* skip atime */
size = i_size_read(inode);
retval = filemap_write_and_wait_range(mapping, pos,
pos + count - 1);
@@ -1931,7 +1912,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
else if (ret == -EEXIST)
ret = 0; /* losing race to add is OK */
- page_cache_release(page);
+ put_page(page);
} while (ret == AOP_TRUNCATED_PAGE);
@@ -2041,8 +2022,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
loff_t size;
int ret = 0;
- size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
- if (offset >= size >> PAGE_CACHE_SHIFT)
+ size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (offset >= size >> PAGE_SHIFT)
return VM_FAULT_SIGBUS;
/*
@@ -2068,7 +2049,7 @@ retry_find:
}
if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
- page_cache_release(page);
+ put_page(page);
return ret | VM_FAULT_RETRY;
}
@@ -2091,10 +2072,10 @@ retry_find:
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
- if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
+ size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= size >> PAGE_SHIFT)) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return VM_FAULT_SIGBUS;
}
@@ -2139,7 +2120,7 @@ page_not_uptodate:
if (!PageUptodate(page))
error = -EIO;
}
- page_cache_release(page);
+ put_page(page);
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
@@ -2171,10 +2152,11 @@ repeat:
if (unlikely(!page))
goto next;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- break;
- else
- goto next;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ goto next;
}
if (!page_cache_get_speculative(page))
@@ -2182,7 +2164,7 @@ repeat:
/* Has the page moved? */
if (unlikely(page != *slot)) {
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
@@ -2196,8 +2178,8 @@ repeat:
if (page->mapping != mapping || !PageUptodate(page))
goto unlock;
- size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
- if (page->index >= size >> PAGE_CACHE_SHIFT)
+ size = round_up(i_size_read(mapping->host), PAGE_SIZE);
+ if (page->index >= size >> PAGE_SHIFT)
goto unlock;
pte = vmf->pte + page->index - vmf->pgoff;
@@ -2213,7 +2195,7 @@ repeat:
unlock:
unlock_page(page);
skip:
- page_cache_release(page);
+ put_page(page);
next:
if (iter.index == vmf->max_pgoff)
break;
@@ -2296,7 +2278,7 @@ static struct page *wait_on_page_read(struct page *page)
if (!IS_ERR(page)) {
wait_on_page_locked(page);
if (!PageUptodate(page)) {
- page_cache_release(page);
+ put_page(page);
page = ERR_PTR(-EIO);
}
}
@@ -2319,7 +2301,7 @@ repeat:
return ERR_PTR(-ENOMEM);
err = add_to_page_cache_lru(page, mapping, index, gfp);
if (unlikely(err)) {
- page_cache_release(page);
+ put_page(page);
if (err == -EEXIST)
goto repeat;
/* Presumably ENOMEM for radix tree node */
@@ -2329,7 +2311,7 @@ repeat:
filler:
err = filler(data, page);
if (err < 0) {
- page_cache_release(page);
+ put_page(page);
return ERR_PTR(err);
}
@@ -2382,7 +2364,7 @@ filler:
/* Case c or d, restart the operation */
if (!page->mapping) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
goto repeat;
}
@@ -2529,7 +2511,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
struct iov_iter data;
write_len = iov_iter_count(from);
- end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
+ end = (pos + write_len - 1) >> PAGE_SHIFT;
written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
if (written)
@@ -2543,7 +2525,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
*/
if (mapping->nrpages) {
written = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT, end);
+ pos >> PAGE_SHIFT, end);
/*
* If a page can not be invalidated, return 0 to fall back
* to buffered write.
@@ -2568,7 +2550,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
*/
if (mapping->nrpages) {
invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT, end);
+ pos >> PAGE_SHIFT, end);
}
if (written > 0) {
@@ -2629,8 +2611,8 @@ ssize_t generic_perform_write(struct file *file,
size_t copied; /* Bytes copied from user */
void *fsdata;
- offset = (pos & (PAGE_CACHE_SIZE - 1));
- bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ offset = (pos & (PAGE_SIZE - 1));
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
@@ -2683,7 +2665,7 @@ again:
* because not all segments in the iov can be copied at
* once without a pagefault.
*/
- bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+ bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_single_seg_count(i));
goto again;
}
@@ -2770,8 +2752,8 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
iocb->ki_pos = endbyte + 1;
written += status;
invalidate_mapping_pages(mapping,
- pos >> PAGE_CACHE_SHIFT,
- endbyte >> PAGE_CACHE_SHIFT);
+ pos >> PAGE_SHIFT,
+ endbyte >> PAGE_SHIFT);
} else {
/*
* We don't know how much we wrote, so just return
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index 7cf2b7163222..381bb07ed14f 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -58,7 +58,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
vec->got_ref = true;
vec->is_pfns = false;
- ret = get_user_pages_locked(current, mm, start, nr_frames,
+ ret = get_user_pages_locked(start, nr_frames,
write, force, (struct page **)(vec->ptrs), &locked);
goto out;
}
diff --git a/mm/gup.c b/mm/gup.c
index 7bf19ffa2199..c057784c8444 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -14,6 +14,7 @@
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
+#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -363,6 +364,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
return -ENOENT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
+ if (*flags & FOLL_REMOTE)
+ fault_flags |= FAULT_FLAG_REMOTE;
if (nonblocking)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
if (*flags & FOLL_NOWAIT)
@@ -413,11 +416,13 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
vm_flags_t vm_flags = vma->vm_flags;
+ int write = (gup_flags & FOLL_WRITE);
+ int foreign = (gup_flags & FOLL_REMOTE);
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
- if (gup_flags & FOLL_WRITE) {
+ if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
@@ -443,6 +448,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (!(vm_flags & VM_MAYREAD))
return -EFAULT;
}
+ /*
+ * gups are always data accesses, not instruction
+ * fetches, so execute=false here
+ */
+ if (!arch_vma_access_permitted(vma, write, false, foreign))
+ return -EFAULT;
return 0;
}
@@ -609,6 +620,28 @@ next_page:
}
EXPORT_SYMBOL(__get_user_pages);
+bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
+{
+ bool write = !!(fault_flags & FAULT_FLAG_WRITE);
+ bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
+ vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
+
+ if (!(vm_flags & vma->vm_flags))
+ return false;
+
+ /*
+ * The architecture might have a hardware protection
+ * mechanism other than read/write that can deny access.
+ *
+ * gup always represents data access, not instruction
+ * fetches, so execute=false here:
+ */
+ if (!arch_vma_access_permitted(vma, write, false, foreign))
+ return false;
+
+ return true;
+}
+
/*
* fixup_user_fault() - manually resolve a user page fault
* @tsk: the task_struct to use for page fault accounting, or
@@ -644,7 +677,6 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
bool *unlocked)
{
struct vm_area_struct *vma;
- vm_flags_t vm_flags;
int ret, major = 0;
if (unlocked)
@@ -655,8 +687,7 @@ retry:
if (!vma || address < vma->vm_start)
return -EFAULT;
- vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
- if (!(vm_flags & vma->vm_flags))
+ if (!vma_permits_fault(vma, fault_flags))
return -EFAULT;
ret = handle_mm_fault(mm, vma, address, fault_flags);
@@ -807,13 +838,13 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
* if (locked)
* up_read(&mm->mmap_sem);
*/
-long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
int *locked)
{
- return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
- pages, NULL, locked, true, FOLL_TOUCH);
+ return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ write, force, pages, NULL, locked, true,
+ FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);
@@ -860,17 +891,16 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
* or if "force" shall be set to 1 (get_user_pages_fast misses the
* "force" parameter).
*/
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
- return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
- force, pages, FOLL_TOUCH);
+ return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
+ write, force, pages, FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
/*
- * get_user_pages() - pin user pages in memory
+ * get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
@@ -924,12 +954,30 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* should use get_user_pages because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages, int write,
- int force, struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ struct vm_area_struct **vmas)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
- pages, vmas, NULL, false, FOLL_TOUCH);
+ pages, vmas, NULL, false,
+ FOLL_TOUCH | FOLL_REMOTE);
+}
+EXPORT_SYMBOL(get_user_pages_remote);
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's. We also
+ * obviously don't pass FOLL_REMOTE in here.
+ */
+long get_user_pages(unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ write, force, pages, vmas, NULL, false,
+ FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
@@ -1058,7 +1106,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
* @addr: user address
*
* Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by page_cache_release() or put_page().
+ * to be freed afterwards by put_page().
*
* Returns NULL on any kind of failure - a hole must then be inserted into
* the corefile, to preserve alignment with its headers; and also returns
@@ -1144,6 +1192,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
pte_protnone(pte) || (write && !pte_write(pte)))
goto pte_unmap;
+ if (!arch_pte_access_permitted(pte, write))
+ goto pte_unmap;
+
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
head = compound_head(page);
@@ -1439,7 +1490,6 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- struct mm_struct *mm = current->mm;
int nr, ret;
start &= PAGE_MASK;
@@ -1451,8 +1501,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(current, mm, start,
- nr_pages - nr, write, 0, pages);
+ ret = get_user_pages_unlocked(start, nr_pages - nr, write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1ea21e203a70..b49ee126d4d1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -78,12 +78,12 @@ unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
/* default scan 8*512 pte (or vmas) every 30 second */
-static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
@@ -98,7 +98,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* it would have happened if the vma was large enough during page
* fault.
*/
-static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_none __read_mostly;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
@@ -168,8 +168,7 @@ static void set_recommended_min_free_kbytes(void)
if (recommended_min > min_free_kbytes) {
if (user_min_free_kbytes >= 0)
- pr_info("raising min_free_kbytes from %d to %lu "
- "to help transparent hugepage allocations\n",
+ pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
min_free_kbytes, recommended_min);
min_free_kbytes = recommended_min;
@@ -233,7 +232,7 @@ retry:
return READ_ONCE(huge_zero_page);
}
-static void put_huge_zero_page(void)
+void put_huge_zero_page(void)
{
/*
* Counter should never go to zero here. Only shrinker can put
@@ -270,37 +269,35 @@ static struct shrinker huge_zero_page_shrinker = {
#ifdef CONFIG_SYSFS
-static ssize_t double_flag_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf,
- enum transparent_hugepage_flag enabled,
- enum transparent_hugepage_flag req_madv)
-{
- if (test_bit(enabled, &transparent_hugepage_flags)) {
- VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
- return sprintf(buf, "[always] madvise never\n");
- } else if (test_bit(req_madv, &transparent_hugepage_flags))
- return sprintf(buf, "always [madvise] never\n");
- else
- return sprintf(buf, "always madvise [never]\n");
-}
-static ssize_t double_flag_store(struct kobject *kobj,
+static ssize_t triple_flag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count,
enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag deferred,
enum transparent_hugepage_flag req_madv)
{
- if (!memcmp("always", buf,
+ if (!memcmp("defer", buf,
+ min(sizeof("defer")-1, count))) {
+ if (enabled == deferred)
+ return -EINVAL;
+ clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ set_bit(deferred, &transparent_hugepage_flags);
+ } else if (!memcmp("always", buf,
min(sizeof("always")-1, count))) {
- set_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
clear_bit(req_madv, &transparent_hugepage_flags);
+ set_bit(enabled, &transparent_hugepage_flags);
} else if (!memcmp("madvise", buf,
min(sizeof("madvise")-1, count))) {
clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
set_bit(req_madv, &transparent_hugepage_flags);
} else if (!memcmp("never", buf,
min(sizeof("never")-1, count))) {
clear_bit(enabled, &transparent_hugepage_flags);
clear_bit(req_madv, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
} else
return -EINVAL;
@@ -310,17 +307,22 @@ static ssize_t double_flag_store(struct kobject *kobj,
static ssize_t enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return double_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_FLAG,
- TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+ if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "[always] madvise never\n");
+ else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always [madvise] never\n");
+ else
+ return sprintf(buf, "always madvise [never]\n");
}
+
static ssize_t enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
ssize_t ret;
- ret = double_flag_store(kobj, attr, buf, count,
+ ret = triple_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
@@ -378,16 +380,23 @@ static ssize_t single_flag_store(struct kobject *kobj,
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return double_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
- TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "[always] defer madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always [defer] madvise never\n");
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always defer [madvise] never\n");
+ else
+ return sprintf(buf, "always defer madvise [never]\n");
+
}
static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- return double_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ return triple_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static struct kobj_attribute defrag_attr =
@@ -660,6 +669,18 @@ static int __init hugepage_init(void)
return -EINVAL;
}
+ khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
+ khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
+ /*
+ * hugepages can't be allocated by the buddy allocator
+ */
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
+ /*
+ * we use page->mapping and page->index in second tail page
+ * as list_head: assuming THP order >= 2
+ */
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
err = hugepage_init_sysfs(&hugepage_kobj);
if (err)
goto err_sysfs;
@@ -764,7 +785,6 @@ void prep_transhuge_page(struct page *page)
* we use page->mapping and page->indexlru in second tail page
* as list_head: assuming THP order >= 2
*/
- BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
INIT_LIST_HEAD(page_deferred_list(page));
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
@@ -843,9 +863,30 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
return 0;
}
-static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
+/*
+ * If THP is set to always then directly reclaim/compact as necessary
+ * If set to defer then do no reclaim and defer to khugepaged
+ * If set to madvise and the VMA is flagged then directly reclaim/compact
+ */
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+{
+ gfp_t reclaim_flags = 0;
+
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) &&
+ (vma->vm_flags & VM_HUGEPAGE))
+ reclaim_flags = __GFP_DIRECT_RECLAIM;
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ reclaim_flags = __GFP_KSWAPD_RECLAIM;
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ reclaim_flags = __GFP_DIRECT_RECLAIM;
+
+ return GFP_TRANSHUGE | reclaim_flags;
+}
+
+/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
+static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
- return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
+ return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
}
/* Caller must hold page table lock. */
@@ -919,7 +960,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
return ret;
}
- gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ gfp = alloc_hugepage_direct_gfpmask(vma);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
@@ -1257,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
/*
* We can only reuse the page if nobody else maps the huge page or it's
- * part. We can do it by checking page_mapcount() on each sub-page, but
- * it's expensive.
- * The cheaper way is to check page_count() to be equal 1: every
- * mapcount takes page reference reference, so this way we can
- * guarantee, that the PMD is the only mapping.
- * This can give false negative if somebody pinned the page, but that's
- * fine.
+ * part.
*/
- if (page_mapcount(page) == 1 && page_count(page) == 1) {
+ if (page_trans_huge_mapcount(page, NULL) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1279,7 +1314,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ huge_gfp = alloc_hugepage_direct_gfpmask(vma);
new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
@@ -1643,12 +1678,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (vma_is_dax(vma)) {
spin_unlock(ptl);
if (is_huge_zero_pmd(orig_pmd))
- put_huge_zero_page();
+ tlb_remove_page(tlb, pmd_page(orig_pmd));
} else if (is_huge_zero_pmd(orig_pmd)) {
pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
- put_huge_zero_page();
+ tlb_remove_page(tlb, pmd_page(orig_pmd));
} else {
struct page *page = pmd_page(orig_pmd);
page_remove_rmap(page, true);
@@ -1919,10 +1954,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
* page fault if needed.
*/
return 0;
- if (vma->vm_ops)
+ if (vma->vm_ops || (vm_flags & VM_NO_THP))
/* khugepaged not yet working on file or special mappings */
return 0;
- VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
@@ -2039,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (pte_write(pteval)) {
writable = true;
} else {
- if (PageSwapCache(page) && !reuse_swap_page(page)) {
+ if (PageSwapCache(page) &&
+ !reuse_swap_page(page, NULL)) {
unlock_page(page);
result = SCAN_SWAP_CACHE_PAGE;
goto out;
@@ -2249,11 +2284,12 @@ static int khugepaged_find_target_node(void)
return 0;
}
-static inline struct page *alloc_hugepage(int defrag)
+static inline struct page *alloc_khugepaged_hugepage(void)
{
struct page *page;
- page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+ page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
+ HPAGE_PMD_ORDER);
if (page)
prep_transhuge_page(page);
return page;
@@ -2264,7 +2300,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
struct page *hpage;
do {
- hpage = alloc_hugepage(khugepaged_defrag());
+ hpage = alloc_khugepaged_hugepage();
if (!hpage) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
if (!*wait)
@@ -2310,8 +2346,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
return false;
if (is_vma_temporary_stack(vma))
return false;
- VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
- return true;
+ return !(vma->vm_flags & VM_NO_THP);
}
static void collapse_huge_page(struct mm_struct *mm,
@@ -2335,8 +2370,7 @@ static void collapse_huge_page(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
/* Only allocate from the target node */
- gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
- __GFP_THISNODE;
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
/* release the mmap_sem read lock. */
new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
@@ -2537,7 +2571,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
}
khugepaged_node_load[node]++;
if (!PageLRU(page)) {
- result = SCAN_SCAN_ABORT;
+ result = SCAN_PAGE_LRU;
goto out_unmap;
}
if (PageLocked(page)) {
@@ -2857,7 +2891,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
- atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+ page_ref_add(page, HPAGE_PMD_NR - 1);
write = pmd_write(*pmd);
young = pmd_young(*pmd);
dirty = pmd_dirty(*pmd);
@@ -2947,44 +2981,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address)
+ unsigned long address, bool freeze)
{
spinlock_t *ptl;
struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
unsigned long haddr = address & HPAGE_PMD_MASK;
mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
ptl = pmd_lock(mm, pmd);
if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
+ struct page *page = pmd_page(*pmd);
if (PageMlocked(page))
- get_page(page);
- else
- page = NULL;
+ clear_page_mlock(page);
} else if (!pmd_devmap(*pmd))
goto out;
- __split_huge_pmd_locked(vma, pmd, haddr, false);
+ __split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
- if (page) {
- lock_page(page);
- munlock_vma_page(page);
- unlock_page(page);
- put_page(page);
- }
}
-static void split_huge_pmd_address(struct vm_area_struct *vma,
- unsigned long address)
+void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+ bool freeze, struct page *page)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
-
pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return;
@@ -2996,11 +3019,20 @@ static void split_huge_pmd_address(struct vm_area_struct *vma,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
return;
+
+ /*
+ * If caller asks to setup a migration entries, we need a page to check
+ * pmd against. Otherwise we can end up replacing wrong page.
+ */
+ VM_BUG_ON(freeze && !page);
+ if (page && page != pmd_page(*pmd))
+ return;
+
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_pmd(vma, pmd, address);
+ __split_huge_pmd(vma, pmd, address, freeze);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3016,7 +3048,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (start & ~HPAGE_PMD_MASK &&
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, start);
+ split_huge_pmd_address(vma, start, false, NULL);
/*
* If the new end address isn't hpage aligned and it could
@@ -3026,7 +3058,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (end & ~HPAGE_PMD_MASK &&
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, end);
+ split_huge_pmd_address(vma, end, false, NULL);
/*
* If we're also updating the vma->vm_next->vm_start, if the new
@@ -3040,184 +3072,36 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_pmd_address(next, nstart);
+ split_huge_pmd_address(next, nstart, false, NULL);
}
}
-static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
+static void freeze_page(struct page *page)
{
- unsigned long haddr = address & HPAGE_PMD_MASK;
- spinlock_t *ptl;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int i, nr = HPAGE_PMD_NR;
-
- /* Skip pages which doesn't belong to the VMA */
- if (address < vma->vm_start) {
- int off = (vma->vm_start - address) >> PAGE_SHIFT;
- page += off;
- nr -= off;
- address = vma->vm_start;
- }
-
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
- return;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return;
- pmd = pmd_offset(pud, address);
- ptl = pmd_lock(vma->vm_mm, pmd);
- if (!pmd_present(*pmd)) {
- spin_unlock(ptl);
- return;
- }
- if (pmd_trans_huge(*pmd)) {
- if (page == pmd_page(*pmd))
- __split_huge_pmd_locked(vma, pmd, haddr, true);
- spin_unlock(ptl);
- return;
- }
- spin_unlock(ptl);
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
- pte_t entry, swp_pte;
- swp_entry_t swp_entry;
-
- /*
- * We've just crossed page table boundary: need to map next one.
- * It can happen if THP was mremaped to non PMD-aligned address.
- */
- if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
- pte_unmap_unlock(pte - 1, ptl);
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
- pte = pte_offset_map_lock(vma->vm_mm, pmd,
- address, &ptl);
- }
-
- if (!pte_present(*pte))
- continue;
- if (page_to_pfn(page) != pte_pfn(*pte))
- continue;
- flush_cache_page(vma, address, page_to_pfn(page));
- entry = ptep_clear_flush(vma, address, pte);
- if (pte_dirty(entry))
- SetPageDirty(page);
- swp_entry = make_migration_entry(page, pte_write(entry));
- swp_pte = swp_entry_to_pte(swp_entry);
- if (pte_soft_dirty(entry))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(vma->vm_mm, address, pte, swp_pte);
- page_remove_rmap(page, false);
- put_page(page);
- }
- pte_unmap_unlock(pte - 1, ptl);
-}
-
-static void freeze_page(struct anon_vma *anon_vma, struct page *page)
-{
- struct anon_vma_chain *avc;
- pgoff_t pgoff = page_to_pgoff(page);
+ enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
+ TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+ int i, ret;
VM_BUG_ON_PAGE(!PageHead(page), page);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
- pgoff + HPAGE_PMD_NR - 1) {
- unsigned long address = __vma_address(page, avc->vma);
-
- mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- freeze_page_vma(avc->vma, page, address);
- mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- }
-}
-
-static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
-{
- spinlock_t *ptl;
- pmd_t *pmd;
- pte_t *pte, entry;
- swp_entry_t swp_entry;
- unsigned long haddr = address & HPAGE_PMD_MASK;
- int i, nr = HPAGE_PMD_NR;
-
- /* Skip pages which doesn't belong to the VMA */
- if (address < vma->vm_start) {
- int off = (vma->vm_start - address) >> PAGE_SHIFT;
- page += off;
- nr -= off;
- address = vma->vm_start;
- }
-
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
- /*
- * We've just crossed page table boundary: need to map next one.
- * It can happen if THP was mremaped to non-PMD aligned address.
- */
- if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
- pte_unmap_unlock(pte - 1, ptl);
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
- pte = pte_offset_map_lock(vma->vm_mm, pmd,
- address, &ptl);
- }
-
- if (!is_swap_pte(*pte))
- continue;
-
- swp_entry = pte_to_swp_entry(*pte);
- if (!is_migration_entry(swp_entry))
- continue;
- if (migration_entry_to_page(swp_entry) != page)
- continue;
-
- get_page(page);
- page_add_anon_rmap(page, vma, address, false);
-
- entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
- if (PageDirty(page))
- entry = pte_mkdirty(entry);
- if (is_write_migration_entry(swp_entry))
- entry = maybe_mkwrite(entry, vma);
-
- flush_dcache_page(page);
- set_pte_at(vma->vm_mm, address, pte, entry);
+ /* We only need TTU_SPLIT_HUGE_PMD once */
+ ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
+ for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
+ /* Cut short if the page is unmapped */
+ if (page_count(page) == 1)
+ return;
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, pte);
+ ret = try_to_unmap(page + i, ttu_flags);
}
- pte_unmap_unlock(pte - 1, ptl);
+ VM_BUG_ON(ret);
}
-static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+static void unfreeze_page(struct page *page)
{
- struct anon_vma_chain *avc;
- pgoff_t pgoff = page_to_pgoff(page);
-
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
- pgoff, pgoff + HPAGE_PMD_NR - 1) {
- unsigned long address = __vma_address(page, avc->vma);
+ int i;
- mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- unfreeze_page_vma(avc->vma, page, address);
- mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- }
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ remove_migration_ptes(page + i, page + i, true);
}
static void __split_huge_page_tail(struct page *head, int tail,
@@ -3226,7 +3110,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
struct page *page_tail = head + tail;
VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
- VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+ VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
/*
* tail_page->_count is zero and not changing from under us. But
@@ -3239,7 +3123,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
* atomic_set() here would be safe on all archs (and not only on x86),
* it's safer to use atomic_inc().
*/
- atomic_inc(&page_tail->_count);
+ page_ref_inc(page_tail);
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -3295,7 +3179,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
ClearPageCompound(head);
spin_unlock_irq(&zone->lru_lock);
- unfreeze_page(page_anon_vma(head), head);
+ unfreeze_page(head);
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
@@ -3334,6 +3218,64 @@ int total_mapcount(struct page *page)
}
/*
+ * This calculates accurately how many mappings a transparent hugepage
+ * has (unlike page_mapcount() which isn't fully accurate). This full
+ * accuracy is primarily needed to know if copy-on-write faults can
+ * reuse the page and change the mapping to read-write instead of
+ * copying them. At the same time this returns the total_mapcount too.
+ *
+ * The function returns the highest mapcount any one of the subpages
+ * has. If the return value is one, even if different processes are
+ * mapping different subpages of the transparent hugepage, they can
+ * all reuse it, because each process is reusing a different subpage.
+ *
+ * The total_mapcount is instead counting all virtual mappings of the
+ * subpages. If the total_mapcount is equal to "one", it tells the
+ * caller all mappings belong to the same "mm" and in turn the
+ * anon_vma of the transparent hugepage can become the vma->anon_vma
+ * local one as no other process may be mapping any of the subpages.
+ *
+ * It would be more accurate to replace page_mapcount() with
+ * page_trans_huge_mapcount(), however we only use
+ * page_trans_huge_mapcount() in the copy-on-write faults where we
+ * need full accuracy to avoid breaking page pinning, because
+ * page_trans_huge_mapcount() is slower than page_mapcount().
+ */
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+{
+ int i, ret, _total_mapcount, mapcount;
+
+ /* hugetlbfs shouldn't call it */
+ VM_BUG_ON_PAGE(PageHuge(page), page);
+
+ if (likely(!PageTransCompound(page))) {
+ mapcount = atomic_read(&page->_mapcount) + 1;
+ if (total_mapcount)
+ *total_mapcount = mapcount;
+ return mapcount;
+ }
+
+ page = compound_head(page);
+
+ _total_mapcount = ret = 0;
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mapcount = atomic_read(&page[i]._mapcount) + 1;
+ ret = max(ret, mapcount);
+ _total_mapcount += mapcount;
+ }
+ if (PageDoubleMap(page)) {
+ ret -= 1;
+ _total_mapcount -= HPAGE_PMD_NR;
+ }
+ mapcount = compound_mapcount(page);
+ ret += mapcount;
+ _total_mapcount += mapcount;
+ if (total_mapcount)
+ *total_mapcount = _total_mapcount;
+ return ret;
+}
+
+/*
* This function splits huge page into normal pages. @page can point to any
* subpage of huge page to split. Split doesn't change the position of @page.
*
@@ -3391,7 +3333,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
mlocked = PageMlocked(page);
- freeze_page(anon_vma, head);
+ freeze_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -3420,7 +3362,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
BUG();
} else {
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
- unfreeze_page(anon_vma, head);
+ unfreeze_page(head);
ret = -EBUSY;
}
@@ -3455,6 +3397,7 @@ void deferred_split_huge_page(struct page *page)
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
+ count_vm_event(THP_DEFERRED_SPLIT_PAGE);
list_add_tail(page_deferred_list(page), &pgdata->split_queue);
pgdata->split_queue_len++;
}
@@ -3562,7 +3505,7 @@ next:
}
}
- pr_info("%lu of %lu THP split", split, total);
+ pr_info("%lu of %lu THP split\n", split, total);
return 0;
}
@@ -3573,7 +3516,7 @@ static int __init split_huge_pages_debugfs(void)
{
void *ret;
- ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL,
+ ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
&split_huge_pages_fops);
if (!ret)
pr_warn("Failed to create split_huge_pages in debugfs");
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aefba5a9cc47..19d0d08b396f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2665,7 +2665,7 @@ void __init hugetlb_add_hstate(unsigned int order)
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
- pr_warning("hugepagesz= specified twice, ignoring\n");
+ pr_warn("hugepagesz= specified twice, ignoring\n");
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -2701,8 +2701,7 @@ static int __init hugetlb_nrpages_setup(char *s)
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
- pr_warning("hugepages= specified twice without "
- "interleaving hugepagesz=, ignoring\n");
+ pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
return 1;
}
@@ -3347,7 +3346,7 @@ retry_avoidcopy:
old_page != pagecache_page)
outside_reserve = 1;
- page_cache_get(old_page);
+ get_page(old_page);
/*
* Drop page table lock as buddy allocator may be called. It will
@@ -3365,7 +3364,7 @@ retry_avoidcopy:
* may get SIGKILLed if it later faults.
*/
if (outside_reserve) {
- page_cache_release(old_page);
+ put_page(old_page);
BUG_ON(huge_pte_none(pte));
unmap_ref_private(mm, vma, old_page, address);
BUG_ON(huge_pte_none(pte));
@@ -3426,9 +3425,9 @@ retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out_release_all:
- page_cache_release(new_page);
+ put_page(new_page);
out_release_old:
- page_cache_release(old_page);
+ put_page(old_page);
spin_lock(ptl); /* Caller expects lock to be held */
return ret;
diff --git a/mm/internal.h b/mm/internal.h
index ad9400d759c8..b79abb6721cf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,10 +38,10 @@
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
-static inline void set_page_count(struct page *page, int v)
-{
- atomic_set(&page->_count, v);
-}
+void unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details);
extern int __do_page_cache_readahead(struct address_space *mapping,
struct file *filp, pgoff_t offset, unsigned long nr_to_read,
@@ -64,7 +64,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
static inline void set_page_refcounted(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
+ VM_BUG_ON_PAGE(page_ref_count(page), page);
set_page_count(page, 1);
}
@@ -148,9 +148,6 @@ extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
-#ifdef CONFIG_MEMORY_FAILURE
-extern bool is_free_buddy_page(struct page *page);
-#endif
extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -175,6 +172,7 @@ struct compact_control {
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool direct_compaction; /* False from kcompactd or /proc/... */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
const int alloc_flags; /* alloc flags of a direct compactor */
@@ -393,7 +391,7 @@ extern int mminit_loglevel;
do { \
if (level < mminit_loglevel) { \
if (level <= MMINIT_WARNING) \
- printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
+ pr_warn("mminit::" prefix " " fmt, ##arg); \
else \
printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
} \
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index a61460d9f5b0..131daadf40e4 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,5 +1,6 @@
KASAN_SANITIZE := n
UBSAN_SANITIZE_kasan.o := n
+KCOV_INSTRUMENT := n
CFLAGS_REMOVE_kasan.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 1ad20ade8c91..38f1dd79acdb 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -17,7 +17,9 @@
#define DISABLE_BRANCH_PROFILING
#include <linux/export.h>
+#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/kasan.h>
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/linkage.h>
@@ -32,7 +34,6 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
-#include <linux/kasan.h>
#include "kasan.h"
#include "../slab.h"
@@ -334,6 +335,59 @@ void kasan_free_pages(struct page *page, unsigned int order)
KASAN_FREE_PAGE);
}
+#ifdef CONFIG_SLAB
+/*
+ * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
+ * For larger allocations larger redzones are used.
+ */
+static size_t optimal_redzone(size_t object_size)
+{
+ int rz =
+ object_size <= 64 - 16 ? 16 :
+ object_size <= 128 - 32 ? 32 :
+ object_size <= 512 - 64 ? 64 :
+ object_size <= 4096 - 128 ? 128 :
+ object_size <= (1 << 14) - 256 ? 256 :
+ object_size <= (1 << 15) - 512 ? 512 :
+ object_size <= (1 << 16) - 1024 ? 1024 : 2048;
+ return rz;
+}
+
+void kasan_cache_create(struct kmem_cache *cache, size_t *size,
+ unsigned long *flags)
+{
+ int redzone_adjust;
+ /* Make sure the adjusted size is still less than
+ * KMALLOC_MAX_CACHE_SIZE.
+ * TODO: this check is only useful for SLAB, but not SLUB. We'll need
+ * to skip it for SLUB when it starts using kasan_cache_create().
+ */
+ if (*size > KMALLOC_MAX_CACHE_SIZE -
+ sizeof(struct kasan_alloc_meta) -
+ sizeof(struct kasan_free_meta))
+ return;
+ *flags |= SLAB_KASAN;
+ /* Add alloc meta. */
+ cache->kasan_info.alloc_meta_offset = *size;
+ *size += sizeof(struct kasan_alloc_meta);
+
+ /* Add free meta. */
+ if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta)) {
+ cache->kasan_info.free_meta_offset = *size;
+ *size += sizeof(struct kasan_free_meta);
+ }
+ redzone_adjust = optimal_redzone(cache->object_size) -
+ (*size - cache->object_size);
+ if (redzone_adjust > 0)
+ *size += redzone_adjust;
+ *size = min(KMALLOC_MAX_CACHE_SIZE,
+ max(*size,
+ cache->object_size +
+ optimal_redzone(cache->object_size)));
+}
+#endif
+
void kasan_poison_slab(struct page *page)
{
kasan_poison_shadow(page_address(page),
@@ -351,11 +405,81 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
kasan_poison_shadow(object,
round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
KASAN_KMALLOC_REDZONE);
+#ifdef CONFIG_SLAB
+ if (cache->flags & SLAB_KASAN) {
+ struct kasan_alloc_meta *alloc_info =
+ get_alloc_info(cache, object);
+ alloc_info->state = KASAN_STATE_INIT;
+ }
+#endif
}
-void kasan_slab_alloc(struct kmem_cache *cache, void *object)
+#ifdef CONFIG_SLAB
+static inline int in_irqentry_text(unsigned long ptr)
{
- kasan_kmalloc(cache, object, cache->object_size);
+ return (ptr >= (unsigned long)&__irqentry_text_start &&
+ ptr < (unsigned long)&__irqentry_text_end) ||
+ (ptr >= (unsigned long)&__softirqentry_text_start &&
+ ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+static inline void filter_irq_stacks(struct stack_trace *trace)
+{
+ int i;
+
+ if (!trace->nr_entries)
+ return;
+ for (i = 0; i < trace->nr_entries; i++)
+ if (in_irqentry_text(trace->entries[i])) {
+ /* Include the irqentry function into the stack. */
+ trace->nr_entries = i + 1;
+ break;
+ }
+}
+
+static inline depot_stack_handle_t save_stack(gfp_t flags)
+{
+ unsigned long entries[KASAN_STACK_DEPTH];
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .entries = entries,
+ .max_entries = KASAN_STACK_DEPTH,
+ .skip = 0
+ };
+
+ save_stack_trace(&trace);
+ filter_irq_stacks(&trace);
+ if (trace.nr_entries != 0 &&
+ trace.entries[trace.nr_entries-1] == ULONG_MAX)
+ trace.nr_entries--;
+
+ return depot_save_stack(&trace, flags);
+}
+
+static inline void set_track(struct kasan_track *track, gfp_t flags)
+{
+ track->pid = current->pid;
+ track->stack = save_stack(flags);
+}
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+ const void *object)
+{
+ BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
+ return (void *)object + cache->kasan_info.alloc_meta_offset;
+}
+
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+ const void *object)
+{
+ BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
+ return (void *)object + cache->kasan_info.free_meta_offset;
+}
+#endif
+
+void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
+{
+ kasan_kmalloc(cache, object, cache->object_size, flags);
}
void kasan_slab_free(struct kmem_cache *cache, void *object)
@@ -367,10 +491,22 @@ void kasan_slab_free(struct kmem_cache *cache, void *object)
if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
return;
+#ifdef CONFIG_SLAB
+ if (cache->flags & SLAB_KASAN) {
+ struct kasan_free_meta *free_info =
+ get_free_info(cache, object);
+ struct kasan_alloc_meta *alloc_info =
+ get_alloc_info(cache, object);
+ alloc_info->state = KASAN_STATE_FREE;
+ set_track(&free_info->track, GFP_NOWAIT);
+ }
+#endif
+
kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
}
-void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
+void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
+ gfp_t flags)
{
unsigned long redzone_start;
unsigned long redzone_end;
@@ -386,10 +522,20 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
kasan_unpoison_shadow(object, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
KASAN_KMALLOC_REDZONE);
+#ifdef CONFIG_SLAB
+ if (cache->flags & SLAB_KASAN) {
+ struct kasan_alloc_meta *alloc_info =
+ get_alloc_info(cache, object);
+
+ alloc_info->state = KASAN_STATE_ALLOC;
+ alloc_info->alloc_size = size;
+ set_track(&alloc_info->track, flags);
+ }
+#endif
}
EXPORT_SYMBOL(kasan_kmalloc);
-void kasan_kmalloc_large(const void *ptr, size_t size)
+void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
{
struct page *page;
unsigned long redzone_start;
@@ -408,7 +554,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size)
KASAN_PAGE_REDZONE);
}
-void kasan_krealloc(const void *object, size_t size)
+void kasan_krealloc(const void *object, size_t size, gfp_t flags)
{
struct page *page;
@@ -418,9 +564,9 @@ void kasan_krealloc(const void *object, size_t size)
page = virt_to_head_page(object);
if (unlikely(!PageSlab(page)))
- kasan_kmalloc_large(object, size);
+ kasan_kmalloc_large(object, size, flags);
else
- kasan_kmalloc(page->slab_cache, object, size);
+ kasan_kmalloc(page->slab_cache, object, size, flags);
}
void kasan_kfree(void *ptr)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 4f6c62e5c21e..30a2f0ba0e09 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -2,6 +2,7 @@
#define __MM_KASAN_KASAN_H
#include <linux/kasan.h>
+#include <linux/stackdepot.h>
#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
@@ -54,6 +55,42 @@ struct kasan_global {
#endif
};
+/**
+ * Structures to keep alloc and free tracks *
+ */
+
+enum kasan_state {
+ KASAN_STATE_INIT,
+ KASAN_STATE_ALLOC,
+ KASAN_STATE_FREE
+};
+
+#define KASAN_STACK_DEPTH 64
+
+struct kasan_track {
+ u32 pid;
+ depot_stack_handle_t stack;
+};
+
+struct kasan_alloc_meta {
+ struct kasan_track track;
+ u32 state : 2; /* enum kasan_state */
+ u32 alloc_size : 30;
+ u32 reserved;
+};
+
+struct kasan_free_meta {
+ /* Allocator freelist pointer, unused by KASAN. */
+ void **freelist;
+ struct kasan_track track;
+};
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+ const void *object);
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+ const void *object);
+
+
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{
return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 12f222d0224b..60869a5a0124 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -18,6 +18,7 @@
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/stackdepot.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -115,6 +116,53 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
+#ifdef CONFIG_SLAB
+static void print_track(struct kasan_track *track)
+{
+ pr_err("PID = %u\n", track->pid);
+ if (track->stack) {
+ struct stack_trace trace;
+
+ depot_fetch_stack(track->stack, &trace);
+ print_stack_trace(&trace, 0);
+ } else {
+ pr_err("(stack is not available)\n");
+ }
+}
+
+static void object_err(struct kmem_cache *cache, struct page *page,
+ void *object, char *unused_reason)
+{
+ struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ struct kasan_free_meta *free_info;
+
+ dump_stack();
+ pr_err("Object at %p, in cache %s\n", object, cache->name);
+ if (!(cache->flags & SLAB_KASAN))
+ return;
+ switch (alloc_info->state) {
+ case KASAN_STATE_INIT:
+ pr_err("Object not allocated yet\n");
+ break;
+ case KASAN_STATE_ALLOC:
+ pr_err("Object allocated with size %u bytes.\n",
+ alloc_info->alloc_size);
+ pr_err("Allocation:\n");
+ print_track(&alloc_info->track);
+ break;
+ case KASAN_STATE_FREE:
+ pr_err("Object freed, allocated with size %u bytes\n",
+ alloc_info->alloc_size);
+ free_info = get_free_info(cache, object);
+ pr_err("Allocation:\n");
+ print_track(&alloc_info->track);
+ pr_err("Deallocation:\n");
+ print_track(&free_info->track);
+ break;
+ }
+}
+#endif
+
static void print_address_description(struct kasan_access_info *info)
{
const void *addr = info->access_addr;
@@ -126,17 +174,10 @@ static void print_address_description(struct kasan_access_info *info)
if (PageSlab(page)) {
void *object;
struct kmem_cache *cache = page->slab_cache;
- void *last_object;
-
- object = virt_to_obj(cache, page_address(page), addr);
- last_object = page_address(page) +
- page->objects * cache->size;
-
- if (unlikely(object > last_object))
- object = last_object; /* we hit into padding */
-
+ object = nearest_obj(cache, page,
+ (void *)info->access_addr);
object_err(cache, page, object,
- "kasan: bad access detected");
+ "kasan: bad access detected");
return;
}
dump_page(page, "kasan: bad access detected");
@@ -146,7 +187,6 @@ static void print_address_description(struct kasan_access_info *info)
if (!init_task_stack_addr(addr))
pr_err("Address belongs to variable %pS\n", addr);
}
-
dump_stack();
}
@@ -214,8 +254,7 @@ static void kasan_report_error(struct kasan_access_info *info)
*/
kasan_disable_current();
spin_lock_irqsave(&report_lock, flags);
- pr_err("================================="
- "=================================\n");
+ pr_err("==================================================================\n");
if (info->access_addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
if ((unsigned long)info->access_addr < PAGE_SIZE)
@@ -236,8 +275,7 @@ static void kasan_report_error(struct kasan_access_info *info)
print_address_description(info);
print_shadow_for_address(info->first_bad_addr);
}
- pr_err("================================="
- "=================================\n");
+ pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, flags);
kasan_enable_current();
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 6f4f424037c0..5bf191756a4a 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -20,8 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
if (!shadow) {
if (printk_ratelimit())
- printk(KERN_ERR "kmemcheck: failed to allocate "
- "shadow bitmap\n");
+ pr_err("kmemcheck: failed to allocate shadow bitmap\n");
return;
}
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index dcdcadb69533..dd3c23a801b1 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -49,7 +49,7 @@ static int __init kmemleak_test_init(void)
struct test_node *elem;
int i;
- printk(KERN_INFO "Kmemleak testing\n");
+ pr_info("Kmemleak testing\n");
/* make some orphan objects */
pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 25c0ad36fe38..e6429926e957 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -276,7 +276,7 @@ static void kmemleak_disable(void);
* Print a warning and dump the stack trace.
*/
#define kmemleak_warn(x...) do { \
- pr_warning(x); \
+ pr_warn(x); \
dump_stack(); \
kmemleak_warning = 1; \
} while (0)
@@ -543,7 +543,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
- pr_warning("Cannot allocate a kmemleak_object structure\n");
+ pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
return NULL;
}
@@ -596,8 +596,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
else if (parent->pointer + parent->size <= ptr)
link = &parent->rb_node.rb_right;
else {
- kmemleak_stop("Cannot insert 0x%lx into the object "
- "search tree (overlaps existing)\n",
+ kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
ptr);
/*
* No need for parent->lock here since "parent" cannot
@@ -670,8 +669,8 @@ static void delete_object_part(unsigned long ptr, size_t size)
object = find_and_remove_object(ptr, 1);
if (!object) {
#ifdef DEBUG
- kmemleak_warn("Partially freeing unknown object at 0x%08lx "
- "(size %zu)\n", ptr, size);
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+ ptr, size);
#endif
return;
}
@@ -717,8 +716,8 @@ static void paint_ptr(unsigned long ptr, int color)
object = find_and_get_object(ptr, 0);
if (!object) {
- kmemleak_warn("Trying to color unknown object "
- "at 0x%08lx as %s\n", ptr,
+ kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
+ ptr,
(color == KMEMLEAK_GREY) ? "Grey" :
(color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
return;
@@ -764,7 +763,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
if (!area) {
- pr_warning("Cannot allocate a scan area\n");
+ pr_warn("Cannot allocate a scan area\n");
goto out;
}
@@ -1463,8 +1462,8 @@ static void kmemleak_scan(void)
if (new_leaks) {
kmemleak_found_leaks = true;
- pr_info("%d new suspected memory leaks (see "
- "/sys/kernel/debug/kmemleak)\n", new_leaks);
+ pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n",
+ new_leaks);
}
}
@@ -1515,7 +1514,7 @@ static void start_scan_thread(void)
return;
scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
if (IS_ERR(scan_thread)) {
- pr_warning("Failed to create the scan thread\n");
+ pr_warn("Failed to create the scan thread\n");
scan_thread = NULL;
}
}
@@ -1795,8 +1794,7 @@ static void kmemleak_do_cleanup(struct work_struct *work)
if (!kmemleak_found_leaks)
__kmemleak_do_cleanup();
else
- pr_info("Kmemleak disabled without freeing internal data. "
- "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
+ pr_info("Kmemleak disabled without freeing internal data. Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n");
}
static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
@@ -1874,8 +1872,8 @@ void __init kmemleak_init(void)
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
if (crt_early_log > ARRAY_SIZE(early_log))
- pr_warning("Early log buffer exceeded (%d), please increase "
- "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
+ pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n",
+ crt_early_log);
/* the kernel is still in UP mode, so disabling the IRQs is enough */
local_irq_save(flags);
@@ -1960,7 +1958,7 @@ static int __init kmemleak_late_init(void)
dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
&kmemleak_fops);
if (!dentry)
- pr_warning("Failed to create the debugfs kmemleak file\n");
+ pr_warn("Failed to create the debugfs kmemleak file\n");
mutex_lock(&scan_mutex);
start_scan_thread();
mutex_unlock(&scan_mutex);
diff --git a/mm/ksm.c b/mm/ksm.c
index ca6d2a06a615..4786b4150f62 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -352,13 +352,17 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
- * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
+ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
* put_page(page);
*
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ *
+ * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
+ * of the process that owns 'vma'. We also do not want to enforce
+ * protection keys here anyway.
*/
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
@@ -367,12 +371,14 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
do {
cond_resched();
- page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
+ page = follow_page(vma, addr,
+ FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma->vm_mm, vma, addr,
- FAULT_FLAG_WRITE);
+ FAULT_FLAG_WRITE |
+ FAULT_FLAG_REMOTE);
else
ret = VM_FAULT_WRITE;
put_page(page);
@@ -777,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void)
}
remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+ up_read(&mm->mmap_sem);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -788,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void)
free_mm_slot(mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- up_read(&mm->mmap_sem);
mmdrop(mm);
- } else {
+ } else
spin_unlock(&ksm_mmlist_lock);
- up_read(&mm->mmap_sem);
- }
}
/* Clean up stable nodes, but don't worry if some are still busy */
@@ -1657,8 +1661,15 @@ next_mm:
up_read(&mm->mmap_sem);
mmdrop(mm);
} else {
- spin_unlock(&ksm_mmlist_lock);
up_read(&mm->mmap_sem);
+ /*
+ * up_read(&mm->mmap_sem) first because after
+ * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
+ * already have been freed under us by __ksm_exit()
+ * because the "mm_slot" is still hashed and
+ * ksm_scan.mm_slot doesn't point to it anymore.
+ */
+ spin_unlock(&ksm_mmlist_lock);
}
/* Repeat until we've completed scanning the whole list */
diff --git a/mm/madvise.c b/mm/madvise.c
index a01147359f3b..07427d3fcead 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -170,7 +170,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
vma, index);
if (page)
- page_cache_release(page);
+ put_page(page);
}
return 0;
@@ -204,14 +204,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
page = find_get_entry(mapping, index);
if (!radix_tree_exceptional_entry(page)) {
if (page)
- page_cache_release(page);
+ put_page(page);
continue;
}
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0);
if (page)
- page_cache_release(page);
+ put_page(page);
}
lru_add_drain(); /* Push any new pages onto the LRU now */
diff --git a/mm/memblock.c b/mm/memblock.c
index fc7824fa1b42..b570dddb4cb9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -238,8 +238,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
* so we use WARN_ONCE() here to see the stack trace if
* fail happens.
*/
- WARN_ONCE(1, "memblock: bottom-up allocation failed, "
- "memory hotunplug may be affected\n");
+ WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n");
}
return __memblock_find_range_top_down(start, end, size, align, nid,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 42882c1e7fce..fe787f5c41bd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -207,6 +207,7 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
spinlock_t lock; /* for from, to */
+ struct mm_struct *mm;
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long flags;
@@ -638,9 +639,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid,
- unsigned int lru_mask)
+unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
{
unsigned long nr = 0;
int zid;
@@ -1151,12 +1151,9 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- /* oom_info_lock ensures that parallel ooms do not interleave */
- static DEFINE_MUTEX(oom_info_lock);
struct mem_cgroup *iter;
unsigned int i;
- mutex_lock(&oom_info_lock);
rcu_read_lock();
if (p) {
@@ -1200,7 +1197,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont("\n");
}
- mutex_unlock(&oom_info_lock);
}
/*
@@ -1237,7 +1233,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
return limit;
}
-static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
struct oom_control oc = {
@@ -1315,6 +1311,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
}
unlock:
mutex_unlock(&oom_lock);
+ return chosen;
}
#if MAX_NUMNODES > 1
@@ -2325,9 +2322,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct page_counter *counter;
int ret;
- if (!memcg_kmem_online(memcg))
- return 0;
-
ret = try_charge(memcg, gfp, nr_pages);
if (ret)
return ret;
@@ -2346,10 +2340,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
- int ret;
+ int ret = 0;
memcg = get_mem_cgroup_from_mm(current->mm);
- ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ if (!mem_cgroup_is_root(memcg))
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
css_put(&memcg->css);
return ret;
}
@@ -2719,39 +2714,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
-static unsigned long tree_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx)
+static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
{
struct mem_cgroup *iter;
- unsigned long val = 0;
+ int i;
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_stat(iter, idx);
+ memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
- return val;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += mem_cgroup_read_stat(iter, i);
+ }
}
-static unsigned long tree_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx)
+static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
{
struct mem_cgroup *iter;
- unsigned long val = 0;
+ int i;
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_events(iter, idx);
+ memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
- return val;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < MEMCG_NR_EVENTS; i++)
+ events[i] += mem_cgroup_read_events(iter, i);
+ }
}
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- unsigned long val;
+ unsigned long val = 0;
if (mem_cgroup_is_root(memcg)) {
- val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
- val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
- if (swap)
- val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_RSS);
+ if (swap)
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_SWAP);
+ }
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -2817,6 +2821,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
{
int memcg_id;
+ if (cgroup_memory_nokmem)
+ return 0;
+
BUG_ON(memcg->kmemcg_id >= 0);
BUG_ON(memcg->kmem_state);
@@ -2837,24 +2844,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
return 0;
}
-static int memcg_propagate_kmem(struct mem_cgroup *parent,
- struct mem_cgroup *memcg)
-{
- int ret = 0;
-
- mutex_lock(&memcg_limit_mutex);
- /*
- * If the parent cgroup is not kmem-online now, it cannot be
- * onlined after this point, because it has at least one child
- * already.
- */
- if (memcg_kmem_online(parent) ||
- (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
- ret = memcg_online_kmem(memcg);
- mutex_unlock(&memcg_limit_mutex);
- return ret;
-}
-
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
struct cgroup_subsys_state *css;
@@ -2913,10 +2902,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
}
}
#else
-static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
-{
- return 0;
-}
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
return 0;
@@ -2932,22 +2917,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- int ret = 0;
+ int ret;
mutex_lock(&memcg_limit_mutex);
- /* Top-level cgroup doesn't propagate from root */
- if (!memcg_kmem_online(memcg)) {
- if (cgroup_is_populated(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- ret = -EBUSY;
- if (ret)
- goto out;
- ret = memcg_online_kmem(memcg);
- if (ret)
- goto out;
- }
ret = page_counter_limit(&memcg->kmem, limit);
-out:
mutex_unlock(&memcg_limit_mutex);
return ret;
}
@@ -4198,7 +4171,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
- error = memcg_propagate_kmem(parent, memcg);
+ error = memcg_online_kmem(memcg);
if (error)
goto fail;
@@ -4282,9 +4255,11 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
- mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
- memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
memcg->low = 0;
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4693,6 +4668,8 @@ static void __mem_cgroup_clear_mc(void)
static void mem_cgroup_clear_mc(void)
{
+ struct mm_struct *mm = mc.mm;
+
/*
* we must clear moving_task before waking up waiters at the end of
* task migration.
@@ -4702,7 +4679,10 @@ static void mem_cgroup_clear_mc(void)
spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
+ mc.mm = NULL;
spin_unlock(&mc.lock);
+
+ mmput(mm);
}
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
@@ -4759,6 +4739,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
VM_BUG_ON(mc.moved_swap);
spin_lock(&mc.lock);
+ mc.mm = mm;
mc.from = from;
mc.to = memcg;
mc.flags = move_flags;
@@ -4768,8 +4749,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
ret = mem_cgroup_precharge_mc(mm);
if (ret)
mem_cgroup_clear_mc();
+ } else {
+ mmput(mm);
}
- mmput(mm);
return ret;
}
@@ -4878,11 +4860,11 @@ put: /* get_mctgt_type() gets the page */
return ret;
}
-static void mem_cgroup_move_charge(struct mm_struct *mm)
+static void mem_cgroup_move_charge(void)
{
struct mm_walk mem_cgroup_move_charge_walk = {
.pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mm,
+ .mm = mc.mm,
};
lru_add_drain_all();
@@ -4894,7 +4876,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
atomic_inc(&mc.from->moving_account);
synchronize_rcu();
retry:
- if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
/*
* Someone who are holding the mmap_sem might be waiting in
* waitq. So we cancel all extra charges, wake up all waiters,
@@ -4911,23 +4893,16 @@ retry:
* additional charge, the page walk just aborts.
*/
walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
- up_read(&mm->mmap_sem);
+ up_read(&mc.mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
}
-static void mem_cgroup_move_task(struct cgroup_taskset *tset)
+static void mem_cgroup_move_task(void)
{
- struct cgroup_subsys_state *css;
- struct task_struct *p = cgroup_taskset_first(tset, &css);
- struct mm_struct *mm = get_task_mm(p);
-
- if (mm) {
- if (mc.to)
- mem_cgroup_move_charge(mm);
- mmput(mm);
- }
- if (mc.to)
+ if (mc.to) {
+ mem_cgroup_move_charge();
mem_cgroup_clear_mc();
+ }
}
#else /* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
@@ -4937,7 +4912,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}
-static void mem_cgroup_move_task(struct cgroup_taskset *tset)
+static void mem_cgroup_move_task(void)
{
}
#endif
@@ -5015,6 +4990,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long nr_pages;
unsigned long high;
int err;
@@ -5025,6 +5001,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
memcg->high = high;
+ nr_pages = page_counter_read(&memcg->memory);
+ if (nr_pages > high)
+ try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+ GFP_KERNEL, true);
+
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -5046,6 +5027,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+ bool drained = false;
unsigned long max;
int err;
@@ -5054,9 +5037,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;
- err = mem_cgroup_resize_limit(memcg, max);
- if (err)
- return err;
+ xchg(&memcg->memory.limit, max);
+
+ for (;;) {
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+
+ if (nr_pages <= max)
+ break;
+
+ if (signal_pending(current)) {
+ err = -EINTR;
+ break;
+ }
+
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
+ if (nr_reclaims) {
+ if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
+ GFP_KERNEL, true))
+ nr_reclaims--;
+ continue;
+ }
+
+ mem_cgroup_events(memcg, MEMCG_OOM, 1);
+ if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
+ break;
+ }
memcg_wb_domain_size_changed(memcg);
return nbytes;
@@ -5077,6 +5087,8 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long stat[MEMCG_NR_STAT];
+ unsigned long events[MEMCG_NR_EVENTS];
int i;
/*
@@ -5090,22 +5102,27 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
+ tree_stat(memcg, stat);
+ tree_events(memcg, events);
+
seq_printf(m, "anon %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+ seq_printf(m, "kernel_stack %llu\n",
+ (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
+ seq_printf(m, "slab %llu\n",
+ (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
+ stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
- (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+ (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++) {
struct mem_cgroup *mi;
@@ -5117,12 +5134,17 @@ static int memory_stat_show(struct seq_file *m, void *v)
mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
}
+ seq_printf(m, "slab_reclaimable %llu\n",
+ (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ seq_printf(m, "slab_unreclaimable %llu\n",
+ (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+
/* Accumulated memory events */
seq_printf(m, "pgfault %lu\n",
- tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+ events[MEM_CGROUP_EVENTS_PGFAULT]);
seq_printf(m, "pgmajfault %lu\n",
- tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+ events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
return 0;
}
@@ -5174,7 +5196,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_reset = mem_cgroup_css_reset,
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
- .attach = mem_cgroup_move_task,
+ .post_attach = mem_cgroup_move_task,
.bind = mem_cgroup_bind,
.dfl_cftypes = memory_files,
.legacy_cftypes = mem_cgroup_legacy_files,
@@ -5395,6 +5417,10 @@ static void uncharge_list(struct list_head *page_list)
struct list_head *next;
struct page *page;
+ /*
+ * Note that the list can be a single page->lru; hence the
+ * do-while loop instead of a simple list_for_each_entry().
+ */
next = page_list->next;
do {
unsigned int nr_pages = 1;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 67c30eb993f0..ca5acee53b7a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -184,9 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
struct siginfo si;
int ret;
- printk(KERN_ERR
- "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
+ pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
+ pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
si.si_addr = (void *)addr;
@@ -209,8 +208,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
}
if (ret < 0)
- printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
- t->comm, t->pid, ret);
+ pr_info("MCE: Error sending signal to %s:%d: %d\n",
+ t->comm, t->pid, ret);
return ret;
}
@@ -290,8 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
} else {
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
- printk(KERN_ERR
- "MCE: Out of memory while machine check handling\n");
+ pr_err("MCE: Out of memory while machine check handling\n");
return;
}
}
@@ -336,9 +334,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
* signal and then access the memory. Just kill it.
*/
if (fail || tk->addr_valid == 0) {
- printk(KERN_ERR
- "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
- pfn, tk->tsk->comm, tk->tsk->pid);
+ pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
force_sig(SIGKILL, tk->tsk);
}
@@ -350,9 +347,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
*/
else if (kill_proc(tk->tsk, tk->addr, trapno,
pfn, page, flags) < 0)
- printk(KERN_ERR
- "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
- pfn, tk->tsk->comm, tk->tsk->pid);
+ pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
}
put_task_struct(tk->tsk);
kfree(tk);
@@ -542,7 +538,7 @@ static int delete_from_lru_cache(struct page *p)
/*
* drop the page count elevated by isolate_lru_page()
*/
- page_cache_release(p);
+ put_page(p);
return 0;
}
return -EIO;
@@ -563,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
*/
static int me_unknown(struct page *p, unsigned long pfn)
{
- printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
+ pr_err("MCE %#lx: Unknown page state\n", pfn);
return MF_FAILED;
}
@@ -608,8 +604,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (mapping->a_ops->error_remove_page) {
err = mapping->a_ops->error_remove_page(mapping, p);
if (err != 0) {
- printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
- pfn, err);
+ pr_info("MCE %#lx: Failed to punch page: %d\n",
+ pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
pr_info("MCE %#lx: failed to release buffers\n", pfn);
@@ -624,8 +620,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (invalidate_inode_page(p))
ret = MF_RECOVERED;
else
- printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
- pfn);
+ pr_info("MCE %#lx: Failed to invalidate\n", pfn);
}
return ret;
}
@@ -854,8 +849,7 @@ static int page_action(struct page_state *ps, struct page *p,
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
if (count != 0) {
- printk(KERN_ERR
- "MCE %#lx: %s still referenced by %d users\n",
+ pr_err("MCE %#lx: %s still referenced by %d users\n",
pfn, action_page_types[ps->type], count);
result = MF_FAILED;
}
@@ -894,7 +888,15 @@ int get_hwpoison_page(struct page *page)
}
}
- return get_page_unless_zero(head);
+ if (get_page_unless_zero(head)) {
+ if (head == compound_head(page))
+ return 1;
+
+ pr_info("MCE: %#lx cannot catch tail\n", page_to_pfn(page));
+ put_page(head);
+ }
+
+ return 0;
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
@@ -934,8 +936,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
if (PageSwapCache(p)) {
- printk(KERN_ERR
- "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+ pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn);
ttu |= TTU_IGNORE_HWPOISON;
}
@@ -953,8 +954,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
- printk(KERN_INFO
- "MCE %#lx: corrupted page was clean: dropped without side effects\n",
+ pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
@@ -972,8 +972,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
- printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
+ pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n",
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -1040,16 +1040,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
if (!pfn_valid(pfn)) {
- printk(KERN_ERR
- "MCE %#lx: memory outside kernel control\n",
- pfn);
+ pr_err("MCE %#lx: memory outside kernel control\n", pfn);
return -ENXIO;
}
p = pfn_to_page(pfn);
orig_head = hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
- printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
+ pr_err("MCE %#lx: already hardware poisoned\n", pfn);
return 0;
}
@@ -1180,7 +1178,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* unpoison always clear PG_hwpoison inside page lock
*/
if (!PageHWPoison(p)) {
- printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ pr_err("MCE %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
put_hwpoison_page(hpage);
diff --git a/mm/memory.c b/mm/memory.c
index 0e247642ed5b..07493e34ab7e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -65,6 +65,7 @@
#include <linux/userfaultfd_k.h>
#include <asm/io.h>
+#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -562,8 +563,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
- pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
@@ -661,9 +661,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
return;
}
if (nr_unshown) {
- printk(KERN_ALERT
- "BUG: Bad page map: %lu messages suppressed\n",
- nr_unshown);
+ pr_alert("BUG: Bad page map: %lu messages suppressed\n",
+ nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
@@ -674,15 +673,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
- printk(KERN_ALERT
- "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
- current->comm,
- (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+ current->comm,
+ (long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page, "bad pte");
- printk(KERN_ALERT
- "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
- (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+ pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+ (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
/*
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
@@ -792,6 +789,46 @@ out:
return pfn_to_page(pfn);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t pmd)
+{
+ unsigned long pfn = pmd_pfn(pmd);
+
+ /*
+ * There is no pmd_special() but there may be special pmds, e.g.
+ * in a direct-access (dax) mapping, so let's just replicate the
+ * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+ */
+ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+ if (vma->vm_flags & VM_MIXEDMAP) {
+ if (!pfn_valid(pfn))
+ return NULL;
+ goto out;
+ } else {
+ unsigned long off;
+ off = (addr - vma->vm_start) >> PAGE_SHIFT;
+ if (pfn == vma->vm_pgoff + off)
+ return NULL;
+ if (!is_cow_mapping(vma->vm_flags))
+ return NULL;
+ }
+ }
+
+ if (is_zero_pfn(pfn))
+ return NULL;
+ if (unlikely(pfn > highest_memmap_pfn))
+ return NULL;
+
+ /*
+ * NOTE! We still have PageReserved() pages in the page tables.
+ * eg. VDSO mappings can cause them to exist.
+ */
+out:
+ return pfn_to_page(pfn);
+}
+#endif
+
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
@@ -1105,6 +1142,12 @@ again:
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
+ /*
+ * oom_reaper cannot tear down dirty
+ * pages
+ */
+ if (unlikely(details && details->ignore_dirty))
+ continue;
force_flush = 1;
set_page_dirty(page);
}
@@ -1123,8 +1166,8 @@ again:
}
continue;
}
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
+ /* only check swap_entries if explicitly asked for in details */
+ if (unlikely(details && !details->check_swap_entries))
continue;
entry = pte_to_swp_entry(ptent);
@@ -1179,15 +1222,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
-#ifdef CONFIG_DEBUG_VM
- if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
- pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
- __func__, addr, end,
- vma->vm_start,
- vma->vm_end);
- BUG();
- }
-#endif
+ VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
+ !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
split_huge_pmd(vma, pmd, addr);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
@@ -1229,7 +1265,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
return addr;
}
-static void unmap_page_range(struct mmu_gather *tlb,
+void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details)
@@ -1237,9 +1273,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
pgd_t *pgd;
unsigned long next;
- if (details && !details->check_mapping)
- details = NULL;
-
BUG_ON(addr >= end);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
@@ -2054,7 +2087,7 @@ static inline int wp_page_reuse(struct mm_struct *mm,
VM_BUG_ON_PAGE(PageAnon(page), page);
mapping = page->mapping;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
if ((dirtied || page_mkwrite) && mapping) {
/*
@@ -2188,7 +2221,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
}
if (new_page)
- page_cache_release(new_page);
+ put_page(new_page);
pte_unmap_unlock(page_table, ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -2203,14 +2236,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
munlock_vma_page(old_page);
unlock_page(old_page);
}
- page_cache_release(old_page);
+ put_page(old_page);
}
return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
- page_cache_release(new_page);
+ put_page(new_page);
oom:
if (old_page)
- page_cache_release(old_page);
+ put_page(old_page);
return VM_FAULT_OOM;
}
@@ -2258,7 +2291,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
{
int page_mkwrite = 0;
- page_cache_get(old_page);
+ get_page(old_page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
@@ -2267,7 +2300,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
tmp = do_page_mkwrite(vma, old_page, address);
if (unlikely(!tmp || (tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(old_page);
+ put_page(old_page);
return tmp;
}
/*
@@ -2281,7 +2314,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
- page_cache_release(old_page);
+ put_page(old_page);
return 0;
}
page_mkwrite = 1;
@@ -2340,8 +2373,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
* not dirty accountable.
*/
if (PageAnon(old_page) && !PageKsm(old_page)) {
+ int total_mapcount;
if (!trylock_page(old_page)) {
- page_cache_get(old_page);
+ get_page(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
@@ -2349,18 +2383,23 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
- page_cache_release(old_page);
+ put_page(old_page);
return 0;
}
- page_cache_release(old_page);
+ put_page(old_page);
}
- if (reuse_swap_page(old_page)) {
- /*
- * The page is all ours. Move it to our anon_vma so
- * the rmap code will not search our parent or siblings.
- * Protected against the rmap code by the page lock.
- */
- page_move_anon_rmap(old_page, vma, address);
+ if (reuse_swap_page(old_page, &total_mapcount)) {
+ if (total_mapcount == 1) {
+ /*
+ * The page is all ours. Move it to
+ * our anon_vma so the rmap code will
+ * not search our parent or siblings.
+ * Protected against the rmap code by
+ * the page lock.
+ */
+ page_move_anon_rmap(compound_head(old_page),
+ vma, address);
+ }
unlock_page(old_page);
return wp_page_reuse(mm, vma, address, page_table, ptl,
orig_pte, old_page, 0, 0);
@@ -2375,7 +2414,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Ok, we need to copy. Oh, well..
*/
- page_cache_get(old_page);
+ get_page(old_page);
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
@@ -2400,7 +2439,6 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma) - 1;
- /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
zba = details->first_index;
if (zba < vba)
zba = vba;
@@ -2435,7 +2473,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
- struct zap_details details;
+ struct zap_details details = { };
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -2585,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
inc_mm_counter_fast(mm, MM_ANONPAGES);
dec_mm_counter_fast(mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+ if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
@@ -2619,7 +2657,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
* parallel locked swapcache.
*/
unlock_page(swapcache);
- page_cache_release(swapcache);
+ put_page(swapcache);
}
if (flags & FAULT_FLAG_WRITE) {
@@ -2641,10 +2679,10 @@ out_nomap:
out_page:
unlock_page(page);
out_release:
- page_cache_release(page);
+ put_page(page);
if (page != swapcache) {
unlock_page(swapcache);
- page_cache_release(swapcache);
+ put_page(swapcache);
}
return ret;
}
@@ -2752,7 +2790,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
- page_cache_release(page);
+ put_page(page);
return handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
}
@@ -2771,10 +2809,10 @@ unlock:
return 0;
release:
mem_cgroup_cancel_charge(page, memcg, false);
- page_cache_release(page);
+ put_page(page);
goto unlock;
oom_free_page:
- page_cache_release(page);
+ put_page(page);
oom:
return VM_FAULT_OOM;
}
@@ -2807,7 +2845,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
unlock_page(vmf.page);
- page_cache_release(vmf.page);
+ put_page(vmf.page);
return VM_FAULT_HWPOISON;
}
@@ -2996,7 +3034,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
- page_cache_release(fault_page);
+ put_page(fault_page);
return ret;
}
do_set_pte(vma, address, fault_page, pte, false, false);
@@ -3024,7 +3062,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
- page_cache_release(new_page);
+ put_page(new_page);
return VM_FAULT_OOM;
}
@@ -3041,7 +3079,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
if (fault_page) {
unlock_page(fault_page);
- page_cache_release(fault_page);
+ put_page(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
@@ -3057,7 +3095,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
if (fault_page) {
unlock_page(fault_page);
- page_cache_release(fault_page);
+ put_page(fault_page);
} else {
/*
* The fault handler has no page to lock, so it holds
@@ -3068,7 +3106,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
uncharge_out:
mem_cgroup_cancel_charge(new_page, memcg, false);
- page_cache_release(new_page);
+ put_page(new_page);
return ret;
}
@@ -3096,7 +3134,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
tmp = do_page_mkwrite(vma, fault_page, address);
if (unlikely(!tmp ||
(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(fault_page);
+ put_page(fault_page);
return tmp;
}
}
@@ -3105,7 +3143,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
- page_cache_release(fault_page);
+ put_page(fault_page);
return ret;
}
do_set_pte(vma, address, fault_page, pte, true, false);
@@ -3379,6 +3417,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t *pmd;
pte_t *pte;
+ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE))
+ return VM_FAULT_SIGSEGV;
+
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
@@ -3419,12 +3462,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
- * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * Use pte_alloc() instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(pmd_none(*pmd)) &&
- unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pte_alloc(mm, pmd, address)))
return VM_FAULT_OOM;
/*
* If a huge pmd materialized under us just retry later. Use
@@ -3696,7 +3738,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *maddr;
struct page *page = NULL;
- ret = get_user_pages(tsk, mm, addr, 1,
+ ret = get_user_pages_remote(tsk, mm, addr, 1,
write, 1, &page, &vma);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
@@ -3732,7 +3774,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
buf, maddr + offset, bytes);
}
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
len -= bytes;
buf += bytes;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 24ea06393816..aa34431c3f31 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -33,6 +33,7 @@
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
+#include <linux/compaction.h>
#include <asm/tlbflush.h>
@@ -166,7 +167,7 @@ void get_page_bootmem(unsigned long info, struct page *page,
page->lru.next = (struct list_head *) type;
SetPagePrivate(page);
set_page_private(page, info);
- atomic_inc(&page->_count);
+ page_ref_inc(page);
}
void put_page_bootmem(struct page *page)
@@ -177,7 +178,7 @@ void put_page_bootmem(struct page *page)
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
- if (atomic_dec_return(&page->_count) == 1) {
+ if (page_ref_dec_return(page) == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
@@ -1054,14 +1055,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = pfn_to_nid(pfn);
+ nid = zone_to_nid(zone);
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
- if (ret) {
- memory_notify(MEM_CANCEL_ONLINE, &arg);
- return ret;
- }
+ if (ret)
+ goto failed_addition;
+
/*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
@@ -1079,12 +1079,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
mutex_unlock(&zonelists_mutex);
- printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
- (unsigned long long) pfn << PAGE_SHIFT,
- (((unsigned long long) pfn + nr_pages)
- << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_ONLINE, &arg);
- return ret;
+ goto failed_addition;
}
zone->present_pages += onlined_pages;
@@ -1094,7 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
pgdat_resize_unlock(zone->zone_pgdat, &flags);
if (onlined_pages) {
- node_states_set_node(zone_to_nid(zone), &arg);
+ node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL, NULL);
else
@@ -1105,8 +1100,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
init_per_zone_wmark_min();
- if (onlined_pages)
- kswapd_run(zone_to_nid(zone));
+ if (onlined_pages) {
+ kswapd_run(nid);
+ kcompactd_run(nid);
+ }
vm_total_pages = nr_free_pagecache_pages();
@@ -1115,6 +1112,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
return 0;
+
+failed_addition:
+ pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) pfn << PAGE_SHIFT,
+ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1526,8 +1530,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
} else {
#ifdef CONFIG_DEBUG_VM
- printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
- pfn);
+ pr_alert("removing pfn %lx from LRU failed\n", pfn);
dump_page(page, "failed to remove from LRU");
#endif
put_page(page);
@@ -1855,7 +1858,7 @@ repeat:
ret = -EBUSY;
goto failed_removal;
}
- printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
+ pr_info("Offlined Pages %ld\n", offlined_pages);
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
offline_isolated_pages(start_pfn, end_pfn);
@@ -1880,8 +1883,10 @@ repeat:
zone_pcp_update(zone);
node_states_clear_node(node, &arg);
- if (arg.status_change_nid >= 0)
+ if (arg.status_change_nid >= 0) {
kswapd_stop(node);
+ kcompactd_stop(node);
+ }
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
@@ -1890,9 +1895,9 @@ repeat:
return 0;
failed_removal:
- printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
- (unsigned long long) start_pfn << PAGE_SHIFT,
- ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
+ pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
@@ -1965,8 +1970,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
- pr_warn("removing memory fails, because memory "
- "[%pa-%pa] is onlined\n",
+ pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8cbc74387df3..36cc01bc950a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -846,12 +846,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
}
}
-static int lookup_node(struct mm_struct *mm, unsigned long addr)
+static int lookup_node(unsigned long addr)
{
struct page *p;
int err;
- err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
+ err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
@@ -906,7 +906,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
- err = lookup_node(mm, addr);
+ err = lookup_node(addr);
if (err < 0)
goto out;
*policy = err;
@@ -2559,9 +2559,7 @@ static void __init check_numabalancing_enable(void)
set_numabalancing_state(numabalancing_override == 1);
if (num_online_nodes() > 1 && !numabalancing_override) {
- pr_info("%s automatic NUMA balancing. "
- "Configure with numa_balancing= or the "
- "kernel.numa_balancing sysctl",
+ pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 7924f4f58a6d..9b7a14a791cc 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
-static void kasan_unpoison_element(mempool_t *pool, void *element)
+static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags)
{
if (pool->alloc == mempool_alloc_slab)
- kasan_slab_alloc(pool->pool_data, element);
+ kasan_slab_alloc(pool->pool_data, element, flags);
if (pool->alloc == mempool_kmalloc)
- kasan_krealloc(element, (size_t)pool->pool_data);
+ kasan_krealloc(element, (size_t)pool->pool_data, flags);
if (pool->alloc == mempool_alloc_pages)
kasan_alloc_pages(element, (unsigned long)pool->pool_data);
}
@@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element)
pool->elements[pool->curr_nr++] = element;
}
-static void *remove_element(mempool_t *pool)
+static void *remove_element(mempool_t *pool, gfp_t flags)
{
void *element = pool->elements[--pool->curr_nr];
BUG_ON(pool->curr_nr < 0);
- kasan_unpoison_element(pool, element);
+ kasan_unpoison_element(pool, element, flags);
check_element(pool, element);
return element;
}
@@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool)
return;
while (pool->curr_nr) {
- void *element = remove_element(pool);
+ void *element = remove_element(pool, GFP_KERNEL);
pool->free(element, pool->pool_data);
}
kfree(pool->elements);
@@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
while (new_min_nr < pool->curr_nr) {
- element = remove_element(pool);
+ element = remove_element(pool, GFP_KERNEL);
spin_unlock_irqrestore(&pool->lock, flags);
pool->free(element, pool->pool_data);
spin_lock_irqsave(&pool->lock, flags);
@@ -310,25 +310,36 @@ EXPORT_SYMBOL(mempool_resize);
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
- * Note: using __GFP_ZERO is not supported.
+ * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported.
*/
-void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
wait_queue_t wait;
gfp_t gfp_temp;
+ /* If oom killed, memory reserves are essential to prevent livelock */
+ VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC);
+ /* No element size to zero on allocation */
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
+
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
- gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
repeat_alloc:
+ if (likely(pool->curr_nr)) {
+ /*
+ * Don't allocate from emergency reserves if there are
+ * elements available. This check is racy, but it will
+ * be rechecked each loop.
+ */
+ gfp_temp |= __GFP_NOMEMALLOC;
+ }
element = pool->alloc(gfp_temp, pool->pool_data);
if (likely(element != NULL))
@@ -336,7 +347,7 @@ repeat_alloc:
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
- element = remove_element(pool);
+ element = remove_element(pool, gfp_temp);
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
@@ -352,11 +363,12 @@ repeat_alloc:
* We use gfp mask w/o direct reclaim or IO for the first round. If
* alloc failed with that and @pool was empty, retry immediately.
*/
- if (gfp_temp != gfp_mask) {
+ if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) {
spin_unlock_irqrestore(&pool->lock, flags);
gfp_temp = gfp_mask;
goto repeat_alloc;
}
+ gfp_temp = gfp_mask;
/* We must not sleep if !__GFP_DIRECT_RECLAIM */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 568284ec75d4..f9dfb18a4eba 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -172,7 +172,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
else
page_add_file_rmap(new);
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
/* No need to invalidate - it was non-present before */
@@ -187,14 +187,17 @@ out:
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-static void remove_migration_ptes(struct page *old, struct page *new)
+void remove_migration_ptes(struct page *old, struct page *new, bool locked)
{
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
.arg = old,
};
- rmap_walk(new, &rwc);
+ if (locked)
+ rmap_walk_locked(new, &rwc);
+ else
+ rmap_walk(new, &rwc);
}
/*
@@ -349,7 +352,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
- if (!page_freeze_refs(page, expected_count)) {
+ if (!page_ref_freeze(page, expected_count)) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -363,7 +366,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
if (mode == MIGRATE_ASYNC && head &&
!buffer_migrate_lock_buffers(head, mode)) {
- page_unfreeze_refs(page, expected_count);
+ page_ref_unfreeze(page, expected_count);
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -397,7 +400,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
* to one less reference.
* We know this isn't the last reference.
*/
- page_unfreeze_refs(page, expected_count - 1);
+ page_ref_unfreeze(page, expected_count - 1);
spin_unlock(&mapping->tree_lock);
/* Leave irq disabled to prevent preemption while updating stats */
@@ -451,7 +454,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
- if (!page_freeze_refs(page, expected_count)) {
+ if (!page_ref_freeze(page, expected_count)) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -463,7 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
radix_tree_replace_slot(pslot, newpage);
- page_unfreeze_refs(page, expected_count - 1);
+ page_ref_unfreeze(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
@@ -702,7 +705,7 @@ static int writeout(struct address_space *mapping, struct page *page)
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(page, page);
+ remove_migration_ptes(page, page, false);
rc = mapping->a_ops->writepage(page, &wbc);
@@ -900,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (page_was_mapped)
remove_migration_ptes(page,
- rc == MIGRATEPAGE_SUCCESS ? newpage : page);
+ rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
out_unlock_both:
unlock_page(newpage);
@@ -972,7 +975,13 @@ out:
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
/* Soft-offlined page shouldn't go through lru cache list */
- if (reason == MR_MEMORY_FAILURE) {
+ if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) {
+ /*
+ * With this release, we free successfully migrated
+ * page and set PG_HWPoison on just freed page
+ * intentionally. Although it's rather weird, it's how
+ * HWPoison flag works at the moment.
+ */
put_page(page);
if (!test_set_page_hwpoison(page))
num_poisoned_pages_inc();
@@ -1070,7 +1079,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_was_mapped)
remove_migration_ptes(hpage,
- rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage);
+ rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
unlock_page(new_hpage);
@@ -1773,7 +1782,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
put_page(new_page);
goto out_fail;
}
-
+ /*
+ * We are not sure a pending tlb flush here is for a huge page
+ * mapping or not. Hence use the tlb range variant
+ */
if (mm_tlb_flush_pending(mm))
flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1829,12 +1841,11 @@ fail_putback:
page_add_anon_rmap(new_page, vma, mmun_start, true);
pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
- flush_tlb_range(vma, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
if (page_count(page) != 2) {
set_pmd_at(mm, mmun_start, pmd, orig_entry);
- flush_tlb_range(vma, mmun_start, mmun_end);
+ flush_pmd_tlb_range(vma, mmun_start, mmun_end);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(new_page, true);
diff --git a/mm/mincore.c b/mm/mincore.c
index 563f32045490..c0b5ba965200 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
#endif
if (page) {
present = PageUptodate(page);
- page_cache_release(page);
+ put_page(page);
}
return present;
@@ -211,7 +211,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
* return values:
* zero - success
* -EFAULT - vec points to an illegal address
- * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
+ * -EINVAL - addr is not a multiple of PAGE_SIZE
* -ENOMEM - Addresses in the range [addr, addr + len] are
* invalid for the address space of this process, or
* specify one or more pages which are not currently
@@ -226,14 +226,14 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
unsigned char *tmp;
/* Check the start address: needs to be page-aligned.. */
- if (start & ~PAGE_CACHE_MASK)
+ if (start & ~PAGE_MASK)
return -EINVAL;
/* ..and we need to be passed a valid user-space range */
if (!access_ok(VERIFY_READ, (void __user *) start, len))
return -ENOMEM;
- /* This also avoids any overflows on PAGE_CACHE_ALIGN */
+ /* This also avoids any overflows on PAGE_ALIGN */
pages = len >> PAGE_SHIFT;
pages += (offset_in_page(len)) != 0;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index fdadf918de76..5b72266b4b03 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -55,13 +55,12 @@ void __init mminit_verify_zonelist(void)
/* Iterate the zonelist */
for_each_zone_zonelist(zone, z, zonelist, zoneid) {
#ifdef CONFIG_NUMA
- printk(KERN_CONT "%d:%s ",
- zone->node, zone->name);
+ pr_cont("%d:%s ", zone->node, zone->name);
#else
- printk(KERN_CONT "0:%s ", zone->name);
+ pr_cont("0:%s ", zone->name);
#endif /* CONFIG_NUMA */
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
}
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 90e3b869a8b9..bd2e1a533bc1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -37,12 +37,12 @@
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
-#include <linux/sched/sysctl.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
+#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -123,130 +123,6 @@ void vma_set_page_prot(struct vm_area_struct *vma)
}
}
-
-int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
-unsigned long sysctl_overcommit_kbytes __read_mostly;
-int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
-unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-/*
- * Make sure vm_committed_as in one cacheline and not cacheline shared with
- * other variables. It can be updated by several CPUs frequently.
- */
-struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
-
-/*
- * The global memory commitment made in the system can be a metric
- * that can be used to drive ballooning decisions when Linux is hosted
- * as a guest. On Hyper-V, the host implements a policy engine for dynamically
- * balancing memory across competing virtual machines that are hosted.
- * Several metrics drive this policy engine including the guest reported
- * memory commitment.
- */
-unsigned long vm_memory_committed(void)
-{
- return percpu_counter_read_positive(&vm_committed_as);
-}
-EXPORT_SYMBOL_GPL(vm_memory_committed);
-
-/*
- * Check that a process has enough memory to allocate a new virtual
- * mapping. 0 means there is enough memory for the allocation to
- * succeed and -ENOMEM implies there is not.
- *
- * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
- *
- * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
- * Additional code 2002 Jul 20 by Robert Love.
- *
- * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
- *
- * Note this is a helper function intended to be used by LSMs which
- * wish to use this logic.
- */
-int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
-{
- long free, allowed, reserve;
-
- VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
- -(s64)vm_committed_as_batch * num_online_cpus(),
- "memory commitment underflow");
-
- vm_acct_memory(pages);
-
- /*
- * Sometimes we want to use more memory than we have
- */
- if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
- return 0;
-
- if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_page_state(NR_FREE_PAGES);
- free += global_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
- goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
- }
-
- allowed = vm_commit_limit();
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- /*
- * Don't let a single process grow so big a user can't recover
- */
- if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min_t(long, mm->total_vm / 32, reserve);
- }
-
- if (percpu_counter_read_positive(&vm_committed_as) < allowed)
- return 0;
-error:
- vm_unacct_memory(pages);
-
- return -ENOMEM;
-}
-
/*
* Requires inode->i_mapping->i_mmap_rwsem
*/
@@ -1270,6 +1146,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long pgoff, unsigned long *populate)
{
struct mm_struct *mm = current->mm;
+ int pkey = 0;
*populate = 0;
@@ -1309,11 +1186,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (offset_in_page(addr))
return addr;
+ if (prot == PROT_EXEC) {
+ pkey = execute_only_pkey(mm);
+ if (pkey < 0)
+ pkey = 0;
+ }
+
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
@@ -2642,9 +2525,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long ret = -EINVAL;
struct file *file;
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
- "See Documentation/vm/remap_file_pages.txt.\n",
- current->comm, current->pid);
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n",
+ current->comm, current->pid);
if (prot)
return ret;
@@ -3010,8 +2892,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
if (is_data_mapping(flags) &&
mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
if (ignore_rlimit_data)
- pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
- "%lu. Will be forbidden soon.\n",
+ pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n",
current->comm, current->pid,
(mm->data_vm + npages) << PAGE_SHIFT,
rlimit(RLIMIT_DATA));
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5fbdd367bbed..f4259e496f83 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter <cl@linux.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
diff --git a/mm/mprotect.c b/mm/mprotect.c
index f7cb3d4d9c2e..b650c5412f58 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -24,6 +24,7 @@
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
+#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -354,10 +355,13 @@ fail:
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
{
- unsigned long vm_flags, nstart, end, tmp, reqprot;
+ unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
int error = -EINVAL;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
+ const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
+ (prot & PROT_READ);
+
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
return -EINVAL;
@@ -374,13 +378,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
return -EINVAL;
reqprot = prot;
- /*
- * Does the application expect PROT_READ to imply PROT_EXEC:
- */
- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- prot |= PROT_EXEC;
-
- vm_flags = calc_vm_prot_bits(prot);
down_write(&current->mm->mmap_sem);
@@ -411,10 +408,15 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
for (nstart = start ; ; ) {
unsigned long newflags;
+ int pkey = arch_override_mprotect_pkey(vma, prot, -1);
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- newflags = vm_flags;
+ /* Does the application expect PROT_READ to imply PROT_EXEC */
+ if (rier && (vma->vm_flags & VM_MAYEXEC))
+ prot |= PROT_EXEC;
+
+ newflags = calc_vm_prot_bits(prot, pkey);
newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
@@ -445,6 +447,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = -ENOMEM;
goto out;
}
+ prot = reqprot;
}
out:
up_write(&current->mm->mmap_sem);
diff --git a/mm/mremap.c b/mm/mremap.c
index 8eeba02fc991..3fa0a467df66 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -20,7 +20,6 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
-#include <linux/sched/sysctl.h>
#include <linux/uaccess.h>
#include <linux/mm-arch-hooks.h>
@@ -214,8 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
continue;
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
- if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
- new_pmd, new_addr))
+ if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 99feb2b07fc5..bd05a70f44b9 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -288,7 +288,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
/*
* Whoops, we cannot satisfy the allocation request.
*/
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
@@ -360,7 +360,7 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
if (ptr)
return ptr;
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index fbf6f0f1d6c9..c8bd59a03c71 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -33,7 +33,6 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
-#include <linux/sched/sysctl.h>
#include <linux/printk.h>
#include <asm/uaccess.h>
@@ -48,33 +47,11 @@ struct page *mem_map;
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);
unsigned long highest_memmap_pfn;
-struct percpu_counter vm_committed_as;
-int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio = 50; /* default is 50% */
-unsigned long sysctl_overcommit_kbytes __read_mostly;
-int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
-unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
-unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
-/*
- * The global memory commitment made in the system can be a metric
- * that can be used to drive ballooning decisions when Linux is hosted
- * as a guest. On Hyper-V, the host implements a policy engine for dynamically
- * balancing memory across competing virtual machines that are hosted.
- * Several metrics drive this policy engine including the guest reported
- * memory commitment.
- */
-unsigned long vm_memory_committed(void)
-{
- return percpu_counter_read_positive(&vm_committed_as);
-}
-
-EXPORT_SYMBOL_GPL(vm_memory_committed);
-
EXPORT_SYMBOL(mem_map);
/* list of mapped, potentially shareable regions */
@@ -162,7 +139,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (pages) {
pages[i] = virt_to_page(start);
if (pages[i])
- page_cache_get(pages[i]);
+ get_page(pages[i]);
}
if (vmas)
vmas[i] = vma;
@@ -182,8 +159,7 @@ finish_or_fault:
* slab page or a secondary page from a compound page
* - don't permit access to VMAs that don't support it, such as I/O mappings
*/
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas)
{
@@ -194,18 +170,16 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
- NULL);
+ return __get_user_pages(current, current->mm, start, nr_pages, flags,
+ pages, vmas, NULL);
}
EXPORT_SYMBOL(get_user_pages);
-long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- int write, int force, struct page **pages,
- int *locked)
+long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked)
{
- return get_user_pages(tsk, mm, start, nr_pages, write, force,
- pages, NULL);
+ return get_user_pages(start, nr_pages, write, force, pages, NULL);
}
EXPORT_SYMBOL(get_user_pages_locked);
@@ -216,19 +190,18 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
{
long ret;
down_read(&mm->mmap_sem);
- ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
- pages, NULL);
+ ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages,
+ NULL, NULL);
up_read(&mm->mmap_sem);
return ret;
}
EXPORT_SYMBOL(__get_user_pages_unlocked);
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
- return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
- force, pages, 0);
+ return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
+ write, force, pages, 0);
}
EXPORT_SYMBOL(get_user_pages_unlocked);
@@ -1084,7 +1057,7 @@ static unsigned long determine_vm_flags(struct file *file,
{
unsigned long vm_flags;
- vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
+ vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
/* vm_flags |= mm->def_flags; */
if (!(capabilities & NOMMU_MAP_DIRECT)) {
@@ -1829,100 +1802,6 @@ void unmap_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL(unmap_mapping_range);
-/*
- * Check that a process has enough memory to allocate a new virtual
- * mapping. 0 means there is enough memory for the allocation to
- * succeed and -ENOMEM implies there is not.
- *
- * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
- *
- * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
- * Additional code 2002 Jul 20 by Robert Love.
- *
- * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
- *
- * Note this is a helper function intended to be used by LSMs which
- * wish to use this logic.
- */
-int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
-{
- long free, allowed, reserve;
-
- vm_acct_memory(pages);
-
- /*
- * Sometimes we want to use more memory than we have
- */
- if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
- return 0;
-
- if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_page_state(NR_FREE_PAGES);
- free += global_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
- goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
- }
-
- allowed = vm_commit_limit();
- /*
- * Reserve some 3% for root
- */
- if (!cap_sys_admin)
- allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- /*
- * Don't let a single process grow so big a user can't recover
- */
- if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min_t(long, mm->total_vm / 32, reserve);
- }
-
- if (percpu_counter_read_positive(&vm_committed_as) < allowed)
- return 0;
-
-error:
- vm_unacct_memory(pages);
-
- return -ENOMEM;
-}
-
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e97a05d9621f..86349586eacb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,11 @@
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+
+#include <asm/tlb.h>
+#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
@@ -287,9 +292,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && !is_sysrq_oom(oc))
- return OOM_SCAN_ABORT;
-
return OOM_SCAN_OK;
}
@@ -386,8 +388,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_header(struct oom_control *oc, struct task_struct *p,
struct mem_cgroup *memcg)
{
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, "
- "oom_score_adj=%hd\n",
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
@@ -409,6 +410,176 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
bool oom_killer_disabled __read_mostly;
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+static struct task_struct *oom_reaper_th;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+static struct task_struct *oom_reaper_list;
+static DEFINE_SPINLOCK(oom_reaper_lock);
+
+
+static bool __oom_reap_task(struct task_struct *tsk)
+{
+ struct mmu_gather tlb;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct task_struct *p;
+ struct zap_details details = {.check_swap_entries = true,
+ .ignore_dirty = true};
+ bool ret = true;
+
+ /*
+ * Make sure we find the associated mm_struct even when the particular
+ * thread has already terminated and cleared its mm.
+ * We might have race with exit path so consider our work done if there
+ * is no mm.
+ */
+ p = find_lock_task_mm(tsk);
+ if (!p)
+ return true;
+
+ mm = p->mm;
+ if (!atomic_inc_not_zero(&mm->mm_users)) {
+ task_unlock(p);
+ return true;
+ }
+
+ task_unlock(p);
+
+ if (!down_read_trylock(&mm->mmap_sem)) {
+ ret = false;
+ goto out;
+ }
+
+ tlb_gather_mmu(&tlb, mm, 0, -1);
+ for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ if (is_vm_hugetlb_page(vma))
+ continue;
+
+ /*
+ * mlocked VMAs require explicit munlocking before unmap.
+ * Let's keep it simple here and skip such VMAs.
+ */
+ if (vma->vm_flags & VM_LOCKED)
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance to be dropped
+ * without additional steps which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs backed pages because all
+ * which are reclaimable have already been reclaimed and
+ * we do not want to block exit_mmap by keeping mm ref
+ * count elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+ unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+ &details);
+ }
+ tlb_finish_mmu(&tlb, 0, -1);
+ pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+ task_pid_nr(tsk), tsk->comm,
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)));
+ up_read(&mm->mmap_sem);
+
+ /*
+ * Clear TIF_MEMDIE because the task shouldn't be sitting on a
+ * reasonably reclaimable memory anymore. OOM killer can continue
+ * by selecting other victim if unmapping hasn't led to any
+ * improvements. This also means that selecting this task doesn't
+ * make any sense.
+ */
+ tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
+ exit_oom_victim(tsk);
+out:
+ mmput(mm);
+ return ret;
+}
+
+#define MAX_OOM_REAP_RETRIES 10
+static void oom_reap_task(struct task_struct *tsk)
+{
+ int attempts = 0;
+
+ /* Retry the down_read_trylock(mmap_sem) a few times */
+ while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
+ schedule_timeout_idle(HZ/10);
+
+ if (attempts > MAX_OOM_REAP_RETRIES) {
+ pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+ task_pid_nr(tsk), tsk->comm);
+ debug_show_all_locks();
+ }
+
+ /* Drop a reference taken by wake_oom_reaper */
+ put_task_struct(tsk);
+}
+
+static int oom_reaper(void *unused)
+{
+ set_freezable();
+
+ while (true) {
+ struct task_struct *tsk = NULL;
+
+ wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
+ spin_lock(&oom_reaper_lock);
+ if (oom_reaper_list != NULL) {
+ tsk = oom_reaper_list;
+ oom_reaper_list = tsk->oom_reaper_list;
+ }
+ spin_unlock(&oom_reaper_lock);
+
+ if (tsk)
+ oom_reap_task(tsk);
+ }
+
+ return 0;
+}
+
+static void wake_oom_reaper(struct task_struct *tsk)
+{
+ if (!oom_reaper_th)
+ return;
+
+ /* tsk is already queued? */
+ if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+ return;
+
+ get_task_struct(tsk);
+
+ spin_lock(&oom_reaper_lock);
+ tsk->oom_reaper_list = oom_reaper_list;
+ oom_reaper_list = tsk;
+ spin_unlock(&oom_reaper_lock);
+ wake_up(&oom_reaper_wait);
+}
+
+static int __init oom_init(void)
+{
+ oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+ if (IS_ERR(oom_reaper_th)) {
+ pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+ PTR_ERR(oom_reaper_th));
+ oom_reaper_th = NULL;
+ }
+ return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif
+
/**
* mark_oom_victim - mark the given task as OOM victim
* @tsk: task to mark
@@ -435,9 +606,10 @@ void mark_oom_victim(struct task_struct *tsk)
/**
* exit_oom_victim - note the exit of an OOM victim
*/
-void exit_oom_victim(void)
+void exit_oom_victim(struct task_struct *tsk)
{
- clear_thread_flag(TIF_MEMDIE);
+ if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
+ return;
if (!atomic_dec_return(&oom_victims))
wake_up_all(&oom_victims_wait);
@@ -459,15 +631,11 @@ void exit_oom_victim(void)
bool oom_killer_disable(void)
{
/*
- * Make sure to not race with an ongoing OOM killer
- * and that the current is not the victim.
+ * Make sure to not race with an ongoing OOM killer. Check that the
+ * current is not killed (possibly due to sharing the victim's memory).
*/
- mutex_lock(&oom_lock);
- if (test_thread_flag(TIF_MEMDIE)) {
- mutex_unlock(&oom_lock);
+ if (mutex_lock_killable(&oom_lock))
return false;
- }
-
oom_killer_disabled = true;
mutex_unlock(&oom_lock);
@@ -502,7 +670,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
return false;
}
-#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
* returning.
@@ -518,6 +685,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
+ bool can_oom_reap = true;
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -608,17 +776,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
continue;
if (same_thread_group(p, victim))
continue;
- if (unlikely(p->flags & PF_KTHREAD))
- continue;
- if (is_global_init(p))
- continue;
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+ p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ /*
+ * We cannot use oom_reaper for the mm shared by this
+ * process because it wouldn't get killed and so the
+ * memory might be still used.
+ */
+ can_oom_reap = false;
continue;
-
+ }
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
}
rcu_read_unlock();
+ if (can_oom_reap)
+ wake_oom_reaper(victim);
+
mmdrop(mm);
put_task_struct(victim);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 11ff8f758631..bc5149d5ec38 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1910,7 +1910,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
if (gdtc->dirty > gdtc->bg_thresh)
return true;
- if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc))
+ if (wb_stat(wb, WB_RECLAIMABLE) >
+ wb_calc_thresh(gdtc->wb, gdtc->bg_thresh))
return true;
if (mdtc) {
@@ -1924,7 +1925,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
if (mdtc->dirty > mdtc->bg_thresh)
return true;
- if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc))
+ if (wb_stat(wb, WB_RECLAIMABLE) >
+ wb_calc_thresh(mdtc->wb, mdtc->bg_thresh))
return true;
}
@@ -2176,8 +2178,8 @@ int write_cache_pages(struct address_space *mapping,
cycled = 0;
end = -1;
} else {
- index = wbc->range_start >> PAGE_CACHE_SHIFT;
- end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
cycled = 1; /* ignore range_cyclic tests */
@@ -2382,14 +2384,14 @@ int write_one_page(struct page *page, int wait)
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
- page_cache_get(page);
+ get_page(page);
ret = mapping->a_ops->writepage(page, &wbc);
if (ret == 0 && wait) {
wait_on_page_writeback(page);
if (PageError(page))
ret = -EIO;
}
- page_cache_release(page);
+ put_page(page);
} else {
unlock_page(page);
}
@@ -2431,7 +2433,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
__inc_zone_page_state(page, NR_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE);
__inc_wb_stat(wb, WB_DIRTIED);
- task_io_account_write(PAGE_CACHE_SIZE);
+ task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
}
@@ -2450,7 +2452,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
- task_io_account_cancelled_write(PAGE_CACHE_SIZE);
+ task_io_account_cancelled_write(PAGE_SIZE);
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c46b75d14b6f..c1069efcc4d7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -249,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+int watermark_scale_factor = 10;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -307,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
{
+ unsigned long max_initialise;
+
/* Always populate low zones for address-contrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
+ /*
+ * Initialise at least 2G of a node but also take into account that
+ * two large system hashes that can take up 1GB for 0.25TB/node.
+ */
+ max_initialise = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
- /* Initialise at least 2G of the highest zone */
(*nr_initialised)++;
- if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+ if ((*nr_initialised > max_initialise) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -498,6 +506,7 @@ void prep_compound_page(struct page *page, unsigned int order)
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly
= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
bool _debug_guardpage_enabled __read_mostly;
static int __init early_debug_pagealloc(char *buf)
@@ -542,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf)
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
- printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+ pr_err("Bad debug_guardpage_minorder value\n");
return 0;
}
_debug_guardpage_minorder = res;
- printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+ pr_info("Setting debug_guardpage_minorder to %lu\n", res);
return 0;
}
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
@@ -683,34 +692,28 @@ static inline void __free_one_page(struct page *page,
unsigned long combined_idx;
unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
- unsigned int max_order = MAX_ORDER;
+ unsigned int max_order;
+
+ max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
- if (is_migrate_isolate(migratetype)) {
- /*
- * We restrict max order of merging to prevent merge
- * between freepages on isolate pageblock and normal
- * pageblock. Without this, pageblock isolation
- * could cause incorrect freepage accounting.
- */
- max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
- } else {
+ if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
- }
- page_idx = pfn & ((1 << max_order) - 1);
+ page_idx = pfn & ((1 << MAX_ORDER) - 1);
VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
+continue_merging:
while (order < max_order - 1) {
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
- break;
+ goto done_merging;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
@@ -727,6 +730,32 @@ static inline void __free_one_page(struct page *page,
page_idx = combined_idx;
order++;
}
+ if (max_order < MAX_ORDER) {
+ /* If we are here, it means order is >= pageblock_order.
+ * We want to prevent merge between freepages on isolate
+ * pageblock and normal pageblock. Without this, pageblock
+ * isolation could cause incorrect freepage or CMA accounting.
+ *
+ * We don't want to hit this code for the more frequent
+ * low-order merging.
+ */
+ if (unlikely(has_isolate_pageblock(zone))) {
+ int buddy_mt;
+
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
+ buddy_mt = get_pageblock_migratetype(buddy);
+
+ if (migratetype != buddy_mt
+ && (is_migrate_isolate(migratetype) ||
+ is_migrate_isolate(buddy_mt)))
+ goto done_merging;
+ }
+ max_order++;
+ goto continue_merging;
+ }
+
+done_merging:
set_page_order(page, order);
/*
@@ -764,7 +793,7 @@ static inline int free_pages_check(struct page *page)
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
+ if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _count";
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
@@ -1460,7 +1489,7 @@ static inline int check_new_page(struct page *page)
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
+ if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _count";
if (unlikely(page->flags & __PG_HWPOISON)) {
bad_reason = "HWPoisoned (hardware-corrupted)";
@@ -2348,19 +2377,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
list_del(&page->lru);
pcp->count--;
} else {
- if (unlikely(gfp_flags & __GFP_NOFAIL)) {
- /*
- * __GFP_NOFAIL is not to be used in new code.
- *
- * All __GFP_NOFAIL callers should be fixed so that they
- * properly detect and handle allocation failures.
- *
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with
- * __GFP_NOFAIL.
- */
- WARN_ON_ONCE(order > 1);
- }
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
page = NULL;
@@ -2857,8 +2878,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* XXX: Page reclaim didn't yield anything,
* and the OOM killer can't be invoked, but
* keep looping as per tradition.
+ *
+ * But do not keep looping if oom_killer_disable()
+ * was already called, for the system is trying to
+ * enter a quiescent state during suspend.
*/
- *did_some_progress = 1;
+ *did_some_progress = !oom_killer_disabled;
goto out;
}
if (pm_suspended_storage())
@@ -3117,14 +3142,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
- /*
- * If this allocation cannot block and it is for a specific node, then
- * fail early. There's no need to wakeup kswapd or retry for a
- * speculative node-specific allocation.
- */
- if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
- goto nopage;
-
retry:
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
@@ -3481,7 +3498,7 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- atomic_add(size - 1, &page->_count);
+ page_ref_add(page, size - 1);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
@@ -3493,7 +3510,7 @@ refill:
if (unlikely(offset < 0)) {
page = virt_to_page(nc->va);
- if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+ if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -3501,7 +3518,7 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- atomic_set(&page->_count, size);
+ set_page_count(page, size);
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = size;
@@ -3712,6 +3729,49 @@ static inline void show_node(struct zone *zone)
printk("Node %d ", zone_to_nid(zone));
}
+long si_mem_available(void)
+{
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
+ unsigned long pages[NR_LRU_LISTS];
+ struct zone *zone;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+ for_each_zone(zone)
+ wmark_low += zone->watermark[WMARK_LOW];
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping.
+ */
+ available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping. Assume at least half of the page cache, or the
+ * low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab consists of items that are in use,
+ * and cannot be freed. Cap this estimate at the low watermark.
+ */
+ available += global_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+ return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
@@ -4044,9 +4104,7 @@ static int __parse_numa_zonelist_order(char *s)
} else if (*s == 'z' || *s == 'Z') {
user_zonelist_order = ZONELIST_ORDER_ZONE;
} else {
- printk(KERN_WARNING
- "Ignoring invalid numa_zonelist_order value: "
- "%s\n", s);
+ pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
@@ -4510,12 +4568,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists in %s order, mobility grouping %s. "
- "Total pages: %ld\n",
- nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
- page_group_by_mobility_disabled ? "off" : "on",
- vm_total_pages);
+ pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+ nr_online_nodes,
+ zonelist_order_name[current_zonelist_order],
+ page_group_by_mobility_disabled ? "off" : "on",
+ vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
@@ -5404,6 +5461,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pgdat->kcompactd_wait);
+#endif
pgdat_page_ext_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -5428,8 +5488,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
- printk(KERN_WARNING
- " %s zone: %lu pages exceeds freesize %lu\n",
+ pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
}
@@ -5637,8 +5696,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
min_pfn = min(min_pfn, start_pfn);
if (min_pfn == ULONG_MAX) {
- printk(KERN_WARNING
- "Could not find start_pfn for node %d\n", nid);
+ pr_warn("Could not find start_pfn for node %d\n", nid);
return 0;
}
@@ -6110,22 +6168,21 @@ void __init mem_init_print_info(const char *str)
#undef adj_init_size
- pr_info("Memory: %luK/%luK available "
- "(%luK kernel code, %luK rwdata, %luK rodata, "
- "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
+ pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
- ", %luK highmem"
+ ", %luK highmem"
#endif
- "%s%s)\n",
- nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
- codesize >> 10, datasize >> 10, rosize >> 10,
- (init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
- totalcma_pages << (PAGE_SHIFT-10),
+ "%s%s)\n",
+ nr_free_pages() << (PAGE_SHIFT - 10),
+ physpages << (PAGE_SHIFT - 10),
+ codesize >> 10, datasize >> 10, rosize >> 10,
+ (init_data_size + init_code_size) >> 10, bss_size >> 10,
+ (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+ totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
- totalhigh_pages << (PAGE_SHIFT-10),
+ totalhigh_pages << (PAGE_SHIFT - 10),
#endif
- str ? ", " : "", str ? str : "");
+ str ? ", " : "", str ? str : "");
}
/**
@@ -6300,8 +6357,17 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_MIN] = tmp;
}
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+ /*
+ * Set the kswapd watermarks distance according to the
+ * scale factor in proportion to available memory, but
+ * ensure a minimum size on small systems.
+ */
+ tmp = max_t(u64, tmp >> 2,
+ mult_frac(zone->managed_pages,
+ watermark_scale_factor, 10000));
+
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -6419,7 +6485,7 @@ int __meminit init_per_zone_wmark_min(void)
setup_per_zone_inactive_ratio();
return 0;
}
-module_init(init_per_zone_wmark_min)
+core_initcall(init_per_zone_wmark_min)
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -6442,6 +6508,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write)
+ setup_per_zone_wmarks();
+
+ return 0;
+}
+
#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
@@ -6633,11 +6714,8 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
- tablename,
- (1UL << log2qty),
- ilog2(size) - PAGE_SHIFT,
- size);
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
if (_hash_shift)
*_hash_shift = log2qty;
@@ -6788,7 +6866,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* This check already skips compound tails of THP
* because their page->_count is zero at all time.
*/
- if (!atomic_read(&page->_count)) {
+ if (!page_ref_count(page)) {
if (PageBuddy(page))
iter += (1 << page_order(page)) - 1;
continue;
@@ -7138,8 +7216,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
#ifdef CONFIG_DEBUG_VM
- printk(KERN_INFO "remove from free list %lx %d %lx\n",
- pfn, 1 << order, end_pfn);
+ pr_info("remove from free list %lx %d %lx\n",
+ pfn, 1 << order, end_pfn);
#endif
list_del(&page->lru);
rmv_page_order(page);
@@ -7152,7 +7230,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
}
#endif
-#ifdef CONFIG_MEMORY_FAILURE
bool is_free_buddy_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -7171,4 +7248,3 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
-#endif
diff --git a/mm/page_io.c b/mm/page_io.c
index b995a5ba5e8f..985f23cfa79b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -56,31 +56,20 @@ void end_swap_bio_write(struct bio *bio)
* Also clear PG_reclaim to avoid rotate_reclaimable_page()
*/
set_page_dirty(page);
- printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
+ imajor(bio->bi_bdev->bd_inode),
+ iminor(bio->bi_bdev->bd_inode),
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
bio_put(bio);
}
-static void end_swap_bio_read(struct bio *bio)
+static void swap_slot_free_notify(struct page *page)
{
- struct page *page = bio->bi_io_vec[0].bv_page;
-
- if (bio->bi_error) {
- SetPageError(page);
- ClearPageUptodate(page);
- printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
- goto out;
- }
-
- SetPageUptodate(page);
+ struct swap_info_struct *sis;
+ struct gendisk *disk;
/*
* There is no guarantee that the page is in swap cache - the software
@@ -88,42 +77,59 @@ static void end_swap_bio_read(struct bio *bio)
* swapcache page. So we must check PG_swapcache before proceeding with
* this optimization.
*/
- if (likely(PageSwapCache(page))) {
- struct swap_info_struct *sis;
+ if (unlikely(!PageSwapCache(page)))
+ return;
- sis = page_swap_info(page);
- if (sis->flags & SWP_BLKDEV) {
- /*
- * The swap subsystem performs lazy swap slot freeing,
- * expecting that the page will be swapped out again.
- * So we can avoid an unnecessary write if the page
- * isn't redirtied.
- * This is good for real swap storage because we can
- * reduce unnecessary I/O and enhance wear-leveling
- * if an SSD is used as the as swap device.
- * But if in-memory swap device (eg zram) is used,
- * this causes a duplicated copy between uncompressed
- * data in VM-owned memory and compressed data in
- * zram-owned memory. So let's free zram-owned memory
- * and make the VM-owned decompressed page *dirty*,
- * so the page should be swapped out somewhere again if
- * we again wish to reclaim it.
- */
- struct gendisk *disk = sis->bdev->bd_disk;
- if (disk->fops->swap_slot_free_notify) {
- swp_entry_t entry;
- unsigned long offset;
+ sis = page_swap_info(page);
+ if (!(sis->flags & SWP_BLKDEV))
+ return;
- entry.val = page_private(page);
- offset = swp_offset(entry);
+ /*
+ * The swap subsystem performs lazy swap slot freeing,
+ * expecting that the page will be swapped out again.
+ * So we can avoid an unnecessary write if the page
+ * isn't redirtied.
+ * This is good for real swap storage because we can
+ * reduce unnecessary I/O and enhance wear-leveling
+ * if an SSD is used as the as swap device.
+ * But if in-memory swap device (eg zram) is used,
+ * this causes a duplicated copy between uncompressed
+ * data in VM-owned memory and compressed data in
+ * zram-owned memory. So let's free zram-owned memory
+ * and make the VM-owned decompressed page *dirty*,
+ * so the page should be swapped out somewhere again if
+ * we again wish to reclaim it.
+ */
+ disk = sis->bdev->bd_disk;
+ if (disk->fops->swap_slot_free_notify) {
+ swp_entry_t entry;
+ unsigned long offset;
- SetPageDirty(page);
- disk->fops->swap_slot_free_notify(sis->bdev,
- offset);
- }
- }
+ entry.val = page_private(page);
+ offset = swp_offset(entry);
+
+ SetPageDirty(page);
+ disk->fops->swap_slot_free_notify(sis->bdev,
+ offset);
}
+}
+
+static void end_swap_bio_read(struct bio *bio)
+{
+ struct page *page = bio->bi_io_vec[0].bv_page;
+ if (bio->bi_error) {
+ SetPageError(page);
+ ClearPageUptodate(page);
+ pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
+ imajor(bio->bi_bdev->bd_inode),
+ iminor(bio->bi_bdev->bd_inode),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ goto out;
+ }
+
+ SetPageUptodate(page);
+ swap_slot_free_notify(page);
out:
unlock_page(page);
bio_put(bio);
@@ -216,7 +222,7 @@ reprobe:
out:
return ret;
bad_bmap:
- printk(KERN_ERR "swapon: swapfile has holes\n");
+ pr_err("swapon: swapfile has holes\n");
ret = -EINVAL;
goto out;
}
@@ -246,7 +252,7 @@ out:
static sector_t swap_page_sector(struct page *page)
{
- return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
+ return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
}
int __swap_writepage(struct page *page, struct writeback_control *wbc,
@@ -290,8 +296,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
*/
set_page_dirty(page);
ClearPageReclaim(page);
- pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
- page_file_offset(page));
+ pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
+ page_file_offset(page));
}
end_page_writeback(page);
return ret;
@@ -347,6 +353,11 @@ int swap_readpage(struct page *page)
ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
if (!ret) {
+ if (trylock_page(page)) {
+ swap_slot_free_notify(page);
+ unlock_page(page);
+ }
+
count_vm_event(PSWPIN);
return 0;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 92c4c36501e7..c4f568206544 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -215,7 +215,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* all pages in [start_pfn...end_pfn) must be in the same zone.
* zone->lock must be held before call this.
*
- * Returns 1 if all pages in the range are isolated.
+ * Returns the last tested pfn.
*/
static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
@@ -289,11 +289,11 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
* now as a simple work-around, we use the next node for destination.
*/
if (PageHuge(page)) {
- nodemask_t src = nodemask_of_node(page_to_nid(page));
- nodemask_t dst;
- nodes_complement(dst, src);
+ int node = next_online_node(page_to_nid(page));
+ if (node == MAX_NUMNODES)
+ node = first_online_node;
return alloc_huge_page_node(page_hstate(compound_head(page)),
- next_node(page_to_nid(page), dst));
+ node);
}
if (PageHighMem(page))
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 44ad1f00c4e1..ac3d8d129974 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -198,9 +198,8 @@ void __dump_page_owner(struct page *page)
return;
}
- pr_alert("page allocated via order %u, migratetype %s, "
- "gfp_mask %#x(%pGg)\n", page_ext->order,
- migratetype_names[mt], gfp_mask, &gfp_mask);
+ pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
+ page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
print_stack_trace(&trace, 0);
if (page_ext->last_migrate_reason != -1)
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 10e3d0b8a86d..d66911ff42d9 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -95,7 +95,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
/* all units must be in a single group */
if (ai->nr_groups != 1) {
- printk(KERN_CRIT "percpu: can't handle more than one groups\n");
+ pr_crit("can't handle more than one group\n");
return -EINVAL;
}
@@ -103,8 +103,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
alloc_pages = roundup_pow_of_two(nr_pages);
if (alloc_pages > nr_pages)
- printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
- alloc_pages - nr_pages);
+ pr_warn("wasting %zu pages per chunk\n",
+ alloc_pages - nr_pages);
return 0;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 998607adf6eb..0c59684f1ff2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -53,6 +53,8 @@
* setup the first chunk containing the kernel static percpu area
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
@@ -888,8 +890,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
size = ALIGN(size, 2);
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
- WARN(true, "illegal size (%zu) or align (%zu) for "
- "percpu allocation\n", size, align);
+ WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
+ size, align);
return NULL;
}
@@ -1033,11 +1035,11 @@ fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
if (!is_atomic && warn_limit) {
- pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
- size, align, is_atomic, err);
+ pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
dump_stack();
if (!--warn_limit)
- pr_info("PERCPU: limit reached, disable warning\n");
+ pr_info("limit reached, disable warning\n");
}
if (is_atomic) {
/* see the flag handling in pcpu_blance_workfn() */
@@ -1449,20 +1451,20 @@ static void pcpu_dump_alloc_info(const char *lvl,
for (alloc_end += gi->nr_units / upa;
alloc < alloc_end; alloc++) {
if (!(alloc % apl)) {
- printk(KERN_CONT "\n");
+ pr_cont("\n");
printk("%spcpu-alloc: ", lvl);
}
- printk(KERN_CONT "[%0*d] ", group_width, group);
+ pr_cont("[%0*d] ", group_width, group);
for (unit_end += upa; unit < unit_end; unit++)
if (gi->cpu_map[unit] != NR_CPUS)
- printk(KERN_CONT "%0*d ", cpu_width,
- gi->cpu_map[unit]);
+ pr_cont("%0*d ",
+ cpu_width, gi->cpu_map[unit]);
else
- printk(KERN_CONT "%s ", empty_str);
+ pr_cont("%s ", empty_str);
}
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
/**
@@ -1538,8 +1540,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
- pr_emerg("PERCPU: failed to initialize, %s", #cond); \
- pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \
+ pr_emerg("failed to initialize, %s\n", #cond); \
+ pr_emerg("cpu_possible_mask=%*pb\n", \
cpumask_pr_args(cpu_possible_mask)); \
pcpu_dump_alloc_info(KERN_EMERG, ai); \
BUG(); \
@@ -1723,7 +1725,7 @@ static int __init percpu_alloc_setup(char *str)
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
else
- pr_warning("PERCPU: unknown allocator %s specified\n", str);
+ pr_warn("unknown allocator %s specified\n", str);
return 0;
}
@@ -2016,9 +2018,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
/* warn if maximum distance is further than 75% of vmalloc space */
if (max_distance > VMALLOC_TOTAL * 3 / 4) {
- pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
- "space 0x%lx\n", max_distance,
- VMALLOC_TOTAL);
+ pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n",
+ max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
@@ -2026,7 +2027,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
#endif
}
- pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
+ pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
@@ -2100,8 +2101,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr) {
- pr_warning("PERCPU: failed to allocate %s page "
- "for cpu%u\n", psize_str, cpu);
+ pr_warn("failed to allocate %s page for cpu%u\n",
+ psize_str, cpu);
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
@@ -2140,7 +2141,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
}
/* we're ready, commit */
- pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
+ pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
unit_pages, psize_str, vm.addr, ai->static_size,
ai->reserved_size, ai->dyn_size);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 06a005b979a7..71c5f9109f2a 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -84,20 +84,6 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
-
-/*
- * ARCHes with special requirements for evicting THP backing TLB entries can
- * implement this. Otherwise also, it can help optimize normal TLB flush in
- * THP regime. stock flush_tlb_range() typically has optimization to nuke the
- * entire TLB if flush span is greater than a threshold, which will
- * likely be true for a single huge page. Thus a single thp flush will
- * invalidate the entire TLB which is not desirable.
- * e.g. see arch/arc: flush_pmd_tlb_range
- */
-#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
-#endif
-
#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5d453e58ddbf..07514d41ebcc 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -98,9 +98,14 @@ static int process_vm_rw_single_vec(unsigned long addr,
int pages = min(nr_pages, max_pages_per_loop);
size_t bytes;
- /* Get the pages we're interested in */
- pages = get_user_pages_unlocked(task, mm, pa, pages,
- vm_write, 0, process_pages);
+ /*
+ * Get the pages we're interested in. We must
+ * add FOLL_REMOTE because task/mm might not
+ * current/current->mm
+ */
+ pages = __get_user_pages_unlocked(task, mm, pa, pages,
+ vm_write, 0, process_pages,
+ FOLL_REMOTE);
if (pages <= 0)
return -EFAULT;
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 942212970529..daf6ff6e199a 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -8,7 +8,7 @@
* improved on it.
*
* Copyright (C) 2007 SGI,
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter <cl@linux.com>
* Generalized, added support for multiple lists and
* constructors / destructors.
*/
diff --git a/mm/readahead.c b/mm/readahead.c
index 20e58e820e44..40be3ae0afe3 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -47,11 +47,11 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping,
if (!trylock_page(page))
BUG();
page->mapping = mapping;
- do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ do_invalidatepage(page, 0, PAGE_SIZE);
page->mapping = NULL;
unlock_page(page);
}
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -93,14 +93,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
read_cache_pages_invalidate_page(mapping, page);
continue;
}
- page_cache_release(page);
+ put_page(page);
ret = filler(data, page);
if (unlikely(ret)) {
read_cache_pages_invalidate_pages(mapping, pages);
break;
}
- task_io_account_read(PAGE_CACHE_SIZE);
+ task_io_account_read(PAGE_SIZE);
}
return ret;
}
@@ -130,7 +130,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
mapping_gfp_constraint(mapping, GFP_KERNEL))) {
mapping->a_ops->readpage(filp, page);
}
- page_cache_release(page);
+ put_page(page);
}
ret = 0;
@@ -163,7 +163,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (isize == 0)
goto out;
- end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+ end_index = ((isize - 1) >> PAGE_SHIFT);
/*
* Preallocate as many pages as we will need.
@@ -216,7 +216,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
while (nr_to_read) {
int err;
- unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+ unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
@@ -425,7 +425,7 @@ ondemand_readahead(struct address_space *mapping,
* trivial case: (offset - prev_offset) == 1
* unaligned reads: (offset - prev_offset) == 0
*/
- prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
+ prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
if (offset - prev_offset <= 1UL)
goto initial_readahead;
@@ -558,8 +558,8 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
if (f.file) {
if (f.file->f_mode & FMODE_READ) {
struct address_space *mapping = f.file->f_mapping;
- pgoff_t start = offset >> PAGE_CACHE_SHIFT;
- pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+ pgoff_t start = offset >> PAGE_SHIFT;
+ pgoff_t end = (offset + count - 1) >> PAGE_SHIFT;
unsigned long len = end - start + 1;
ret = do_readahead(mapping, f.file, start, len);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 02f0bfc3c80a..307b555024ef 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -569,19 +569,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
}
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-static void percpu_flush_tlb_batch_pages(void *data)
-{
- /*
- * All TLB entries are flushed on the assumption that it is
- * cheaper to flush all TLBs and let them be refilled than
- * flushing individual PFNs. Note that we do not track mm's
- * to flush as that might simply be multiple full TLB flushes
- * for no gain.
- */
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- flush_tlb_local();
-}
-
/*
* Flush TLB entries for recently unmapped pages from remote CPUs. It is
* important if a PTE was dirty when it was unmapped that it's flushed
@@ -598,15 +585,14 @@ void try_to_unmap_flush(void)
cpu = get_cpu();
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
-
- if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
- percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
-
- if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
- smp_call_function_many(&tlb_ubc->cpumask,
- percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+ if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+ local_flush_tlb();
+ trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
}
+
+ if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
+ flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
cpumask_clear(&tlb_ubc->cpumask);
tlb_ubc->flush_required = false;
tlb_ubc->writable = false;
@@ -1431,6 +1417,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
goto out;
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_address(vma, address,
+ flags & TTU_MIGRATION, page);
+ /* check if we have anything to do after split */
+ if (page_mapcount(page) == 0)
+ goto out;
+ }
+
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -1547,7 +1541,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
discard:
page_remove_rmap(page, PageHuge(page));
- page_cache_release(page);
+ put_page(page);
out_unmap:
pte_unmap_unlock(pte, ptl);
@@ -1576,10 +1570,10 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
return is_vma_temporary_stack(vma);
}
-static int page_not_mapped(struct page *page)
+static int page_mapcount_is_zero(struct page *page)
{
- return !page_mapped(page);
-};
+ return !page_mapcount(page);
+}
/**
* try_to_unmap - try to remove all page table mappings to a page
@@ -1606,12 +1600,10 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = &rp,
- .done = page_not_mapped,
+ .done = page_mapcount_is_zero,
.anon_lock = page_lock_anon_vma_read,
};
- VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
-
/*
* During exec, a temporary VMA is setup and later moved.
* The VMA is moved under the anon_vma lock but not the
@@ -1623,9 +1615,12 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;
- ret = rmap_walk(page, &rwc);
+ if (flags & TTU_RMAP_LOCKED)
+ ret = rmap_walk_locked(page, &rwc);
+ else
+ ret = rmap_walk(page, &rwc);
- if (ret != SWAP_MLOCK && !page_mapped(page)) {
+ if (ret != SWAP_MLOCK && !page_mapcount(page)) {
ret = SWAP_SUCCESS;
if (rp.lazyfreed && !PageDirty(page))
ret = SWAP_LZFREE;
@@ -1633,6 +1628,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
return ret;
}
+static int page_not_mapped(struct page *page)
+{
+ return !page_mapped(page);
+};
+
/**
* try_to_munlock - try to munlock a page
* @page: the page to be munlocked
@@ -1715,14 +1715,21 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
+ bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
- anon_vma = rmap_walk_anon_lock(page, rwc);
+ if (locked) {
+ anon_vma = page_anon_vma(page);
+ /* anon_vma disappear under us? */
+ VM_BUG_ON_PAGE(!anon_vma, page);
+ } else {
+ anon_vma = rmap_walk_anon_lock(page, rwc);
+ }
if (!anon_vma)
return ret;
@@ -1742,7 +1749,9 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
if (rwc->done && rwc->done(page))
break;
}
- anon_vma_unlock_read(anon_vma);
+
+ if (!locked)
+ anon_vma_unlock_read(anon_vma);
return ret;
}
@@ -1759,9 +1768,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
+ bool locked)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);
pgoff_t pgoff;
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
@@ -1778,7 +1788,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
return ret;
pgoff = page_to_pgoff(page);
- i_mmap_lock_read(mapping);
+ if (!locked)
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
@@ -1795,7 +1806,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
}
done:
- i_mmap_unlock_read(mapping);
+ if (!locked)
+ i_mmap_unlock_read(mapping);
return ret;
}
@@ -1804,9 +1816,20 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
if (unlikely(PageKsm(page)))
return rmap_walk_ksm(page, rwc);
else if (PageAnon(page))
- return rmap_walk_anon(page, rwc);
+ return rmap_walk_anon(page, rwc, false);
+ else
+ return rmap_walk_file(page, rwc, false);
+}
+
+/* Like rmap_walk, but caller holds relevant rmap lock */
+int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+{
+ /* no ksm support for now */
+ VM_BUG_ON_PAGE(PageKsm(page), page);
+ if (PageAnon(page))
+ return rmap_walk_anon(page, rwc, true);
else
- return rmap_walk_file(page, rwc);
+ return rmap_walk_file(page, rwc, true);
}
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/shmem.c b/mm/shmem.c
index 1acfdbc4bd9e..719bd6b88d98 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -75,8 +75,8 @@ static struct vfsmount *shm_mnt;
#include "internal.h"
-#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
-#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
+#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
+#define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20
@@ -176,13 +176,13 @@ static inline int shmem_reacct_size(unsigned long flags,
static inline int shmem_acct_block(unsigned long flags)
{
return (flags & VM_NORESERVE) ?
- security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
+ security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0;
}
static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
if (flags & VM_NORESERVE)
- vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
+ vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}
static const struct super_operations shmem_ops;
@@ -300,7 +300,7 @@ static int shmem_add_to_page_cache(struct page *page,
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- page_cache_get(page);
+ get_page(page);
page->mapping = mapping;
page->index = index;
@@ -318,7 +318,7 @@ static int shmem_add_to_page_cache(struct page *page,
} else {
page->mapping = NULL;
spin_unlock_irq(&mapping->tree_lock);
- page_cache_release(page);
+ put_page(page);
}
return error;
}
@@ -338,7 +338,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
__dec_zone_page_state(page, NR_FILE_PAGES);
__dec_zone_page_state(page, NR_SHMEM);
spin_unlock_irq(&mapping->tree_lock);
- page_cache_release(page);
+ put_page(page);
BUG_ON(error);
}
@@ -376,28 +376,23 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
if (iter.index >= end)
break;
page = radix_tree_deref_slot(slot);
- /*
- * This should only be possible to happen at index 0, so we
- * don't need to reset the counter, nor do we risk infinite
- * restarts.
- */
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
if (radix_tree_exceptional_entry(page))
swapped++;
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
@@ -479,10 +474,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
- unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
- unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+ pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ pgoff_t end = (lend + 1) >> PAGE_SHIFT;
+ unsigned int partial_start = lstart & (PAGE_SIZE - 1);
+ unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
struct pagevec pvec;
pgoff_t indices[PAGEVEC_SIZE];
long nr_swaps_freed = 0;
@@ -535,7 +530,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
struct page *page = NULL;
shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
if (page) {
- unsigned int top = PAGE_CACHE_SIZE;
+ unsigned int top = PAGE_SIZE;
if (start > end) {
top = partial_end;
partial_end = 0;
@@ -543,7 +538,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
zero_user_segment(page, partial_start, top);
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
if (partial_end) {
@@ -553,7 +548,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
zero_user_segment(page, 0, partial_end);
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
if (start >= end)
@@ -838,7 +833,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
mem_cgroup_commit_charge(page, memcg, true, false);
out:
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return error;
}
@@ -1085,7 +1080,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
if (!newpage)
return -ENOMEM;
- page_cache_get(newpage);
+ get_page(newpage);
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
@@ -1125,8 +1120,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
set_page_private(oldpage, 0);
unlock_page(oldpage);
- page_cache_release(oldpage);
- page_cache_release(oldpage);
+ put_page(oldpage);
+ put_page(oldpage);
return error;
}
@@ -1150,7 +1145,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
int once = 0;
int alloced = 0;
- if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
+ if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
repeat:
swap.val = 0;
@@ -1161,7 +1156,7 @@ repeat:
}
if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
- ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
error = -EINVAL;
goto unlock;
}
@@ -1174,7 +1169,7 @@ repeat:
if (sgp != SGP_READ)
goto clear;
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
if (page || (sgp == SGP_READ && !swap.val)) {
@@ -1332,7 +1327,7 @@ clear:
/* Perhaps the file has been truncated since we checked */
if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
- ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
if (alloced) {
ClearPageDirty(page);
delete_from_page_cache(page);
@@ -1360,7 +1355,7 @@ failed:
unlock:
if (page) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
if (error == -ENOSPC && !once++) {
info = SHMEM_I(inode);
@@ -1582,7 +1577,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
{
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
- pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t index = pos >> PAGE_SHIFT;
/* i_mutex is held by caller */
if (unlikely(info->seals)) {
@@ -1606,16 +1601,16 @@ shmem_write_end(struct file *file, struct address_space *mapping,
i_size_write(inode, pos + copied);
if (!PageUptodate(page)) {
- if (copied < PAGE_CACHE_SIZE) {
- unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ if (copied < PAGE_SIZE) {
+ unsigned from = pos & (PAGE_SIZE - 1);
zero_user_segments(page, 0, from,
- from + copied, PAGE_CACHE_SIZE);
+ from + copied, PAGE_SIZE);
}
SetPageUptodate(page);
}
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return copied;
}
@@ -1640,8 +1635,8 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!iter_is_iovec(to))
sgp = SGP_DIRTY;
- index = *ppos >> PAGE_CACHE_SHIFT;
- offset = *ppos & ~PAGE_CACHE_MASK;
+ index = *ppos >> PAGE_SHIFT;
+ offset = *ppos & ~PAGE_MASK;
for (;;) {
struct page *page = NULL;
@@ -1649,11 +1644,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
unsigned long nr, ret;
loff_t i_size = i_size_read(inode);
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = i_size >> PAGE_SHIFT;
if (index > end_index)
break;
if (index == end_index) {
- nr = i_size & ~PAGE_CACHE_MASK;
+ nr = i_size & ~PAGE_MASK;
if (nr <= offset)
break;
}
@@ -1671,14 +1666,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* We must evaluate after, since reads (unlike writes)
* are called without i_mutex protection against truncate
*/
- nr = PAGE_CACHE_SIZE;
+ nr = PAGE_SIZE;
i_size = i_size_read(inode);
- end_index = i_size >> PAGE_CACHE_SHIFT;
+ end_index = i_size >> PAGE_SHIFT;
if (index == end_index) {
- nr = i_size & ~PAGE_CACHE_MASK;
+ nr = i_size & ~PAGE_MASK;
if (nr <= offset) {
if (page)
- page_cache_release(page);
+ put_page(page);
break;
}
}
@@ -1699,7 +1694,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
mark_page_accessed(page);
} else {
page = ZERO_PAGE(0);
- page_cache_get(page);
+ get_page(page);
}
/*
@@ -1709,10 +1704,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
ret = copy_page_to_iter(page, offset, nr, to);
retval += ret;
offset += ret;
- index += offset >> PAGE_CACHE_SHIFT;
- offset &= ~PAGE_CACHE_MASK;
+ index += offset >> PAGE_SHIFT;
+ offset &= ~PAGE_MASK;
- page_cache_release(page);
+ put_page(page);
if (!iov_iter_count(to))
break;
if (ret < nr) {
@@ -1722,7 +1717,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
cond_resched();
}
- *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+ *ppos = ((loff_t) index << PAGE_SHIFT) + offset;
file_accessed(file);
return retval ? retval : error;
}
@@ -1760,9 +1755,9 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (splice_grow_spd(pipe, &spd))
return -ENOMEM;
- index = *ppos >> PAGE_CACHE_SHIFT;
- loff = *ppos & ~PAGE_CACHE_MASK;
- req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ index = *ppos >> PAGE_SHIFT;
+ loff = *ppos & ~PAGE_MASK;
+ req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT;
nr_pages = min(req_pages, spd.nr_pages_max);
spd.nr_pages = find_get_pages_contig(mapping, index,
@@ -1779,7 +1774,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
index++;
}
- index = *ppos >> PAGE_CACHE_SHIFT;
+ index = *ppos >> PAGE_SHIFT;
nr_pages = spd.nr_pages;
spd.nr_pages = 0;
@@ -1789,7 +1784,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (!len)
break;
- this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+ this_len = min_t(unsigned long, len, PAGE_SIZE - loff);
page = spd.pages[page_nr];
if (!PageUptodate(page) || page->mapping != mapping) {
@@ -1798,19 +1793,19 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (error)
break;
unlock_page(page);
- page_cache_release(spd.pages[page_nr]);
+ put_page(spd.pages[page_nr]);
spd.pages[page_nr] = page;
}
isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ end_index = (isize - 1) >> PAGE_SHIFT;
if (unlikely(!isize || index > end_index))
break;
if (end_index == index) {
unsigned int plen;
- plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+ plen = ((isize - 1) & ~PAGE_MASK) + 1;
if (plen <= loff)
break;
@@ -1827,7 +1822,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
}
while (page_nr < nr_pages)
- page_cache_release(spd.pages[page_nr++]);
+ put_page(spd.pages[page_nr++]);
if (spd.nr_pages)
error = splice_to_pipe(pipe, &spd);
@@ -1909,10 +1904,10 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
else if (offset >= inode->i_size)
offset = -ENXIO;
else {
- start = offset >> PAGE_CACHE_SHIFT;
- end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ start = offset >> PAGE_SHIFT;
+ end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
new_offset = shmem_seek_hole_data(mapping, start, end, whence);
- new_offset <<= PAGE_CACHE_SHIFT;
+ new_offset <<= PAGE_SHIFT;
if (new_offset > offset) {
if (new_offset < inode->i_size)
offset = new_offset;
@@ -1947,12 +1942,13 @@ static void shmem_tag_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
page = radix_tree_deref_slot(slot);
if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
} else if (page_count(page) - page_mapcount(page) > 1) {
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_set(&mapping->page_tree, iter.index,
@@ -1962,8 +1958,7 @@ restart:
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2000,14 +1995,15 @@ static int shmem_wait_for_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
start, SHMEM_TAG_PINNED) {
page = radix_tree_deref_slot(slot);
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
page = NULL;
}
@@ -2032,8 +2028,7 @@ restart:
continue_resched:
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2208,8 +2203,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}
- start = offset >> PAGE_CACHE_SHIFT;
- end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ start = offset >> PAGE_SHIFT;
+ end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
/* Try to avoid a swapstorm if len is impossible to satisfy */
if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
error = -ENOSPC;
@@ -2242,8 +2237,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
if (error) {
/* Remove the !PageUptodate pages we added */
shmem_undo_range(inode,
- (loff_t)start << PAGE_CACHE_SHIFT,
- (loff_t)index << PAGE_CACHE_SHIFT, true);
+ (loff_t)start << PAGE_SHIFT,
+ (loff_t)index << PAGE_SHIFT, true);
goto undone;
}
@@ -2264,7 +2259,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
*/
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
cond_resched();
}
@@ -2285,7 +2280,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
buf->f_type = TMPFS_MAGIC;
- buf->f_bsize = PAGE_CACHE_SIZE;
+ buf->f_bsize = PAGE_SIZE;
buf->f_namelen = NAME_MAX;
if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks;
@@ -2528,7 +2523,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
struct shmem_inode_info *info;
len = strlen(symname) + 1;
- if (len > PAGE_CACHE_SIZE)
+ if (len > PAGE_SIZE)
return -ENAMETOOLONG;
inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
@@ -2567,7 +2562,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
SetPageUptodate(page);
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -2823,9 +2818,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if ((value = strchr(this_char,'=')) != NULL) {
*value++ = 0;
} else {
- printk(KERN_ERR
- "tmpfs: No value for mount option '%s'\n",
- this_char);
+ pr_err("tmpfs: No value for mount option '%s'\n",
+ this_char);
goto error;
}
@@ -2841,7 +2835,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if (*rest)
goto bad_val;
sbinfo->max_blocks =
- DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
+ DIV_ROUND_UP(size, PAGE_SIZE);
} else if (!strcmp(this_char,"nr_blocks")) {
sbinfo->max_blocks = memparse(value, &rest);
if (*rest)
@@ -2880,8 +2874,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if (mpol_parse_str(value, &mpol))
goto bad_val;
} else {
- printk(KERN_ERR "tmpfs: Bad mount option %s\n",
- this_char);
+ pr_err("tmpfs: Bad mount option %s\n", this_char);
goto error;
}
}
@@ -2889,7 +2882,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
return 0;
bad_val:
- printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
+ pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
value, this_char);
error:
mpol_put(mpol);
@@ -2947,7 +2940,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (sbinfo->max_blocks != shmem_default_max_blocks())
seq_printf(seq, ",size=%luk",
- sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
+ sbinfo->max_blocks << (PAGE_SHIFT - 10));
if (sbinfo->max_inodes != shmem_default_max_inodes())
seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
@@ -3089,8 +3082,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
sbinfo->free_inodes = sbinfo->max_inodes;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = TMPFS_MAGIC;
sb->s_op = &shmem_ops;
sb->s_time_gran = 1;
@@ -3286,14 +3279,14 @@ int __init shmem_init(void)
error = register_filesystem(&shmem_fs_type);
if (error) {
- printk(KERN_ERR "Could not register tmpfs\n");
+ pr_err("Could not register tmpfs\n");
goto out2;
}
shm_mnt = kern_mount(&shmem_fs_type);
if (IS_ERR(shm_mnt)) {
error = PTR_ERR(shm_mnt);
- printk(KERN_ERR "Could not kern_mount tmpfs\n");
+ pr_err("Could not kern_mount tmpfs\n");
goto out1;
}
return 0;
diff --git a/mm/slab.c b/mm/slab.c
index 852fc5c79829..17e2848979c5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -474,7 +474,7 @@ static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
static void __slab_error(const char *function, struct kmem_cache *cachep,
char *msg)
{
- printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
+ pr_err("slab error in %s(): cache `%s': %s\n",
function, cachep->name, msg);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -670,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
static inline gfp_t gfp_exact_node(gfp_t flags)
{
- return flags;
+ return flags & ~__GFP_NOFAIL;
}
#else /* CONFIG_NUMA */
@@ -841,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
}
/*
- * Construct gfp mask to allocate from a specific node but do not direct reclaim
- * or warn about failures. kswapd may still wake to reclaim in the background.
+ * Construct gfp mask to allocate from a specific node but do not reclaim or
+ * warn about failures.
*/
static inline gfp_t gfp_exact_node(gfp_t flags)
{
- return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
}
#endif
@@ -1442,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
*/
static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
{
- const unsigned long nr_freed = (1 << cachep->gfporder);
+ int order = cachep->gfporder;
+ unsigned long nr_freed = (1 << order);
- kmemcheck_free_shadow(page, cachep->gfporder);
+ kmemcheck_free_shadow(page, order);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
sub_zone_page_state(page_zone(page),
@@ -1461,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_kmem_pages(page, cachep->gfporder);
+ memcg_uncharge_slab(page, order, cachep);
+ __free_pages(page, order);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -1551,7 +1553,7 @@ static void dump_line(char *data, int offset, int limit)
unsigned char error = 0;
int bad_count = 0;
- printk(KERN_ERR "%03x: ", offset);
+ pr_err("%03x: ", offset);
for (i = 0; i < limit; i++) {
if (data[offset + i] != POISON_FREE) {
error = data[offset + i];
@@ -1564,13 +1566,11 @@ static void dump_line(char *data, int offset, int limit)
if (bad_count == 1) {
error ^= POISON_FREE;
if (!(error & (error - 1))) {
- printk(KERN_ERR "Single bit error detected. Probably "
- "bad RAM.\n");
+ pr_err("Single bit error detected. Probably bad RAM.\n");
#ifdef CONFIG_X86
- printk(KERN_ERR "Run memtest86+ or a similar memory "
- "test tool.\n");
+ pr_err("Run memtest86+ or a similar memory test tool.\n");
#else
- printk(KERN_ERR "Run a memory test tool.\n");
+ pr_err("Run a memory test tool.\n");
#endif
}
}
@@ -1585,13 +1585,13 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
char *realobj;
if (cachep->flags & SLAB_RED_ZONE) {
- printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
- *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ pr_err("Redzone: 0x%llx/0x%llx\n",
+ *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
if (cachep->flags & SLAB_STORE_USER) {
- printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
+ pr_err("Last user: [<%p>](%pSR)\n",
*dbg_userword(cachep, objp),
*dbg_userword(cachep, objp));
}
@@ -1627,9 +1627,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Mismatch ! */
/* Print header */
if (lines == 0) {
- printk(KERN_ERR
- "Slab corruption (%s): %s start=%p, len=%d\n",
- print_tainted(), cachep->name, realobj, size);
+ pr_err("Slab corruption (%s): %s start=%p, len=%d\n",
+ print_tainted(), cachep->name,
+ realobj, size);
print_objinfo(cachep, objp, 0);
}
/* Hexdump the affected line */
@@ -1656,15 +1656,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
if (objnr) {
objp = index_to_obj(cachep, page, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
- printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
- realobj, size);
+ pr_err("Prev obj: start=%p, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
objp = index_to_obj(cachep, page, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
- printk(KERN_ERR "Next obj: start=%p, len=%d\n",
- realobj, size);
+ pr_err("Next obj: start=%p, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
}
@@ -1691,11 +1689,9 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
}
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "start of a freed object "
- "was overwritten");
+ slab_error(cachep, "start of a freed object was overwritten");
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "end of a freed object "
- "was overwritten");
+ slab_error(cachep, "end of a freed object was overwritten");
}
}
}
@@ -2090,6 +2086,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
}
#endif
+ kasan_cache_create(cachep, &size, &flags);
+
size = ALIGN(size, cachep->align);
/*
* We should restrict the number of objects in a slab to implement
@@ -2391,16 +2389,19 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
* cache which they are a constructor for. Otherwise, deadlock.
* They must also be threaded.
*/
- if (cachep->ctor && !(cachep->flags & SLAB_POISON))
+ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
+ kasan_unpoison_object_data(cachep,
+ objp + obj_offset(cachep));
cachep->ctor(objp + obj_offset(cachep));
+ kasan_poison_object_data(
+ cachep, objp + obj_offset(cachep));
+ }
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "constructor overwrote the"
- " end of an object");
+ slab_error(cachep, "constructor overwrote the end of an object");
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "constructor overwrote the"
- " start of an object");
+ slab_error(cachep, "constructor overwrote the start of an object");
}
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON) {
@@ -2415,6 +2416,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
struct page *page)
{
int i;
+ void *objp;
cache_init_objs_debug(cachep, page);
@@ -2425,8 +2427,12 @@ static void cache_init_objs(struct kmem_cache *cachep,
for (i = 0; i < cachep->num; i++) {
/* constructor could break poison info */
- if (DEBUG == 0 && cachep->ctor)
- cachep->ctor(index_to_obj(cachep, page, i));
+ if (DEBUG == 0 && cachep->ctor) {
+ objp = index_to_obj(cachep, page, i);
+ kasan_unpoison_object_data(cachep, objp);
+ cachep->ctor(objp);
+ kasan_poison_object_data(cachep, objp);
+ }
set_free_obj(page, i, i);
}
@@ -2467,8 +2473,8 @@ static void slab_put_obj(struct kmem_cache *cachep,
/* Verify double free bug */
for (i = page->active; i < cachep->num; i++) {
if (get_free_obj(page, i) == objnr) {
- printk(KERN_ERR "slab: double free detected in cache "
- "'%s', objp %p\n", cachep->name, objp);
+ pr_err("slab: double free detected in cache '%s', objp %p\n",
+ cachep->name, objp);
BUG();
}
}
@@ -2556,6 +2562,7 @@ static int cache_grow(struct kmem_cache *cachep,
slab_map_pages(cachep, page, freelist);
+ kasan_poison_slab(page);
cache_init_objs(cachep, page);
if (gfpflags_allow_blocking(local_flags))
@@ -2587,7 +2594,7 @@ failed:
static void kfree_debugcheck(const void *objp)
{
if (!virt_addr_valid(objp)) {
- printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
+ pr_err("kfree_debugcheck: out of range ptr %lxh\n",
(unsigned long)objp);
BUG();
}
@@ -2611,8 +2618,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
else
slab_error(cache, "memory outside object was overwritten");
- printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
- obj, redzone1, redzone2);
+ pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ obj, redzone1, redzone2);
}
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
@@ -2899,12 +2906,10 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
- slab_error(cachep, "double free, or memory outside"
- " object was overwritten");
- printk(KERN_ERR
- "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
- objp, *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ slab_error(cachep, "double free, or memory outside object was overwritten");
+ pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ objp, *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
*dbg_redzone1(cachep, objp) = RED_ACTIVE;
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
@@ -2915,7 +2920,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
cachep->ctor(objp);
if (ARCH_SLAB_MINALIGN &&
((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
- printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+ pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
objp, (int)ARCH_SLAB_MINALIGN);
}
return objp;
@@ -3324,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
{
struct array_cache *ac = cpu_cache_get(cachep);
+ kasan_slab_free(cachep, objp);
+
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
@@ -3371,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *ret = slab_alloc(cachep, flags, _RET_IP_);
+ kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
@@ -3436,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
ret = slab_alloc(cachep, flags, _RET_IP_);
+ kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(_RET_IP_, ret,
size, cachep->size, flags);
return ret;
@@ -3459,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+ kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
cachep->object_size, cachep->size,
flags, nodeid);
@@ -3477,6 +3487,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+ kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc_node(_RET_IP_, ret,
size, cachep->size,
flags, nodeid);
@@ -3489,11 +3500,15 @@ static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
{
struct kmem_cache *cachep;
+ void *ret;
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- return kmem_cache_alloc_node_trace(cachep, flags, node, size);
+ ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
+ kasan_kmalloc(cachep, ret, size, flags);
+
+ return ret;
}
void *__kmalloc_node(size_t size, gfp_t flags, int node)
@@ -3527,6 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
return cachep;
ret = slab_alloc(cachep, flags, caller);
+ kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(caller, ret,
size, cachep->size, flags);
@@ -3842,7 +3858,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
if (err)
- printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
+ pr_err("enable_cpucache failed for %s, error %d\n",
cachep->name, -err);
return err;
}
@@ -3998,7 +4014,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
name = cachep->name;
if (error)
- printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
+ pr_err("slab: cache %s error: %s\n", name, error);
sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
@@ -4026,8 +4042,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
unsigned long node_frees = cachep->node_frees;
unsigned long overflows = cachep->node_overflow;
- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
- "%4lu %4lu %4lu %4lu %4lu",
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
allocs, high, grown,
reaped, errors, max_freeable, node_allocs,
node_frees, overflows);
@@ -4299,10 +4314,18 @@ module_init(slab_proc_init);
*/
size_t ksize(const void *objp)
{
+ size_t size;
+
BUG_ON(!objp);
if (unlikely(objp == ZERO_SIZE_PTR))
return 0;
- return virt_to_cache(objp)->object_size;
+ size = virt_to_cache(objp)->object_size;
+ /* We assume that ksize callers could use the whole allocated area,
+ * so we need to unpoison this area.
+ */
+ kasan_krealloc(objp, size, GFP_NOWAIT);
+
+ return size;
}
EXPORT_SYMBOL(ksize);
diff --git a/mm/slab.h b/mm/slab.h
index b7934361f026..5969769fbee6 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -246,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
+ int ret;
+
if (!memcg_kmem_enabled())
return 0;
if (is_root_cache(s))
return 0;
- return __memcg_kmem_charge_memcg(page, gfp, order,
- s->memcg_params.memcg);
+
+ ret = __memcg_kmem_charge_memcg(page, gfp, order,
+ s->memcg_params.memcg);
+ if (ret)
+ return ret;
+
+ memcg_kmem_update_page_stat(page,
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ return 0;
+}
+
+static __always_inline void memcg_uncharge_slab(struct page *page, int order,
+ struct kmem_cache *s)
+{
+ memcg_kmem_update_page_stat(page,
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ memcg_kmem_uncharge(page, order);
}
extern void slab_init_memcg_params(struct kmem_cache *);
@@ -294,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
return 0;
}
+static inline void memcg_uncharge_slab(struct page *page, int order,
+ struct kmem_cache *s)
+{
+}
+
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
@@ -379,7 +405,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
kmemleak_alloc_recursive(object, s->object_size, 1,
s->flags, flags);
- kasan_slab_alloc(s, object);
+ kasan_slab_alloc(s, object, flags);
}
memcg_kmem_put_cache(s);
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6afb2263a5c5..3239bfd758e6 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache;
*/
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB)
+ SLAB_FAILSLAB | SLAB_KASAN)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_NOTRACK | SLAB_ACCOUNT)
@@ -442,7 +442,7 @@ out_unlock:
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
else {
- printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
+ pr_warn("kmem_cache_create(%s) failed with error %d\n",
name, err);
dump_stack();
}
@@ -510,7 +510,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
* The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
- if (!memcg_kmem_online(memcg))
+ if (memcg->kmem_state != KMEM_ONLINE)
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -726,8 +726,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
err = shutdown_cache(s, &release, &need_rcu_barrier);
if (err) {
- pr_err("kmem_cache_destroy %s: "
- "Slab cache still has objects\n", s->name);
+ pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
+ s->name);
dump_stack();
}
out_unlock:
@@ -1013,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
page = alloc_kmem_pages(flags, order);
ret = page ? page_address(page) : NULL;
kmemleak_alloc(ret, size, 1, flags);
- kasan_kmalloc_large(ret, size);
+ kasan_kmalloc_large(ret, size, flags);
return ret;
}
EXPORT_SYMBOL(kmalloc_order);
@@ -1047,13 +1047,11 @@ static void print_slabinfo_header(struct seq_file *m)
#else
seq_puts(m, "slabinfo - version: 2.1\n");
#endif
- seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
- "<objperslab> <pagesperslab>");
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#ifdef CONFIG_DEBUG_SLAB
- seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
- "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
@@ -1194,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
ks = ksize(p);
if (ks >= new_size) {
- kasan_krealloc((void *)p, new_size);
+ kasan_krealloc((void *)p, new_size, flags);
return (void *)p;
}
diff --git a/mm/slub.c b/mm/slub.c
index 6c91324f9370..4dbb109eb8cd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -254,11 +254,10 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
void *p;
-#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (!debug_pagealloc_enabled())
+ return get_freepointer(s, object);
+
probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
-#else
- p = get_freepointer(s, object);
-#endif
return p;
}
@@ -951,14 +950,14 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
max_objects = MAX_OBJS_PER_PAGE;
if (page->objects != max_objects) {
- slab_err(s, page, "Wrong number of objects. Found %d but "
- "should be %d", page->objects, max_objects);
+ slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
+ page->objects, max_objects);
page->objects = max_objects;
slab_fix(s, "Number of objects adjusted.");
}
if (page->inuse != page->objects - nr) {
- slab_err(s, page, "Wrong object count. Counter is %d but "
- "counted were %d", page->inuse, page->objects - nr);
+ slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
+ page->inuse, page->objects - nr);
page->inuse = page->objects - nr;
slab_fix(s, "Object count adjusted.");
}
@@ -1118,8 +1117,8 @@ static inline int free_consistency_checks(struct kmem_cache *s,
if (unlikely(s != page->slab_cache)) {
if (!PageSlab(page)) {
- slab_err(s, page, "Attempt to free object(0x%p) "
- "outside of slab", object);
+ slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
+ object);
} else if (!page->slab_cache) {
pr_err("SLUB <none>: no slab for object 0x%p.\n",
object);
@@ -1314,7 +1313,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
kmemleak_alloc(ptr, size, 1, flags);
- kasan_kmalloc_large(ptr, size);
+ kasan_kmalloc_large(ptr, size, flags);
}
static inline void kfree_hook(const void *x)
@@ -1427,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
- alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
+ alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
@@ -1540,7 +1539,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_kmem_pages(page, order);
+ memcg_uncharge_slab(page, order, s);
+ __free_pages(page, order);
}
#define need_reserve_slab_rcu \
@@ -2596,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2624,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -3182,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node)
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
+ kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
+ GFP_KERNEL);
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, page->objects);
@@ -3439,10 +3440,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
free_kmem_cache_nodes(s);
error:
if (flags & SLAB_PANIC)
- panic("Cannot create slab %s size=%lu realsize=%u "
- "order=%u offset=%u flags=%lx\n",
- s->name, (unsigned long)s->size, s->size,
- oo_order(s->oo), s->offset, flags);
+ panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n",
+ s->name, (unsigned long)s->size, s->size,
+ oo_order(s->oo), s->offset, flags);
return -EINVAL;
}
@@ -3562,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags)
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, flags);
return ret;
}
@@ -3607,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, flags);
return ret;
}
@@ -3636,7 +3636,7 @@ size_t ksize(const void *object)
size_t size = __ksize(object);
/* We assume that ksize callers could use whole allocated area,
so we need unpoison this area. */
- kasan_krealloc(object, size);
+ kasan_krealloc(object, size, GFP_NOWAIT);
return size;
}
EXPORT_SYMBOL(ksize);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index b60802b3e5ea..68885dcbaf40 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -166,8 +166,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
int actual_node = early_pfn_to_nid(pfn);
if (node_distance(actual_node, node) > LOCAL_DISTANCE)
- printk(KERN_WARNING "[%lx-%lx] potential offnode "
- "page_structs\n", start, end - 1);
+ pr_warn("[%lx-%lx] potential offnode page_structs\n",
+ start, end - 1);
}
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
@@ -292,8 +292,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 3717ceed4177..5d0cf4540364 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -313,9 +313,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
if (usemap_nid != nid) {
- printk(KERN_INFO
- "node %d must be removed before remove section %ld\n",
- nid, usemap_snr);
+ pr_info("node %d must be removed before remove section %ld\n",
+ nid, usemap_snr);
return;
}
/*
@@ -324,10 +323,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
* gather other removable sections for dynamic partitioning.
* Just notify un-removable section's number here.
*/
- printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
- pgdat_snr, nid);
- printk(KERN_CONT
- " have a circular dependency on usemap and pgdat allocations\n");
+ pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
+ usemap_snr, pgdat_snr, nid);
}
#else
static unsigned long * __init
@@ -355,7 +352,7 @@ static void __init sparse_early_usemaps_alloc_node(void *data,
usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
size * usemap_count);
if (!usemap) {
- printk(KERN_WARNING "%s: allocation failed\n", __func__);
+ pr_warn("%s: allocation failed\n", __func__);
return;
}
@@ -428,8 +425,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
}
}
@@ -456,8 +453,8 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
if (map)
return map;
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
return NULL;
}
diff --git a/mm/swap.c b/mm/swap.c
index 09fe5e97714a..03aacbcb013f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -114,7 +114,7 @@ void put_pages_list(struct list_head *pages)
victim = list_entry(pages->prev, struct page, lru);
list_del(&victim->lru);
- page_cache_release(victim);
+ put_page(victim);
}
}
EXPORT_SYMBOL(put_pages_list);
@@ -142,7 +142,7 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
return seg;
pages[seg] = kmap_to_page(kiov[seg].iov_base);
- page_cache_get(pages[seg]);
+ get_page(pages[seg]);
}
return seg;
@@ -236,7 +236,7 @@ void rotate_reclaimable_page(struct page *page)
struct pagevec *pvec;
unsigned long flags;
- page_cache_get(page);
+ get_page(page);
local_irq_save(flags);
pvec = this_cpu_ptr(&lru_rotate_pvecs);
if (!pagevec_add(pvec, page))
@@ -294,7 +294,7 @@ void activate_page(struct page *page)
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
- page_cache_get(page);
+ get_page(page);
if (!pagevec_add(pvec, page))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
put_cpu_var(activate_page_pvecs);
@@ -389,7 +389,7 @@ static void __lru_cache_add(struct page *page)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
- page_cache_get(page);
+ get_page(page);
if (!pagevec_space(pvec))
__pagevec_lru_add(pvec);
pagevec_add(pvec, page);
@@ -646,7 +646,7 @@ void deactivate_page(struct page *page)
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
- page_cache_get(page);
+ get_page(page);
if (!pagevec_add(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
put_cpu_var(lru_deactivate_pvecs);
@@ -698,7 +698,7 @@ void lru_add_drain_all(void)
}
/**
- * release_pages - batched page_cache_release()
+ * release_pages - batched put_page()
* @pages: array of pages to release
* @nr: number of pages
* @cold: whether the pages are cache cold
@@ -728,6 +728,11 @@ void release_pages(struct page **pages, int nr, bool cold)
zone = NULL;
}
+ if (is_huge_zero_page(page)) {
+ put_huge_zero_page();
+ continue;
+ }
+
page = compound_head(page);
if (!put_page_testzero(page))
continue;
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index b5f7f24b8dd1..310ac0b8f974 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -174,9 +174,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
return 0;
nomem:
- printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
- printk(KERN_INFO
- "swap_cgroup can be disabled by swapaccount=0 boot option\n");
+ pr_info("couldn't allocate enough memory for swap_cgroup\n");
+ pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n");
return -ENOMEM;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 69cb2464e7dc..366ce3518703 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -85,7 +85,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageSwapCache(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- page_cache_get(page);
+ get_page(page);
SetPageSwapCache(page);
set_page_private(page, entry.val);
@@ -109,7 +109,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
VM_BUG_ON(error == -EEXIST);
set_page_private(page, 0UL);
ClearPageSwapCache(page);
- page_cache_release(page);
+ put_page(page);
}
return error;
@@ -226,7 +226,7 @@ void delete_from_swap_cache(struct page *page)
spin_unlock_irq(&address_space->tree_lock);
swapcache_free(entry);
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -252,7 +252,7 @@ static inline void free_swap_cache(struct page *page)
void free_page_and_swap_cache(struct page *page)
{
free_swap_cache(page);
- page_cache_release(page);
+ put_page(page);
}
/*
@@ -380,7 +380,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
} while (err != -ENOMEM);
if (new_page)
- page_cache_release(new_page);
+ put_page(new_page);
return found_page;
}
@@ -495,7 +495,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
continue;
if (offset != entry_offset)
SetPageReadahead(page);
- page_cache_release(page);
+ put_page(page);
}
blk_finish_plug(&plug);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d2c37365e2d6..031713ab40ce 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -48,6 +48,12 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**);
DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
+/*
+ * Some modules use swappable objects and may try to swap them out under
+ * memory pressure (via the shrinker). Before doing so, they may wish to
+ * check to see if any swap space is available.
+ */
+EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority;
@@ -113,7 +119,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
ret = try_to_free_swap(page);
unlock_page(page);
}
- page_cache_release(page);
+ put_page(page);
return ret;
}
@@ -916,18 +922,19 @@ out:
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
* later would only waste time away from clustering.
+ *
+ * NOTE: total_mapcount should not be relied upon by the caller if
+ * reuse_swap_page() returns false, but it may be always overwritten
+ * (see the other implementation for CONFIG_SWAP=n).
*/
-int reuse_swap_page(struct page *page)
+bool reuse_swap_page(struct page *page, int *total_mapcount)
{
int count;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
- return 0;
- /* The page is part of THP and cannot be reused */
- if (PageTransCompound(page))
- return 0;
- count = page_mapcount(page);
+ return false;
+ count = page_trans_huge_mapcount(page, total_mapcount);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
if (count == 1 && !PageWriteback(page)) {
@@ -994,7 +1001,7 @@ int free_swap_and_cache(swp_entry_t entry)
page = find_get_page(swap_address_space(entry),
entry.val);
if (page && !trylock_page(page)) {
- page_cache_release(page);
+ put_page(page);
page = NULL;
}
}
@@ -1011,7 +1018,7 @@ int free_swap_and_cache(swp_entry_t entry)
SetPageDirty(page);
}
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
return p != NULL;
}
@@ -1512,7 +1519,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
}
if (retval) {
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
break;
}
@@ -1564,7 +1571,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
*/
SetPageDirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
/*
* Make sure that we aren't completely killing
@@ -2526,8 +2533,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
- pr_info("Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
+ pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
@@ -2569,7 +2575,7 @@ bad_swap:
out:
if (page && !IS_ERR(page)) {
kunmap(page);
- page_cache_release(page);
+ put_page(page);
}
if (name)
putname(name);
diff --git a/mm/truncate.c b/mm/truncate.c
index 7598b552ae03..b00272810871 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -118,7 +118,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
return -EIO;
if (page_has_private(page))
- do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ do_invalidatepage(page, 0, PAGE_SIZE);
/*
* Some filesystems seem to re-dirty the page even after
@@ -159,8 +159,8 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
{
if (page_mapped(page)) {
unmap_mapping_range(mapping,
- (loff_t)page->index << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, 0);
+ (loff_t)page->index << PAGE_SHIFT,
+ PAGE_SIZE, 0);
}
return truncate_complete_page(mapping, page);
}
@@ -241,8 +241,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
return;
/* Offsets within partial pages */
- partial_start = lstart & (PAGE_CACHE_SIZE - 1);
- partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+ partial_start = lstart & (PAGE_SIZE - 1);
+ partial_end = (lend + 1) & (PAGE_SIZE - 1);
/*
* 'start' and 'end' always covers the range of pages to be fully
@@ -250,7 +250,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
* start of the range and 'partial_end' at the end of the range.
* Note that 'end' is exclusive while 'lend' is inclusive.
*/
- start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (lend == -1)
/*
* lend == -1 indicates end-of-file so we have to set 'end'
@@ -259,7 +259,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
*/
end = -1;
else
- end = (lend + 1) >> PAGE_CACHE_SHIFT;
+ end = (lend + 1) >> PAGE_SHIFT;
pagevec_init(&pvec, 0);
index = start;
@@ -298,7 +298,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
if (partial_start) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
- unsigned int top = PAGE_CACHE_SIZE;
+ unsigned int top = PAGE_SIZE;
if (start > end) {
/* Truncation within a single page */
top = partial_end;
@@ -311,7 +311,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
do_invalidatepage(page, partial_start,
top - partial_start);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
if (partial_end) {
@@ -324,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
do_invalidatepage(page, 0,
partial_end);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
}
/*
@@ -538,7 +538,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
- page_cache_release(page); /* pagecache ref */
+ put_page(page); /* pagecache ref */
return 1;
failed:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -608,18 +608,18 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
* Zap the rest of the file in one hit.
*/
unmap_mapping_range(mapping,
- (loff_t)index << PAGE_CACHE_SHIFT,
+ (loff_t)index << PAGE_SHIFT,
(loff_t)(1 + end - index)
- << PAGE_CACHE_SHIFT,
- 0);
+ << PAGE_SHIFT,
+ 0);
did_range_unmap = 1;
} else {
/*
* Just zap this page
*/
unmap_mapping_range(mapping,
- (loff_t)index << PAGE_CACHE_SHIFT,
- PAGE_CACHE_SIZE, 0);
+ (loff_t)index << PAGE_SHIFT,
+ PAGE_SIZE, 0);
}
}
BUG_ON(page_mapped(page));
@@ -744,14 +744,14 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
WARN_ON(to > inode->i_size);
- if (from >= to || bsize == PAGE_CACHE_SIZE)
+ if (from >= to || bsize == PAGE_SIZE)
return;
/* Page straddling @from will not have any hole block created? */
rounded_from = round_up(from, bsize);
- if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
+ if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
return;
- index = from >> PAGE_CACHE_SHIFT;
+ index = from >> PAGE_SHIFT;
page = find_lock_page(inode->i_mapping, index);
/* Page not cached? Nothing to do */
if (!page)
@@ -763,7 +763,7 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
if (page_mkclean(page))
set_page_dirty(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
}
EXPORT_SYMBOL(pagecache_isize_extended);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 806b0c758c5b..af817e5060fb 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -93,7 +93,7 @@ out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
mem_cgroup_cancel_charge(page, memcg, false);
out_release:
- page_cache_release(page);
+ put_page(page);
goto out;
}
@@ -230,8 +230,7 @@ retry:
break;
}
if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
- dst_addr))) {
+ unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
err = -ENOMEM;
break;
}
@@ -288,7 +287,7 @@ out_unlock:
up_read(&dst_mm->mmap_sem);
out:
if (page)
- page_cache_release(page);
+ put_page(page);
BUG_ON(copied < 0);
BUG_ON(err > 0);
BUG_ON(!copied && !err);
diff --git a/mm/util.c b/mm/util.c
index 4fb14ca5a419..6cc81e7b8705 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -283,9 +283,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
- struct mm_struct *mm = current->mm;
- return get_user_pages_unlocked(current, mm, start, nr_pages,
- write, 0, pages);
+ return get_user_pages_unlocked(start, nr_pages, write, 0, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
@@ -396,6 +394,13 @@ int __page_mapcount(struct page *page)
}
EXPORT_SYMBOL_GPL(__page_mapcount);
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
+int sysctl_overcommit_ratio __read_mostly = 50;
+unsigned long sysctl_overcommit_kbytes __read_mostly;
+int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
+
int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -437,6 +442,123 @@ unsigned long vm_commit_limit(void)
return allowed;
}
+/*
+ * Make sure vm_committed_as in one cacheline and not cacheline shared with
+ * other variables. It can be updated by several CPUs frequently.
+ */
+struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
+
+/*
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across competing virtual machines that are hosted.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment.
+ */
+unsigned long vm_memory_committed(void)
+{
+ return percpu_counter_read_positive(&vm_committed_as);
+}
+EXPORT_SYMBOL_GPL(vm_memory_committed);
+
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ *
+ * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
+ *
+ * Note this is a helper function intended to be used by LSMs which
+ * wish to use this logic.
+ */
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+{
+ long free, allowed, reserve;
+
+ VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
+ -(s64)vm_committed_as_batch * num_online_cpus(),
+ "memory commitment underflow");
+
+ vm_acct_memory(pages);
+
+ /*
+ * Sometimes we want to use more memory than we have
+ */
+ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+ return 0;
+
+ if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
+
+ free += get_nr_swap_pages();
+
+ /*
+ * Any slabs which are created with the
+ * SLAB_RECLAIM_ACCOUNT flag claim to have contents
+ * which are reclaimable, under pressure. The dentry
+ * cache and most inode caches should fall into this
+ */
+ free += global_page_state(NR_SLAB_RECLAIMABLE);
+
+ /*
+ * Leave reserved pages. The pages are not for anonymous pages.
+ */
+ if (free <= totalreserve_pages)
+ goto error;
+ else
+ free -= totalreserve_pages;
+
+ /*
+ * Reserve some for root
+ */
+ if (!cap_sys_admin)
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+
+ if (free > pages)
+ return 0;
+
+ goto error;
+ }
+
+ allowed = vm_commit_limit();
+ /*
+ * Reserve some for root
+ */
+ if (!cap_sys_admin)
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
+ }
+
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
+ return 0;
+error:
+ vm_unacct_memory(pages);
+
+ return -ENOMEM;
+}
+
/**
* get_cmdline() - copy the cmdline value to a buffer.
* @task: the task whose cmdline value to copy.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fb42a5bffe47..ae7d20b447ff 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -469,8 +469,8 @@ overflow:
goto retry;
}
if (printk_ratelimit())
- pr_warn("vmap allocation for size %lu failed: "
- "use vmalloc=<size> to increase size.\n", size);
+ pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
+ size);
kfree(va);
return ERR_PTR(-EBUSY);
}
@@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va)
static void vmap_debug_free_range(unsigned long start, unsigned long end)
{
/*
- * Unmap page tables and force a TLB flush immediately if
- * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
- * bugs similarly to those in linear kernel virtual address
- * space after a page has been freed.
+ * Unmap page tables and force a TLB flush immediately if pagealloc
+ * debugging is enabled. This catches use after free bugs similarly to
+ * those in linear kernel virtual address space after a page has been
+ * freed.
*
- * All the lazy freeing logic is still retained, in order to
- * minimise intrusiveness of this debugging feature.
+ * All the lazy freeing logic is still retained, in order to minimise
+ * intrusiveness of this debugging feature.
*
- * This is going to be *slow* (linear kernel virtual address
- * debugging doesn't do a broadcast TLB flush so it is a lot
- * faster).
+ * This is going to be *slow* (linear kernel virtual address debugging
+ * doesn't do a broadcast TLB flush so it is a lot faster).
*/
-#ifdef CONFIG_DEBUG_PAGEALLOC
- vunmap_page_range(start, end);
- flush_tlb_kernel_range(start, end);
-#endif
+ if (debug_pagealloc_enabled()) {
+ vunmap_page_range(start, end);
+ flush_tlb_kernel_range(start, end);
+ }
}
/*
@@ -1086,7 +1085,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
- BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+ BUG_ON(!PAGE_ALIGNED(addr));
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dd984470248f..142cb61f4822 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -382,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
*
* @memcg specifies the memory cgroup to target. If it is not NULL,
* only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise all shrinkers
- * are called, and memcg aware shrinkers are supposed to scan the
- * global list then.
+ * objects from the memory cgroup specified. Otherwise, only unaware
+ * shrinkers are called.
*
* @nr_scanned and @nr_eligible form a ratio that indicate how much of
* the available objects should be scanned. Page reclaim for example
@@ -404,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
unsigned long freed = 0;
- if (memcg && !memcg_kmem_online(memcg))
+ if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
return 0;
if (nr_scanned == 0)
@@ -428,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
- if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+ /*
+ * If kernel memory accounting is disabled, we ignore
+ * SHRINKER_MEMCG_AWARE flag and call all shrinkers
+ * passing NULL for memcg.
+ */
+ if (memcg_kmem_enabled() &&
+ !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
continue;
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
@@ -633,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
*/
- if (!page_freeze_refs(page, 2))
+ if (!page_ref_freeze(page, 2))
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
if (unlikely(PageDirty(page))) {
- page_unfreeze_refs(page, 2);
+ page_ref_unfreeze(page, 2);
goto cannot_free;
}
@@ -699,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
* drops the pagecache ref for us without requiring another
* atomic operation.
*/
- page_unfreeze_refs(page, 1);
+ page_ref_unfreeze(page, 1);
return 1;
}
return 0;
@@ -2548,7 +2553,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
sc->gfp_mask |= __GFP_HIGHMEM;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
- requested_highidx, sc->nodemask) {
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
enum zone_type classzone_idx;
if (!populated_zone(zone))
@@ -2968,18 +2973,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order,
- unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
+ unsigned long balance_gap, int classzone_idx)
{
- if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
- balance_gap, classzone_idx))
- return false;
+ unsigned long mark = high_wmark_pages(zone) + balance_gap;
- if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
- order, 0, classzone_idx) == COMPACT_SKIPPED)
- return false;
+ /*
+ * When checking from pgdat_balanced(), kswapd should stop and sleep
+ * when it reaches the high order-0 watermark and let kcompactd take
+ * over. Other callers such as wakeup_kswapd() want to determine the
+ * true high-order watermark.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+ mark += (1UL << order);
+ order = 0;
+ }
- return true;
+ return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
}
/*
@@ -3029,7 +3039,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
continue;
}
- if (zone_balanced(zone, order, 0, i))
+ if (zone_balanced(zone, order, false, 0, i))
balanced_pages += zone->managed_pages;
else if (!order)
return false;
@@ -3083,10 +3093,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
*/
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
- struct scan_control *sc,
- unsigned long *nr_attempted)
+ struct scan_control *sc)
{
- int testorder = sc->order;
unsigned long balance_gap;
bool lowmem_pressure;
@@ -3094,17 +3102,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
/*
- * Kswapd reclaims only single pages with compaction enabled. Trying
- * too hard to reclaim until contiguous free pages have become
- * available can hurt performance by evicting too much useful data
- * from memory. Do not reclaim more than needed for compaction.
- */
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
- compaction_suitable(zone, sc->order, 0, classzone_idx)
- != COMPACT_SKIPPED)
- testorder = 0;
-
- /*
* We put equal pressure on every zone, unless one zone has way too
* many pages free already. The "too many pages" is defined as the
* high wmark plus a "gap" where the gap is either the low
@@ -3118,15 +3115,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
* reclaim is necessary
*/
lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
- if (!lowmem_pressure && zone_balanced(zone, testorder,
+ if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
balance_gap, classzone_idx))
return true;
shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
- /* Account for the number of pages attempted to reclaim */
- *nr_attempted += sc->nr_to_reclaim;
-
clear_bit(ZONE_WRITEBACK, &zone->flags);
/*
@@ -3136,7 +3130,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* waits.
*/
if (zone_reclaimable(zone) &&
- zone_balanced(zone, testorder, 0, classzone_idx)) {
+ zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
clear_bit(ZONE_CONGESTED, &zone->flags);
clear_bit(ZONE_DIRTY, &zone->flags);
}
@@ -3148,7 +3142,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -3165,8 +3159,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
- int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -3183,9 +3176,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
count_vm_event(PAGEOUTRUN);
do {
- unsigned long nr_attempted = 0;
bool raise_priority = true;
- bool pgdat_needs_compaction = (order > 0);
sc.nr_reclaimed = 0;
@@ -3220,7 +3211,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
break;
}
- if (!zone_balanced(zone, order, 0, 0)) {
+ if (!zone_balanced(zone, order, false, 0, 0)) {
end_zone = i;
break;
} else {
@@ -3236,24 +3227,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
if (i < 0)
goto out;
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- /*
- * If any zone is currently balanced then kswapd will
- * not call compaction as it is expected that the
- * necessary pages are already available.
- */
- if (pgdat_needs_compaction &&
- zone_watermark_ok(zone, order,
- low_wmark_pages(zone),
- *classzone_idx, 0))
- pgdat_needs_compaction = false;
- }
-
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
@@ -3297,8 +3270,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* that that high watermark would be met at 100%
* efficiency.
*/
- if (kswapd_shrink_zone(zone, end_zone,
- &sc, &nr_attempted))
+ if (kswapd_shrink_zone(zone, end_zone, &sc))
raise_priority = false;
}
@@ -3311,49 +3283,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
pfmemalloc_watermark_ok(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
- /*
- * Fragmentation may mean that the system cannot be rebalanced
- * for high-order allocations in all zones. If twice the
- * allocation size has been reclaimed and the zones are still
- * not balanced then recheck the watermarks at order-0 to
- * prevent kswapd reclaiming excessively. Assume that a
- * process requested a high-order can direct reclaim/compact.
- */
- if (order && sc.nr_reclaimed >= 2UL << order)
- order = sc.order = 0;
-
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
break;
/*
- * Compact if necessary and kswapd is reclaiming at least the
- * high watermark number of pages as requsted
- */
- if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
- compact_pgdat(pgdat, order);
-
- /*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1 &&
- !pgdat_balanced(pgdat, order, *classzone_idx));
+ !pgdat_balanced(pgdat, order, classzone_idx));
out:
/*
- * Return the order we were reclaiming at so prepare_kswapd_sleep()
- * makes a decision on the order we were last reclaiming at. However,
- * if another caller entered the allocator slow path while kswapd
- * was awake, order will remain at the higher level
+ * Return the highest zone idx we were reclaiming at so
+ * prepare_kswapd_sleep() makes the same decisions as here.
*/
- *classzone_idx = end_zone;
- return order;
+ return end_zone;
}
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+ int classzone_idx, int balanced_classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
@@ -3364,7 +3316,22 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
+ /*
+ * Compaction records what page blocks it recently failed to
+ * isolate pages from and skips them in the future scanning.
+ * When kswapd is going to sleep, it is reasonable to assume
+ * that pages and compaction may succeed so reset the cache.
+ */
+ reset_isolation_suitable(pgdat);
+
+ /*
+ * We have freed the memory, now we should compact it to make
+ * allocation of the requested order possible.
+ */
+ wakeup_kcompactd(pgdat, order, classzone_idx);
+
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3374,7 +3341,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -3387,14 +3355,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
- /*
- * Compaction records what page blocks it recently failed to
- * isolate pages from and skips them in the future scanning.
- * When kswapd is going to sleep, it is reasonable to assume
- * that pages and compaction may succeed so reset the cache.
- */
- reset_isolation_suitable(pgdat);
-
if (!kthread_should_stop())
schedule();
@@ -3424,7 +3384,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
static int kswapd(void *p)
{
unsigned long order, new_order;
- unsigned balanced_order;
int classzone_idx, new_classzone_idx;
int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
@@ -3457,24 +3416,19 @@ static int kswapd(void *p)
set_freezable();
order = new_order = 0;
- balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
bool ret;
/*
- * If the last balance_pgdat was unsuccessful it's unlikely a
- * new request of a similar or harder type will succeed soon
- * so consider going to sleep on the basis we reclaimed at
+ * While we were reclaiming, there might have been another
+ * wakeup, so check the values.
*/
- if (balanced_classzone_idx >= new_classzone_idx &&
- balanced_order == new_order) {
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = pgdat->nr_zones - 1;
- }
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
@@ -3484,7 +3438,7 @@ static int kswapd(void *p)
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, balanced_order,
+ kswapd_try_to_sleep(pgdat, order, classzone_idx,
balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
@@ -3504,9 +3458,8 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balanced_classzone_idx = classzone_idx;
- balanced_order = balance_pgdat(pgdat, order,
- &balanced_classzone_idx);
+ balanced_classzone_idx = balance_pgdat(pgdat, order,
+ classzone_idx);
}
}
@@ -3536,7 +3489,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
}
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- if (zone_balanced(zone, order, 0, 0))
+ if (zone_balanced(zone, order, true, 0, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 69ce64f7b8d7..5e4300482897 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -826,6 +826,7 @@ const char * const vmstat_text[] = {
"compact_stall",
"compact_fail",
"compact_success",
+ "compact_daemon_wake",
#endif
#ifdef CONFIG_HUGETLB_PAGE
@@ -847,6 +848,7 @@ const char * const vmstat_text[] = {
"thp_collapse_alloc_failed",
"thp_split_page",
"thp_split_page_failed",
+ "thp_deferred_split_page",
"thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
diff --git a/mm/workingset.c b/mm/workingset.c
index 6130ba0b2641..8a75f8d2916a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -349,7 +349,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
local_irq_enable();
- pages = node_present_pages(sc->nid);
+ if (memcg_kmem_enabled())
+ pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+ LRU_ALL_FILE);
+ else
+ pages = node_page_state(sc->nid, NR_ACTIVE_FILE) +
+ node_page_state(sc->nid, NR_INACTIVE_FILE);
+
/*
* Active cache pages are limited to 50% of memory, and shadow
* entries that represent a refault distance bigger than that
@@ -458,7 +464,7 @@ static struct shrinker workingset_shadow_shrinker = {
.count_objects = count_shadow_nodes,
.scan_objects = scan_shadow_nodes,
.seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};
/*
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2d7c4c11fc63..fe47fbba995a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -281,7 +281,6 @@ struct mapping_area {
#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
- bool huge;
};
static int create_handle_cache(struct zs_pool *pool)
@@ -495,6 +494,8 @@ static void __exit zs_stat_exit(void)
debugfs_remove_recursive(zs_stat_root);
}
+static unsigned long zs_can_compact(struct size_class *class);
+
static int zs_stats_size_show(struct seq_file *s, void *v)
{
int i;
@@ -502,14 +503,15 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
struct size_class *class;
int objs_per_zspage;
unsigned long class_almost_full, class_almost_empty;
- unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long obj_allocated, obj_used, pages_used, freeable;
unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+ unsigned long total_freeable = 0;
- seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
"class", "size", "almost_full", "almost_empty",
"obj_allocated", "obj_used", "pages_used",
- "pages_per_zspage");
+ "pages_per_zspage", "freeable");
for (i = 0; i < zs_size_classes; i++) {
class = pool->size_class[i];
@@ -522,6 +524,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
obj_used = zs_stat_get(class, OBJ_USED);
+ freeable = zs_can_compact(class);
spin_unlock(&class->lock);
objs_per_zspage = get_maxobj_per_zspage(class->size,
@@ -529,23 +532,25 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
pages_used = obj_allocated / objs_per_zspage *
class->pages_per_zspage;
- seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu"
+ " %10lu %10lu %16d %8lu\n",
i, class->size, class_almost_full, class_almost_empty,
obj_allocated, obj_used, pages_used,
- class->pages_per_zspage);
+ class->pages_per_zspage, freeable);
total_class_almost_full += class_almost_full;
total_class_almost_empty += class_almost_empty;
total_objs += obj_allocated;
total_used_objs += obj_used;
total_pages += pages_used;
+ total_freeable += freeable;
}
seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
"Total", "", total_class_almost_full,
total_class_almost_empty, total_objs,
- total_used_objs, total_pages);
+ total_used_objs, total_pages, "", total_freeable);
return 0;
}
@@ -1127,11 +1132,9 @@ static void __zs_unmap_object(struct mapping_area *area,
goto out;
buf = area->vm_buf;
- if (!area->huge) {
- buf = buf + ZS_HANDLE_SIZE;
- size -= ZS_HANDLE_SIZE;
- off += ZS_HANDLE_SIZE;
- }
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
sizes[0] = PAGE_SIZE - off;
sizes[1] = size - sizes[0];
@@ -1732,10 +1735,13 @@ static struct page *isolate_source_page(struct size_class *class)
static unsigned long zs_can_compact(struct size_class *class)
{
unsigned long obj_wasted;
+ unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ unsigned long obj_used = zs_stat_get(class, OBJ_USED);
- obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
- zs_stat_get(class, OBJ_USED);
+ if (obj_allocated <= obj_used)
+ return 0;
+ obj_wasted = obj_allocated - obj_used;
obj_wasted /= get_maxobj_per_zspage(class->size,
class->pages_per_zspage);
diff --git a/mm/zswap.c b/mm/zswap.c
index bf14508afd64..de0f119b1780 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -170,6 +170,8 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
+/* pool counter to provide unique names to zpool */
+static atomic_t zswap_pools_count = ATOMIC_INIT(0);
/* used by param callback function */
static bool zswap_init_started;
@@ -565,6 +567,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
struct zswap_pool *pool;
+ char name[38]; /* 'zswap' + 32 char (max) num + \0 */
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
@@ -573,7 +576,10 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
return NULL;
}
- pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops);
+ /* unique name for each pool specifically required by zsmalloc */
+ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
+
+ pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
if (!pool->zpool) {
pr_err("%s zpool not available\n", type);
goto error;
@@ -869,7 +875,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
case ZSWAP_SWAPCACHE_EXIST:
/* page is already in the swap cache, ignore for now */
- page_cache_release(page);
+ put_page(page);
ret = -EEXIST;
goto fail;
@@ -897,7 +903,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
/* start writeback */
__swap_writepage(page, &wbc, end_swap_bio_write);
- page_cache_release(page);
+ put_page(page);
zswap_written_back_pages++;
spin_lock(&tree->lock);
OpenPOWER on IntegriCloud