summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig.debug6
-rw-r--r--mm/backing-dev.c15
-rw-r--r--mm/gup.c4
-rw-r--r--mm/hmm.c7
-rw-r--r--mm/huge_memory.c31
-rw-r--r--mm/hugetlb.c40
-rw-r--r--mm/internal.h14
-rw-r--r--mm/khugepaged.c3
-rw-r--r--mm/ksm.c8
-rw-r--r--mm/memcontrol.c243
-rw-r--r--mm/memory-failure.c46
-rw-r--r--mm/memory.c151
-rw-r--r--mm/memory_hotplug.c16
-rw-r--r--mm/mempolicy.c5
-rw-r--r--mm/mempool.c1
-rw-r--r--mm/migrate.c7
-rw-r--r--mm/mm_init.c9
-rw-r--r--mm/mmap.c4
-rw-r--r--mm/mmu_notifier.c19
-rw-r--r--mm/oom_kill.c219
-rw-r--r--mm/page_alloc.c180
-rw-r--r--mm/percpu.c29
-rw-r--r--mm/shmem.c6
-rw-r--r--mm/swap_slots.c8
-rw-r--r--mm/swapfile.c193
-rw-r--r--mm/util.c9
-rw-r--r--mm/vmscan.c5
27 files changed, 752 insertions, 526 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index e5e606ee5f71..9a7b8b049d04 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -46,7 +46,8 @@ config PAGE_POISONING
Fill the pages with poison patterns after free_pages() and verify
the patterns before alloc_pages. The filling of the memory helps
reduce the risk of information leaks from freed data. This does
- have a potential performance impact.
+ have a potential performance impact if enabled with the
+ "page_poison=1" kernel boot option.
Note that "poison" here is not the same thing as the "HWPoison"
for CONFIG_MEMORY_FAILURE. This is software poisoning only.
@@ -65,7 +66,7 @@ config PAGE_POISONING_NO_SANITY
say N.
config PAGE_POISONING_ZERO
- bool "Use zero for poisoning instead of random data"
+ bool "Use zero for poisoning instead of debugging value"
depends on PAGE_POISONING
---help---
Instead of using the existing poison value, fill the pages with
@@ -75,7 +76,6 @@ config PAGE_POISONING_ZERO
allocation.
If unsure, say N
- bool
config DEBUG_PAGE_REF
bool "Enable tracepoint to track down page reference manipulation"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 2e5d3df0853d..f5981e9d6ae2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -438,10 +438,10 @@ retry:
if (new_congested) {
/* !found and storage for new one already allocated, insert */
congested = new_congested;
- new_congested = NULL;
rb_link_node(&congested->rb_node, parent, node);
rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
- goto found;
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ return congested;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -451,13 +451,13 @@ retry:
if (!new_congested)
return NULL;
- atomic_set(&new_congested->refcnt, 0);
+ refcount_set(&new_congested->refcnt, 1);
new_congested->__bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;
found:
- atomic_inc(&congested->refcnt);
+ refcount_inc(&congested->refcnt);
spin_unlock_irqrestore(&cgwb_lock, flags);
kfree(new_congested);
return congested;
@@ -473,11 +473,8 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
{
unsigned long flags;
- local_irq_save(flags);
- if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
- local_irq_restore(flags);
+ if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags))
return;
- }
/* bdi might already have been destroyed leaving @congested unlinked */
if (congested->__bdi) {
@@ -804,7 +801,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
if (!bdi->wb_congested)
return -ENOMEM;
- atomic_set(&bdi->wb_congested->refcnt, 1);
+ refcount_set(&bdi->wb_congested->refcnt, 1);
err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (err) {
diff --git a/mm/gup.c b/mm/gup.c
index fc5f98069f4e..1abc8b4afff6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -497,7 +497,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned long address, unsigned int *flags, int *nonblocking)
{
unsigned int fault_flags = 0;
- int ret;
+ vm_fault_t ret;
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
@@ -818,7 +818,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
bool *unlocked)
{
struct vm_area_struct *vma;
- int ret, major = 0;
+ vm_fault_t ret, major = 0;
if (unlocked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
diff --git a/mm/hmm.c b/mm/hmm.c
index 76e7a058b32f..0b0554591610 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
up_write(&hmm->mirrors_sem);
}
-static void hmm_invalidate_range_start(struct mmu_notifier *mn,
+static int hmm_invalidate_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
- unsigned long end)
+ unsigned long end,
+ bool blockable)
{
struct hmm *hmm = mm->hmm;
VM_BUG_ON(!hmm);
atomic_inc(&hmm->sequence);
+
+ return 0;
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 78427af91de9..08b544383d74 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -541,14 +541,14 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
- gfp_t gfp)
+static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
+ struct page *page, gfp_t gfp)
{
struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int ret = 0;
+ vm_fault_t ret = 0;
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -584,15 +584,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
/* Deliver the page fault to userland */
if (userfaultfd_missing(vma)) {
- int ret;
+ vm_fault_t ret2;
spin_unlock(vmf->ptl);
mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
- ret = handle_userfault(vmf, VM_UFFD_MISSING);
- VM_BUG_ON(ret & VM_FAULT_FALLBACK);
- return ret;
+ ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
+ VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
+ return ret2;
}
entry = mk_huge_pmd(page, vma->vm_page_prot);
@@ -663,7 +663,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return true;
}
-int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
gfp_t gfp;
@@ -682,7 +682,7 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
pgtable_t pgtable;
struct page *zero_page;
bool set;
- int ret;
+ vm_fault_t ret;
pgtable = pte_alloc_one(vma->vm_mm, haddr);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
@@ -1118,15 +1118,16 @@ unlock:
spin_unlock(vmf->ptl);
}
-static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
- struct page *page)
+static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
+ pmd_t orig_pmd, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
struct mem_cgroup *memcg;
pgtable_t pgtable;
pmd_t _pmd;
- int ret = 0, i;
+ int i;
+ vm_fault_t ret = 0;
struct page **pages;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
@@ -1236,7 +1237,7 @@ out_free_pages:
goto out;
}
-int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *new_page;
@@ -1245,7 +1246,7 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
gfp_t huge_gfp; /* for allocation and charge */
- int ret = 0;
+ vm_fault_t ret = 0;
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1457,7 +1458,7 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
{
struct vm_area_struct *vma = vmf->vma;
struct anon_vma *anon_vma = NULL;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 47566bb0b4b1..3c21775f196b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1479,22 +1479,20 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
/*
* Dissolve a given free hugepage into free buddy pages. This function does
* nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
- * number of free hugepages would be reduced below the number of reserved
- * hugepages.
+ * dissolution fails because a give page is not a free hugepage, or because
+ * free hugepages are fully reserved.
*/
int dissolve_free_huge_page(struct page *page)
{
- int rc = 0;
+ int rc = -EBUSY;
spin_lock(&hugetlb_lock);
if (PageHuge(page) && !page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
int nid = page_to_nid(head);
- if (h->free_huge_pages - h->resv_huge_pages == 0) {
- rc = -EBUSY;
+ if (h->free_huge_pages - h->resv_huge_pages == 0)
goto out;
- }
/*
* Move PageHWPoison flag from head page to the raw error page,
* which makes any subpages rather than the error page reusable.
@@ -1508,6 +1506,7 @@ int dissolve_free_huge_page(struct page *page)
h->free_huge_pages_node[nid]--;
h->max_huge_pages--;
update_and_free_page(h, head);
+ rc = 0;
}
out:
spin_unlock(&hugetlb_lock);
@@ -3502,14 +3501,15 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
-static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
+static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
struct page *pagecache_page, spinlock_t *ptl)
{
pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
- int ret = 0, outside_reserve = 0;
+ int outside_reserve = 0;
+ vm_fault_t ret = 0;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
unsigned long haddr = address & huge_page_mask(h);
@@ -3573,8 +3573,7 @@ retry_avoidcopy:
return 0;
}
- ret = (PTR_ERR(new_page) == -ENOMEM) ?
- VM_FAULT_OOM : VM_FAULT_SIGBUS;
+ ret = vmf_error(PTR_ERR(new_page));
goto out_release_old;
}
@@ -3677,12 +3676,13 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
return 0;
}
-static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
- struct address_space *mapping, pgoff_t idx,
- unsigned long address, pte_t *ptep, unsigned int flags)
+static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping, pgoff_t idx,
+ unsigned long address, pte_t *ptep, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
- int ret = VM_FAULT_SIGBUS;
+ vm_fault_t ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
unsigned long size;
struct page *page;
@@ -3745,11 +3745,7 @@ retry:
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
- ret = PTR_ERR(page);
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_SIGBUS;
+ ret = vmf_error(PTR_ERR(page));
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -3873,12 +3869,12 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
}
#endif
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
{
pte_t *ptep, entry;
spinlock_t *ptl;
- int ret;
+ vm_fault_t ret;
u32 hash;
pgoff_t idx;
struct page *page = NULL;
@@ -4208,7 +4204,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (absent || is_swap_pte(huge_ptep_get(pte)) ||
((flags & FOLL_WRITE) &&
!huge_pte_write(huge_ptep_get(pte)))) {
- int ret;
+ vm_fault_t ret;
unsigned int fault_flags = 0;
if (pte)
diff --git a/mm/internal.h b/mm/internal.h
index 9e3654d70289..87256ae1bef8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,7 +38,7 @@
void page_writeback_init(void);
-int do_swap_page(struct vm_fault *vmf);
+vm_fault_t do_swap_page(struct vm_fault *vmf);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
@@ -389,18 +389,6 @@ static inline struct page *mem_map_next(struct page *iter,
return iter + 1;
}
-/*
- * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
- * so all functions starting at paging_init should be marked __init
- * in those cases. SPARSEMEM, however, allows for memory hotplug,
- * and alloc_bootmem_node is not used.
- */
-#ifdef CONFIG_SPARSEMEM
-#define __paginginit __meminit
-#else
-#define __paginginit __init
-#endif
-
/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 961cbe9062a5..a31d740e6cd1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -880,7 +880,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
unsigned long address, pmd_t *pmd,
int referenced)
{
- int swapped_in = 0, ret = 0;
+ int swapped_in = 0;
+ vm_fault_t ret = 0;
struct vm_fault vmf = {
.vma = vma,
.address = address,
diff --git a/mm/ksm.c b/mm/ksm.c
index 2621be57bd95..5b0894b45ee5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -652,9 +652,9 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
* list_head to stay clear from the rb_parent_color union
* (aligned and different than any node) and also different
* from &migrate_nodes. This will verify that future list.h changes
- * don't break STABLE_NODE_DUP_HEAD.
+ * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
*/
-#if GCC_VERSION >= 40903 /* only recent gcc can handle it */
+#if defined(GCC_VERSION) && GCC_VERSION >= 40903
BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
#endif
@@ -703,7 +703,7 @@ again:
* We cannot do anything with the page while its refcount is 0.
* Usually 0 means free, or tail of a higher-order page: in which
* case this node is no longer referenced, and should be freed;
- * however, it might mean that the page is under page_freeze_refs().
+ * however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
* but if page is swapcache in migrate_page_move_mapping(), it might
* still be our page, in which case it's essential to keep the node.
@@ -714,7 +714,7 @@ again:
* work here too. We have chosen the !PageSwapCache test to
* optimize the common case, when the page is or is about to
* be freed: PageSwapCache is cleared (under spin_lock_irq)
- * in the freeze_refs section of __remove_mapping(); but Anon
+ * in the ref_freeze section of __remove_mapping(); but Anon
* page->mapping reset to NULL later, in free_pages_prepare().
*/
if (!PageSwapCache(page))
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6a921890739f..4ead5a4817de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1777,6 +1777,62 @@ cleanup:
}
/**
+ * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
+ * @victim: task to be killed by the OOM killer
+ * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
+ *
+ * Returns a pointer to a memory cgroup, which has to be cleaned up
+ * by killing all belonging OOM-killable tasks.
+ *
+ * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
+ */
+struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
+ struct mem_cgroup *oom_domain)
+{
+ struct mem_cgroup *oom_group = NULL;
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return NULL;
+
+ if (!oom_domain)
+ oom_domain = root_mem_cgroup;
+
+ rcu_read_lock();
+
+ memcg = mem_cgroup_from_task(victim);
+ if (memcg == root_mem_cgroup)
+ goto out;
+
+ /*
+ * Traverse the memory cgroup hierarchy from the victim task's
+ * cgroup up to the OOMing cgroup (or root) to find the
+ * highest-level memory cgroup with oom.group set.
+ */
+ for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+ if (memcg->oom_group)
+ oom_group = memcg;
+
+ if (memcg == oom_domain)
+ break;
+ }
+
+ if (oom_group)
+ css_get(&oom_group->css);
+out:
+ rcu_read_unlock();
+
+ return oom_group;
+}
+
+void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
+{
+ pr_info("Tasks in ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont(" are going to be killed due to memory.oom.group set\n");
+}
+
+/**
* lock_page_memcg - lock a page->mem_cgroup binding
* @page: the page
*
@@ -2899,29 +2955,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
-static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
-{
- struct mem_cgroup *iter;
- int i;
-
- memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
-
- for_each_mem_cgroup_tree(iter, memcg) {
- for (i = 0; i < MEMCG_NR_STAT; i++)
- stat[i] += memcg_page_state(iter, i);
- }
-}
+struct accumulated_stats {
+ unsigned long stat[MEMCG_NR_STAT];
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ unsigned long lru_pages[NR_LRU_LISTS];
+ const unsigned int *stats_array;
+ const unsigned int *events_array;
+ int stats_size;
+ int events_size;
+};
-static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
+static void accumulate_memcg_tree(struct mem_cgroup *memcg,
+ struct accumulated_stats *acc)
{
- struct mem_cgroup *iter;
+ struct mem_cgroup *mi;
int i;
- memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
+ for_each_mem_cgroup_tree(mi, memcg) {
+ for (i = 0; i < acc->stats_size; i++)
+ acc->stat[i] += memcg_page_state(mi,
+ acc->stats_array ? acc->stats_array[i] : i);
- for_each_mem_cgroup_tree(iter, memcg) {
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
- events[i] += memcg_sum_events(iter, i);
+ for (i = 0; i < acc->events_size; i++)
+ acc->events[i] += memcg_sum_events(mi,
+ acc->events_array ? acc->events_array[i] : i);
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ acc->lru_pages[i] +=
+ mem_cgroup_nr_lru_pages(mi, BIT(i));
}
}
@@ -3332,6 +3393,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
+ struct accumulated_stats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3364,32 +3426,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
- unsigned long long val = 0;
+ memset(&acc, 0, sizeof(acc));
+ acc.stats_size = ARRAY_SIZE(memcg1_stats);
+ acc.stats_array = memcg1_stats;
+ acc.events_size = ARRAY_SIZE(memcg1_events);
+ acc.events_array = memcg1_events;
+ accumulate_memcg_tree(memcg, &acc);
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
- for_each_mem_cgroup_tree(mi, memcg)
- val += memcg_page_state(mi, memcg1_stats[i]) *
- PAGE_SIZE;
- seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
+ seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+ (u64)acc.stat[i] * PAGE_SIZE);
}
- for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
- unsigned long long val = 0;
-
- for_each_mem_cgroup_tree(mi, memcg)
- val += memcg_sum_events(mi, memcg1_events[i]);
- seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
- }
-
- for (i = 0; i < NR_LRU_LISTS; i++) {
- unsigned long long val = 0;
+ for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+ seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
+ (u64)acc.events[i]);
- for_each_mem_cgroup_tree(mi, memcg)
- val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
- seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
- }
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
+ (u64)acc.lru_pages[i] * PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -5486,8 +5543,7 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long stat[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
+ struct accumulated_stats acc;
int i;
/*
@@ -5501,70 +5557,97 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
- tree_stat(memcg, stat);
- tree_events(memcg, events);
+ memset(&acc, 0, sizeof(acc));
+ acc.stats_size = MEMCG_NR_STAT;
+ acc.events_size = NR_VM_EVENT_ITEMS;
+ accumulate_memcg_tree(memcg, &acc);
seq_printf(m, "anon %llu\n",
- (u64)stat[MEMCG_RSS] * PAGE_SIZE);
+ (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n",
- (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
+ (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
seq_printf(m, "kernel_stack %llu\n",
- (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
+ (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
seq_printf(m, "slab %llu\n",
- (u64)(stat[NR_SLAB_RECLAIMABLE] +
- stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+ (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
+ acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
- (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+ (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "shmem %llu\n",
- (u64)stat[NR_SHMEM] * PAGE_SIZE);
+ (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
- (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
+ (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
- (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
+ (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
- (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
-
- for (i = 0; i < NR_LRU_LISTS; i++) {
- struct mem_cgroup *mi;
- unsigned long val = 0;
+ (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
- for_each_mem_cgroup_tree(mi, memcg)
- val += mem_cgroup_nr_lru_pages(mi, BIT(i));
- seq_printf(m, "%s %llu\n",
- mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
- }
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
+ (u64)acc.lru_pages[i] * PAGE_SIZE);
seq_printf(m, "slab_reclaimable %llu\n",
- (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+ (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
/* Accumulated memory events */
- seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
- seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+ seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
+ seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
- seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
- seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
- events[PGSCAN_DIRECT]);
- seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
- events[PGSTEAL_DIRECT]);
- seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
- seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
- seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
- seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
+ seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
+ seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
+ acc.events[PGSCAN_DIRECT]);
+ seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
+ acc.events[PGSTEAL_DIRECT]);
+ seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
+ seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
+ seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
+ seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
seq_printf(m, "workingset_refault %lu\n",
- stat[WORKINGSET_REFAULT]);
+ acc.stat[WORKINGSET_REFAULT]);
seq_printf(m, "workingset_activate %lu\n",
- stat[WORKINGSET_ACTIVATE]);
+ acc.stat[WORKINGSET_ACTIVATE]);
seq_printf(m, "workingset_nodereclaim %lu\n",
- stat[WORKINGSET_NODERECLAIM]);
+ acc.stat[WORKINGSET_NODERECLAIM]);
+
+ return 0;
+}
+
+static int memory_oom_group_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ seq_printf(m, "%d\n", memcg->oom_group);
return 0;
}
+static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ int ret, oom_group;
+
+ buf = strstrip(buf);
+ if (!buf)
+ return -EINVAL;
+
+ ret = kstrtoint(buf, 0, &oom_group);
+ if (ret)
+ return ret;
+
+ if (oom_group != 0 && oom_group != 1)
+ return -EINVAL;
+
+ memcg->oom_group = oom_group;
+
+ return nbytes;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -5606,6 +5689,12 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
+ {
+ .name = "oom.group",
+ .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
+ .seq_show = memory_oom_group_show,
+ .write = memory_oom_group_write,
+ },
{ } /* terminate */
};
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9d142b9b86dc..192d0bbfc9ea 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -57,6 +57,7 @@
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include <linux/ratelimit.h>
+#include <linux/page-isolation.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -1167,7 +1168,7 @@ int memory_failure(unsigned long pfn, int flags)
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
* In fact it's dangerous to directly bump up page count from 0,
- * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+ * that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
*/
if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
@@ -1598,8 +1599,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
- if (PageHuge(page))
- dissolve_free_huge_page(page);
+ /*
+ * We set PG_hwpoison only when the migration source hugepage
+ * was successfully dissolved, because otherwise hwpoisoned
+ * hugepage remains on free hugepage list, then userspace will
+ * find it as SIGBUS by allocation failure. That's not expected
+ * in soft-offlining.
+ */
+ ret = dissolve_free_huge_page(page);
+ if (!ret) {
+ if (set_hwpoison_free_buddy_page(page))
+ num_poisoned_pages_inc();
+ }
}
return ret;
}
@@ -1687,6 +1698,7 @@ static int __soft_offline_page(struct page *page, int flags)
static int soft_offline_in_use_page(struct page *page, int flags)
{
int ret;
+ int mt;
struct page *hpage = compound_head(page);
if (!PageHuge(page) && PageTransHuge(hpage)) {
@@ -1705,23 +1717,37 @@ static int soft_offline_in_use_page(struct page *page, int flags)
put_hwpoison_page(hpage);
}
+ /*
+ * Setting MIGRATE_ISOLATE here ensures that the page will be linked
+ * to free list immediately (not via pcplist) when released after
+ * successful page migration. Otherwise we can't guarantee that the
+ * page is really free after put_page() returns, so
+ * set_hwpoison_free_buddy_page() highly likely fails.
+ */
+ mt = get_pageblock_migratetype(page);
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
if (PageHuge(page))
ret = soft_offline_huge_page(page, flags);
else
ret = __soft_offline_page(page, flags);
-
+ set_pageblock_migratetype(page, mt);
return ret;
}
-static void soft_offline_free_page(struct page *page)
+static int soft_offline_free_page(struct page *page)
{
+ int rc = 0;
struct page *head = compound_head(page);
- if (!TestSetPageHWPoison(head)) {
- num_poisoned_pages_inc();
- if (PageHuge(head))
- dissolve_free_huge_page(page);
+ if (PageHuge(head))
+ rc = dissolve_free_huge_page(page);
+ if (!rc) {
+ if (set_hwpoison_free_buddy_page(page))
+ num_poisoned_pages_inc();
+ else
+ rc = -EBUSY;
}
+ return rc;
}
/**
@@ -1765,7 +1791,7 @@ int soft_offline_page(struct page *page, int flags)
if (ret > 0)
ret = soft_offline_in_use_page(page, flags);
else if (ret == 0)
- soft_offline_free_page(page);
+ ret = soft_offline_free_page(page);
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 19f47d7b9b86..83aef222f11b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -238,23 +238,13 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
__tlb_reset_range(tlb);
}
-static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
+static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
- if (!tlb->end)
- return;
+ struct mmu_gather_batch *batch;
- tlb_flush(tlb);
- mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
#endif
- __tlb_reset_range(tlb);
-}
-
-static void tlb_flush_mmu_free(struct mmu_gather *tlb)
-{
- struct mmu_gather_batch *batch;
-
for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
batch->nr = 0;
@@ -326,20 +316,31 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-static void tlb_remove_table_smp_sync(void *arg)
+/*
+ * See the comment near struct mmu_table_batch.
+ */
+
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
- struct mm_struct __maybe_unused *mm = arg;
+#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
/*
- * On most architectures this does nothing. Simply delivering the
- * interrupt is enough to prevent races with software page table
- * walking like that done in get_user_pages_fast.
- *
- * See the comment near struct mmu_table_batch.
+ * Invalidate page-table caches used by hardware walkers. Then we still
+ * need to RCU-sched wait while freeing the pages because software
+ * walkers can still be in-flight.
*/
- tlb_flush_remove_tables_local(mm);
+ tlb_flush_mmu_tlbonly(tlb);
+#endif
}
-static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
+static void tlb_remove_table_smp_sync(void *arg)
+{
+ /* Simply deliver the interrupt */
+}
+
+static void tlb_remove_table_one(void *table)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
@@ -348,7 +349,7 @@ static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
* It is however sufficient for software page-table walkers that rely on
* IRQ disabling. See the comment near struct mmu_table_batch.
*/
- smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
+ smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
__tlb_remove_table(table);
}
@@ -369,9 +370,8 @@ void tlb_table_flush(struct mmu_gather *tlb)
{
struct mmu_table_batch **batch = &tlb->batch;
- tlb_flush_remove_tables(tlb->mm);
-
if (*batch) {
+ tlb_table_invalidate(tlb);
call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
*batch = NULL;
}
@@ -381,23 +381,16 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
struct mmu_table_batch **batch = &tlb->batch;
- /*
- * When there's less then two users of this mm there cannot be a
- * concurrent page-table walk.
- */
- if (atomic_read(&tlb->mm->mm_users) < 2) {
- __tlb_remove_table(table);
- return;
- }
-
if (*batch == NULL) {
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
if (*batch == NULL) {
- tlb_remove_table_one(table, tlb);
+ tlb_table_invalidate(tlb);
+ tlb_remove_table_one(table);
return;
}
(*batch)->nr = 0;
}
+
(*batch)->tables[(*batch)->nr++] = table;
if ((*batch)->nr == MAX_TABLE_BATCH)
tlb_table_flush(tlb);
@@ -2384,9 +2377,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
*
* We do this without the lock held, so that it can sleep if it needs to.
*/
-static int do_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
{
- int ret;
+ vm_fault_t ret;
struct page *page = vmf->page;
unsigned int old_flags = vmf->flags;
@@ -2490,7 +2483,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
* held to the old page, as well as updating the rmap.
* - In any case, unlock the PTL and drop the reference we took to the old page.
*/
-static int wp_page_copy(struct vm_fault *vmf)
+static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -2638,7 +2631,7 @@ oom:
* The function expects the page to be locked or other protection against
* concurrent faults / writeback (such as DAX radix tree locks).
*/
-int finish_mkwrite_fault(struct vm_fault *vmf)
+vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
@@ -2659,12 +2652,12 @@ int finish_mkwrite_fault(struct vm_fault *vmf)
* Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
* mapping
*/
-static int wp_pfn_shared(struct vm_fault *vmf)
+static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
- int ret;
+ vm_fault_t ret;
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
@@ -2677,7 +2670,7 @@ static int wp_pfn_shared(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
-static int wp_page_shared(struct vm_fault *vmf)
+static vm_fault_t wp_page_shared(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
@@ -2685,7 +2678,7 @@ static int wp_page_shared(struct vm_fault *vmf)
get_page(vmf->page);
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
- int tmp;
+ vm_fault_t tmp;
pte_unmap_unlock(vmf->pte, vmf->ptl);
tmp = do_page_mkwrite(vmf);
@@ -2728,7 +2721,7 @@ static int wp_page_shared(struct vm_fault *vmf)
* but allow concurrent faults), with pte both mapped and locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_wp_page(struct vm_fault *vmf)
+static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
@@ -2904,7 +2897,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
* We return with the mmap_sem locked or unlocked in the same cases
* as does filemap_fault().
*/
-int do_swap_page(struct vm_fault *vmf)
+vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
@@ -2913,7 +2906,7 @@ int do_swap_page(struct vm_fault *vmf)
pte_t pte;
int locked;
int exclusive = 0;
- int ret = 0;
+ vm_fault_t ret = 0;
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
@@ -3124,12 +3117,12 @@ out_release:
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static int do_anonymous_page(struct vm_fault *vmf)
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mem_cgroup *memcg;
struct page *page;
- int ret = 0;
+ vm_fault_t ret = 0;
pte_t entry;
/* File mapping without ->vm_ops ? */
@@ -3239,10 +3232,10 @@ oom:
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
-static int __do_fault(struct vm_fault *vmf)
+static vm_fault_t __do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- int ret;
+ vm_fault_t ret;
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
@@ -3276,7 +3269,7 @@ static int pmd_devmap_trans_unstable(pmd_t *pmd)
return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
}
-static int pte_alloc_one_map(struct vm_fault *vmf)
+static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -3352,13 +3345,14 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}
-static int do_set_pmd(struct vm_fault *vmf, struct page *page)
+static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
- int i, ret;
+ int i;
+ vm_fault_t ret;
if (!transhuge_vma_suitable(vma, haddr))
return VM_FAULT_FALLBACK;
@@ -3408,7 +3402,7 @@ out:
return ret;
}
#else
-static int do_set_pmd(struct vm_fault *vmf, struct page *page)
+static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
BUILD_BUG();
return 0;
@@ -3429,13 +3423,13 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page)
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
*/
-int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
+vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
pte_t entry;
- int ret;
+ vm_fault_t ret;
if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
@@ -3494,10 +3488,10 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
* The function expects the page to be locked and on success it consumes a
* reference of a page being mapped (for the PTE which maps it).
*/
-int finish_fault(struct vm_fault *vmf)
+vm_fault_t finish_fault(struct vm_fault *vmf)
{
struct page *page;
- int ret = 0;
+ vm_fault_t ret = 0;
/* Did we COW the page? */
if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3583,12 +3577,13 @@ late_initcall(fault_around_debugfs);
* (and therefore to page order). This way it's easier to guarantee
* that we don't cross page table boundaries.
*/
-static int do_fault_around(struct vm_fault *vmf)
+static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
unsigned long address = vmf->address, nr_pages, mask;
pgoff_t start_pgoff = vmf->pgoff;
pgoff_t end_pgoff;
- int off, ret = 0;
+ int off;
+ vm_fault_t ret = 0;
nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
@@ -3638,10 +3633,10 @@ out:
return ret;
}
-static int do_read_fault(struct vm_fault *vmf)
+static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- int ret = 0;
+ vm_fault_t ret = 0;
/*
* Let's call ->map_pages() first and use ->fault() as fallback
@@ -3665,10 +3660,10 @@ static int do_read_fault(struct vm_fault *vmf)
return ret;
}
-static int do_cow_fault(struct vm_fault *vmf)
+static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- int ret;
+ vm_fault_t ret;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
@@ -3704,10 +3699,10 @@ uncharge_out:
return ret;
}
-static int do_shared_fault(struct vm_fault *vmf)
+static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- int ret, tmp;
+ vm_fault_t ret, tmp;
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3745,10 +3740,10 @@ static int do_shared_fault(struct vm_fault *vmf)
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_fault(struct vm_fault *vmf)
+static vm_fault_t do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- int ret;
+ vm_fault_t ret;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
@@ -3783,7 +3778,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
return mpol_misplaced(page, vma, addr);
}
-static int do_numa_page(struct vm_fault *vmf)
+static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
@@ -3873,7 +3868,7 @@ out:
return 0;
}
-static inline int create_huge_pmd(struct vm_fault *vmf)
+static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_anonymous_page(vmf);
@@ -3883,7 +3878,7 @@ static inline int create_huge_pmd(struct vm_fault *vmf)
}
/* `inline' is required to avoid gcc 4.1.2 build error */
-static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_wp_page(vmf, orig_pmd);
@@ -3902,7 +3897,7 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
}
-static int create_huge_pud(struct vm_fault *vmf)
+static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* No support for anonymous transparent PUD pages yet */
@@ -3914,7 +3909,7 @@ static int create_huge_pud(struct vm_fault *vmf)
return VM_FAULT_FALLBACK;
}
-static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* No support for anonymous transparent PUD pages yet */
@@ -3941,7 +3936,7 @@ static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
* The mmap_sem may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
-static int handle_pte_fault(struct vm_fault *vmf)
+static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
@@ -4029,8 +4024,8 @@ unlock:
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
+ unsigned long address, unsigned int flags)
{
struct vm_fault vmf = {
.vma = vma,
@@ -4043,7 +4038,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
- int ret;
+ vm_fault_t ret;
pgd = pgd_offset(mm, address);
p4d = p4d_alloc(mm, pgd, address);
@@ -4118,10 +4113,10 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
- int ret;
+ vm_fault_t ret;
__set_current_state(TASK_RUNNING);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 4eb6e824a80c..9eea6e809a4e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -982,8 +982,6 @@ static void reset_node_present_pages(pg_data_t *pgdat)
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
struct pglist_data *pgdat;
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
unsigned long start_pfn = PFN_DOWN(start);
pgdat = NODE_DATA(nid);
@@ -1006,8 +1004,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
/* we can use NODE_DATA(nid) from here */
+ pgdat->node_id = nid;
+ pgdat->node_start_pfn = start_pfn;
+
/* init node's zones as empty zones, we don't have any present pages.*/
- free_area_init_node(nid, zones_size, start_pfn, zholes_size);
+ free_area_init_core_hotplug(nid);
pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
/*
@@ -1017,18 +1018,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
build_all_zonelists(pgdat);
/*
- * zone->managed_pages is set to an approximate value in
- * free_area_init_core(), which will cause
- * /sys/device/system/node/nodeX/meminfo has wrong data.
- * So reset it to 0 before any memory is onlined.
- */
- reset_node_managed_pages(pgdat);
-
- /*
* When memory is hot-added, all the memory is in offline state. So
* clear all zones' present_pages because they will be updated in
* online_pages() and offline_pages().
*/
+ reset_node_managed_pages(pgdat);
reset_node_present_pages(pgdat);
return pgdat;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 01f1a14facc4..da858f794eb6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1784,7 +1784,7 @@ unsigned int mempolicy_slab_node(void)
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes);
- return z->zone ? z->zone->node : node;
+ return z->zone ? zone_to_nid(z->zone) : node;
}
default:
@@ -2326,7 +2326,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->v.nodes);
- polnid = z->zone->node;
+ polnid = zone_to_nid(z->zone);
break;
default:
@@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
goto put_new;
/* Create pseudo-vma that contains just the policy */
- memset(&pvma, 0, sizeof(struct vm_area_struct));
vma_init(&pvma, NULL);
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
diff --git a/mm/mempool.c b/mm/mempool.c
index 44f5fa98c1e7..0ef8cc8d1602 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -213,6 +213,7 @@ EXPORT_SYMBOL(mempool_init_node);
/**
* mempool_init - initialize a memory pool
+ * @pool: pointer to the memory pool that should be initialized
* @min_nr: the minimum number of elements guaranteed to be
* allocated for this pool.
* @alloc_fn: user-defined element-allocation function.
diff --git a/mm/migrate.c b/mm/migrate.c
index 4a83268e23c2..d6a2e89b086a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1131,7 +1131,8 @@ out:
* gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
* around it.
*/
-#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
+#if defined(CONFIG_ARM) && \
+ defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
#define ICE_noinline noinline
#else
#define ICE_noinline
@@ -1211,7 +1212,7 @@ out:
* intentionally. Although it's rather weird,
* it's how HWPoison flag works at the moment.
*/
- if (!test_set_page_hwpoison(page))
+ if (set_hwpoison_free_buddy_page(page))
num_poisoned_pages_inc();
}
} else {
@@ -1330,8 +1331,6 @@ put_anon:
out:
if (rc != -EAGAIN)
putback_active_hugepage(hpage);
- if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage))
- num_poisoned_pages_inc();
/*
* If migration was not successful and there's a freeing callback, use
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5b72266b4b03..6838a530789b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -53,13 +53,8 @@ void __init mminit_verify_zonelist(void)
zone->name);
/* Iterate the zonelist */
- for_each_zone_zonelist(zone, z, zonelist, zoneid) {
-#ifdef CONFIG_NUMA
- pr_cont("%d:%s ", zone->node, zone->name);
-#else
- pr_cont("0:%s ", zone->name);
-#endif /* CONFIG_NUMA */
- }
+ for_each_zone_zonelist(zone, z, zonelist, zoneid)
+ pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
pr_cont("\n");
}
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 8d6449e74431..5f2b2b184c60 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3063,9 +3063,7 @@ void exit_mmap(struct mm_struct *mm)
* which clears VM_LOCKED, otherwise the oom reaper cannot
* reliably test it.
*/
- mutex_lock(&oom_lock);
- __oom_reap_task_mm(mm);
- mutex_unlock(&oom_lock);
+ (void)__oom_reap_task_mm(mm);
set_bit(MMF_OOM_SKIP, &mm->flags);
down_write(&mm->mmap_sem);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index eff6b88a993f..82bb1a939c0e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
srcu_read_unlock(&srcu, id);
}
-void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
- unsigned long start, unsigned long end)
+int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ bool blockable)
{
struct mmu_notifier *mn;
+ int ret = 0;
int id;
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops->invalidate_range_start)
- mn->ops->invalidate_range_start(mn, mm, start, end);
+ if (mn->ops->invalidate_range_start) {
+ int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable);
+ if (_ret) {
+ pr_info("%pS callback failed with %d in %sblockable context.\n",
+ mn->ops->invalidate_range_start, _ret,
+ !blockable ? "non-" : "");
+ ret = _ret;
+ }
+ }
}
srcu_read_unlock(&srcu, id);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7c74dcc2d26d..b5b25e4dcbbb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -400,7 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
struct task_struct *p;
struct task_struct *task;
- pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
+ pr_info("Tasks state (memory values in pages):\n");
+ pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
rcu_read_lock();
for_each_process(p) {
if (oom_unkillable_task(p, memcg, nodemask))
@@ -416,7 +417,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
mm_pgtables_bytes(task->mm),
@@ -487,9 +488,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-void __oom_reap_task_mm(struct mm_struct *mm)
+bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ bool ret = true;
/*
* Tell all users of get_user/copy_from_user etc... that the content
@@ -519,50 +521,32 @@ void __oom_reap_task_mm(struct mm_struct *mm)
struct mmu_gather tlb;
tlb_gather_mmu(&tlb, mm, start, end);
- mmu_notifier_invalidate_range_start(mm, start, end);
+ if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
+ ret = false;
+ continue;
+ }
unmap_page_range(&tlb, vma, start, end, NULL);
mmu_notifier_invalidate_range_end(mm, start, end);
tlb_finish_mmu(&tlb, start, end);
}
}
+
+ return ret;
}
+/*
+ * Reaps the address space of the give task.
+ *
+ * Returns true on success and false if none or part of the address space
+ * has been reclaimed and the caller should retry later.
+ */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
bool ret = true;
- /*
- * We have to make sure to not race with the victim exit path
- * and cause premature new oom victim selection:
- * oom_reap_task_mm exit_mm
- * mmget_not_zero
- * mmput
- * atomic_dec_and_test
- * exit_oom_victim
- * [...]
- * out_of_memory
- * select_bad_process
- * # no TIF_MEMDIE task selects new victim
- * unmap_page_range # frees some memory
- */
- mutex_lock(&oom_lock);
-
if (!down_read_trylock(&mm->mmap_sem)) {
- ret = false;
trace_skip_task_reaping(tsk->pid);
- goto unlock_oom;
- }
-
- /*
- * If the mm has invalidate_{start,end}() notifiers that could block,
- * sleep to give the oom victim some more time.
- * TODO: we really want to get rid of this ugly hack and make sure that
- * notifiers cannot block for unbounded amount of time
- */
- if (mm_has_blockable_invalidate_notifiers(mm)) {
- up_read(&mm->mmap_sem);
- schedule_timeout_idle(HZ);
- goto unlock_oom;
+ return false;
}
/*
@@ -572,25 +556,27 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* down_write();up_write() cycle in exit_mmap().
*/
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
- up_read(&mm->mmap_sem);
trace_skip_task_reaping(tsk->pid);
- goto unlock_oom;
+ goto out_unlock;
}
trace_start_task_reaping(tsk->pid);
- __oom_reap_task_mm(mm);
+ /* failed to reap part of the address space. Try again later */
+ ret = __oom_reap_task_mm(mm);
+ if (!ret)
+ goto out_finish;
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
K(get_mm_counter(mm, MM_FILEPAGES)),
K(get_mm_counter(mm, MM_SHMEMPAGES)));
+out_finish:
+ trace_finish_task_reaping(tsk->pid);
+out_unlock:
up_read(&mm->mmap_sem);
- trace_finish_task_reaping(tsk->pid);
-unlock_oom:
- mutex_unlock(&oom_lock);
return ret;
}
@@ -843,68 +829,12 @@ static bool task_will_free_mem(struct task_struct *task)
return ret;
}
-static void oom_kill_process(struct oom_control *oc, const char *message)
+static void __oom_kill_process(struct task_struct *victim)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *p;
struct mm_struct *mm;
- unsigned int victim_points = 0;
- static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
bool can_oom_reap = true;
- /*
- * If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just give it access to memory reserves
- * so it can die quickly
- */
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
- return;
- }
- task_unlock(p);
-
- if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- read_unlock(&tasklist_lock);
-
p = find_lock_task_mm(victim);
if (!p) {
put_task_struct(victim);
@@ -979,6 +909,99 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
#undef K
/*
+ * Kill provided task unless it's secured by setting
+ * oom_score_adj to OOM_SCORE_ADJ_MIN.
+ */
+static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+{
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ get_task_struct(task);
+ __oom_kill_process(task);
+ }
+ return 0;
+}
+
+static void oom_kill_process(struct oom_control *oc, const char *message)
+{
+ struct task_struct *p = oc->chosen;
+ unsigned int points = oc->chosen_points;
+ struct task_struct *victim = p;
+ struct task_struct *child;
+ struct task_struct *t;
+ struct mem_cgroup *oom_group;
+ unsigned int victim_points = 0;
+ static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ /*
+ * If the task is already exiting, don't alarm the sysadmin or kill
+ * its children or threads, just give it access to memory reserves
+ * so it can die quickly
+ */
+ task_lock(p);
+ if (task_will_free_mem(p)) {
+ mark_oom_victim(p);
+ wake_oom_reaper(p);
+ task_unlock(p);
+ put_task_struct(p);
+ return;
+ }
+ task_unlock(p);
+
+ if (__ratelimit(&oom_rs))
+ dump_header(oc, p);
+
+ pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
+ message, task_pid_nr(p), p->comm, points);
+
+ /*
+ * If any of p's children has a different mm and is eligible for kill,
+ * the one with the highest oom_badness() score is sacrificed for its
+ * parent. This attempts to lose the minimal amount of work done while
+ * still freeing memory.
+ */
+ read_lock(&tasklist_lock);
+ for_each_thread(p, t) {
+ list_for_each_entry(child, &t->children, sibling) {
+ unsigned int child_points;
+
+ if (process_shares_mm(child, p->mm))
+ continue;
+ /*
+ * oom_badness() returns 0 if the thread is unkillable
+ */
+ child_points = oom_badness(child,
+ oc->memcg, oc->nodemask, oc->totalpages);
+ if (child_points > victim_points) {
+ put_task_struct(victim);
+ victim = child;
+ victim_points = child_points;
+ get_task_struct(victim);
+ }
+ }
+ }
+ read_unlock(&tasklist_lock);
+
+ /*
+ * Do we need to kill the entire memory cgroup?
+ * Or even one of the ancestor memory cgroups?
+ * Check this out before killing the victim task.
+ */
+ oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
+
+ __oom_kill_process(victim);
+
+ /*
+ * If necessary, kill all tasks in the selected memory cgroup.
+ */
+ if (oom_group) {
+ mem_cgroup_print_oom_group(oom_group);
+ mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+ mem_cgroup_put(oom_group);
+ }
+}
+
+/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
static void check_panic_on_oom(struct oom_control *oc,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 15ea511fb41c..e75865d58ba7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2909,10 +2909,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
if (!static_branch_likely(&vm_numa_stat_key))
return;
- if (z->node != numa_node_id())
+ if (zone_to_nid(z) != numa_node_id())
local_stat = NUMA_OTHER;
- if (z->node == preferred_zone->node)
+ if (zone_to_nid(z) == zone_to_nid(preferred_zone))
__inc_numa_state(z, NUMA_HIT);
else {
__inc_numa_state(z, NUMA_MISS);
@@ -5278,7 +5278,7 @@ int local_memory_node(int node)
z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
NULL);
- return z->zone->node;
+ return zone_to_nid(z->zone);
}
#endif
@@ -6120,7 +6120,7 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
return usemapsize / 8;
}
-static void __init setup_usemap(struct pglist_data *pgdat,
+static void __ref setup_usemap(struct pglist_data *pgdat,
struct zone *zone,
unsigned long zone_start_pfn,
unsigned long zonesize)
@@ -6140,7 +6140,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __paginginit set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
unsigned int order;
@@ -6168,14 +6168,14 @@ void __paginginit set_pageblock_order(void)
* include/linux/pageblock-flags.h for the values of pageblock_order based on
* the kernel config
*/
-void __paginginit set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
}
#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
-static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
- unsigned long present_pages)
+static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
+ unsigned long present_pages)
{
unsigned long pages = spanned_pages;
@@ -6194,39 +6194,99 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
}
-/*
- * Set up the zone data structures:
- * - mark all pages reserved
- * - mark all memory queues empty
- * - clear the memory bitmaps
- *
- * NOTE: pgdat should get zeroed by caller.
- */
-static void __paginginit free_area_init_core(struct pglist_data *pgdat)
-{
- enum zone_type j;
- int nid = pgdat->node_id;
-
- pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
+static void pgdat_init_numabalancing(struct pglist_data *pgdat)
+{
spin_lock_init(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
+}
+#else
+static void pgdat_init_numabalancing(struct pglist_data *pgdat) {}
#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pgdat_init_split_queue(struct pglist_data *pgdat)
+{
spin_lock_init(&pgdat->split_queue_lock);
INIT_LIST_HEAD(&pgdat->split_queue);
pgdat->split_queue_len = 0;
+}
+#else
+static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
#endif
- init_waitqueue_head(&pgdat->kswapd_wait);
- init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
#ifdef CONFIG_COMPACTION
+static void pgdat_init_kcompactd(struct pglist_data *pgdat)
+{
init_waitqueue_head(&pgdat->kcompactd_wait);
+}
+#else
+static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
#endif
+
+static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
+{
+ pgdat_resize_init(pgdat);
+
+ pgdat_init_numabalancing(pgdat);
+ pgdat_init_split_queue(pgdat);
+ pgdat_init_kcompactd(pgdat);
+
+ init_waitqueue_head(&pgdat->kswapd_wait);
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
+
pgdat_page_ext_init(pgdat);
spin_lock_init(&pgdat->lru_lock);
lruvec_init(node_lruvec(pgdat));
+}
+
+static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
+ unsigned long remaining_pages)
+{
+ zone->managed_pages = remaining_pages;
+ zone_set_nid(zone, nid);
+ zone->name = zone_names[idx];
+ zone->zone_pgdat = NODE_DATA(nid);
+ spin_lock_init(&zone->lock);
+ zone_seqlock_init(zone);
+ zone_pcp_init(zone);
+}
+
+/*
+ * Set up the zone data structures
+ * - init pgdat internals
+ * - init all zones belonging to this node
+ *
+ * NOTE: this function is only called during memory hotplug
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __ref free_area_init_core_hotplug(int nid)
+{
+ enum zone_type z;
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ pgdat_init_internals(pgdat);
+ for (z = 0; z < MAX_NR_ZONES; z++)
+ zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
+}
+#endif
+
+/*
+ * Set up the zone data structures:
+ * - mark all pages reserved
+ * - mark all memory queues empty
+ * - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
+ * NOTE: this function is only called during early init.
+ */
+static void __init free_area_init_core(struct pglist_data *pgdat)
+{
+ enum zone_type j;
+ int nid = pgdat->node_id;
+ pgdat_init_internals(pgdat);
pgdat->per_cpu_nodestats = &boot_nodestats;
for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -6274,15 +6334,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
- zone->managed_pages = freesize;
-#ifdef CONFIG_NUMA
- zone->node = nid;
-#endif
- zone->name = zone_names[j];
- zone->zone_pgdat = pgdat;
- spin_lock_init(&zone->lock);
- zone_seqlock_init(zone);
- zone_pcp_init(zone);
+ zone_init_internals(zone, j, nid, freesize);
if (!size)
continue;
@@ -6342,8 +6394,24 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
-void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
- unsigned long node_start_pfn, unsigned long *zholes_size)
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
+{
+ /*
+ * We start only with one section of pages, more pages are added as
+ * needed until the rest of deferred pages are initialized.
+ */
+ pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
+ pgdat->node_spanned_pages);
+ pgdat->first_deferred_pfn = ULONG_MAX;
+}
+#else
+static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
+#endif
+
+void __init free_area_init_node(int nid, unsigned long *zones_size,
+ unsigned long node_start_pfn,
+ unsigned long *zholes_size)
{
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = 0;
@@ -6367,16 +6435,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
zones_size, zholes_size);
alloc_node_mem_map(pgdat);
+ pgdat_set_deferred_range(pgdat);
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- /*
- * We start only with one section of pages, more pages are added as
- * needed until the rest of deferred pages are initialized.
- */
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
- pgdat->node_spanned_pages);
- pgdat->first_deferred_pfn = ULONG_MAX;
-#endif
free_area_init_core(pgdat);
}
@@ -6388,7 +6448,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
* may be accessed (for example page_to_pfn() on some configuration accesses
* flags). We must explicitly zero those struct pages.
*/
-void __paginginit zero_resv_unavail(void)
+void __init zero_resv_unavail(void)
{
phys_addr_t start, end;
unsigned long pfn;
@@ -8036,3 +8096,33 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
+
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
+ * test is performed under the zone lock to prevent a race against page
+ * allocation.
+ */
+bool set_hwpoison_free_buddy_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long flags;
+ unsigned int order;
+ bool hwpoisoned = false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ struct page *page_head = page - (pfn & ((1 << order) - 1));
+
+ if (PageBuddy(page_head) && page_order(page_head) >= order) {
+ if (!TestSetPageHWPoison(page))
+ hwpoisoned = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return hwpoisoned;
+}
+#endif
diff --git a/mm/percpu.c b/mm/percpu.c
index 0b6480979ac7..a749d4d96e3e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -170,6 +170,14 @@ static LIST_HEAD(pcpu_map_extend_chunks);
int pcpu_nr_empty_pop_pages;
/*
+ * The number of populated pages in use by the allocator, protected by
+ * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets
+ * allocated/deallocated, it is allocated/deallocated in all units of a chunk
+ * and increments/decrements this count by 1).
+ */
+static unsigned long pcpu_nr_populated;
+
+/*
* Balance work is used to populate or destroy chunks asynchronously. We
* try to keep the number of populated free pages between
* PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
@@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
bitmap_set(chunk->populated, page_start, nr);
chunk->nr_populated += nr;
+ pcpu_nr_populated += nr;
if (!for_alloc) {
chunk->nr_empty_pop_pages += nr;
@@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
chunk->nr_populated -= nr;
chunk->nr_empty_pop_pages -= nr;
pcpu_nr_empty_pop_pages -= nr;
+ pcpu_nr_populated -= nr;
}
/*
@@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
+ /* include all regions of the first chunk */
+ pcpu_nr_populated += PFN_DOWN(size_sum);
+
pcpu_stats_chunk_alloc();
trace_percpu_create_chunk(base_addr);
@@ -2746,6 +2759,22 @@ void __init setup_per_cpu_areas(void)
#endif /* CONFIG_SMP */
/*
+ * pcpu_nr_pages - calculate total number of populated backing pages
+ *
+ * This reflects the number of pages populated to back chunks. Metadata is
+ * excluded in the number exposed in meminfo as the number of backing pages
+ * scales with the number of cpus and can quickly outweigh the memory used for
+ * metadata. It also keeps this calculation nice and simple.
+ *
+ * RETURNS:
+ * Total number of populated backing pages in use by the allocator.
+ */
+unsigned long pcpu_nr_pages(void)
+{
+ return pcpu_nr_populated * pcpu_nr_units;
+}
+
+/*
* Percpu allocator is initialized early during boot when neither slab or
* workqueue is available. Plug async management until everything is up
* and running.
diff --git a/mm/shmem.c b/mm/shmem.c
index c48c79018a7c..0376c124b043 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -124,7 +124,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
- struct vm_fault *vmf, int *fault_type);
+ struct vm_fault *vmf, vm_fault_t *fault_type);
int shmem_getpage(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp)
@@ -1421,7 +1421,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
struct shmem_inode_info *info, pgoff_t index)
{
/* Create a pseudo vma that just contains the policy */
- memset(vma, 0, sizeof(*vma));
vma_init(vma, NULL);
/* Bias interleave by inode number to distribute better across nodes */
vma->vm_pgoff = index + info->vfs_inode.i_ino;
@@ -1621,7 +1620,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type)
+ struct vm_area_struct *vma, struct vm_fault *vmf,
+ vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 008ccb22fee6..63a7b4563a57 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
cache->cur = 0;
if (swap_slot_cache_active)
- cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
- cache->slots);
+ cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
+ cache->slots, 1);
return cache->nr;
}
@@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page)
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
- get_swap_pages(1, true, &entry);
+ get_swap_pages(1, &entry, HPAGE_PMD_NR);
goto out;
}
@@ -350,7 +350,7 @@ repeat:
goto out;
}
- get_swap_pages(1, false, &entry);
+ get_swap_pages(1, &entry, 1);
out:
if (mem_cgroup_try_charge_swap(page, entry)) {
put_swap_page(page, entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8837b22c848d..d954b71c4f9c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -204,8 +204,16 @@ static void discard_swap_cluster(struct swap_info_struct *si,
#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER HPAGE_PMD_NR
+
+#define swap_entry_size(size) (size)
#else
#define SWAPFILE_CLUSTER 256
+
+/*
+ * Define swap_entry_size() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define swap_entry_size(size) 1
#endif
#define LATENCY_LIMIT 256
@@ -269,7 +277,9 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
- return info->flags & CLUSTER_FLAG_HUGE;
+ if (IS_ENABLED(CONFIG_THP_SWAP))
+ return info->flags & CLUSTER_FLAG_HUGE;
+ return false;
}
static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -296,13 +306,18 @@ static inline void unlock_cluster(struct swap_cluster_info *ci)
spin_unlock(&ci->lock);
}
+/*
+ * Determine the locking method in use for this device. Return
+ * swap_cluster_info if SSD-style cluster-based locking is in place.
+ */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
- struct swap_info_struct *si,
- unsigned long offset)
+ struct swap_info_struct *si, unsigned long offset)
{
struct swap_cluster_info *ci;
+ /* Try to use fine-grained SSD-style locking if available: */
ci = lock_cluster(si, offset);
+ /* Otherwise, fall back to traditional, coarse locking: */
if (!ci)
spin_lock(&si->lock);
@@ -863,7 +878,6 @@ no_page:
return n_ret;
}
-#ifdef CONFIG_THP_SWAP
static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
unsigned long idx;
@@ -871,6 +885,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
unsigned long offset, i;
unsigned char *map;
+ /*
+ * Should not even be attempting cluster allocations when huge
+ * page swap is disabled. Warn and fail the allocation.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP)) {
+ VM_WARN_ON_ONCE(1);
+ return 0;
+ }
+
if (cluster_list_empty(&si->free_clusters))
return 0;
@@ -901,13 +924,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
unlock_cluster(ci);
swap_range_free(si, offset, SWAPFILE_CLUSTER);
}
-#else
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
- VM_WARN_ON_ONCE(1);
- return 0;
-}
-#endif /* CONFIG_THP_SWAP */
static unsigned long scan_swap_map(struct swap_info_struct *si,
unsigned char usage)
@@ -924,18 +940,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
}
-int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
- unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1;
+ unsigned long size = swap_entry_size(entry_size);
struct swap_info_struct *si, *next;
long avail_pgs;
int n_ret = 0;
int node;
/* Only single cluster request supported */
- WARN_ON_ONCE(n_goal > 1 && cluster);
+ WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
- avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages;
+ avail_pgs = atomic_long_read(&nr_swap_pages) / size;
if (avail_pgs <= 0)
goto noswap;
@@ -945,7 +961,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[])
if (n_goal > avail_pgs)
n_goal = avail_pgs;
- atomic_long_sub(n_goal * nr_pages, &nr_swap_pages);
+ atomic_long_sub(n_goal * size, &nr_swap_pages);
spin_lock(&swap_avail_lock);
@@ -972,14 +988,14 @@ start_over:
spin_unlock(&si->lock);
goto nextsi;
}
- if (cluster) {
+ if (size == SWAPFILE_CLUSTER) {
if (!(si->flags & SWP_FILE))
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
n_goal, swp_entries);
spin_unlock(&si->lock);
- if (n_ret || cluster)
+ if (n_ret || size == SWAPFILE_CLUSTER)
goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n",
si->type);
@@ -1005,7 +1021,7 @@ nextsi:
check_out:
if (n_ret < n_goal)
- atomic_long_add((long)(n_goal - n_ret) * nr_pages,
+ atomic_long_add((long)(n_goal - n_ret) * size,
&nr_swap_pages);
noswap:
return n_ret;
@@ -1107,16 +1123,13 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
return p;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
- swp_entry_t entry, unsigned char usage)
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+ unsigned long offset,
+ unsigned char usage)
{
- struct swap_cluster_info *ci;
- unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
- ci = lock_cluster_or_swap_info(p, offset);
-
count = p->swap_map[offset];
has_cache = count & SWAP_HAS_CACHE;
@@ -1144,6 +1157,17 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p,
usage = count | has_cache;
p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
+ return usage;
+}
+
+static unsigned char __swap_entry_free(struct swap_info_struct *p,
+ swp_entry_t entry, unsigned char usage)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+
+ ci = lock_cluster_or_swap_info(p, offset);
+ usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
return usage;
@@ -1184,19 +1208,7 @@ void swap_free(swp_entry_t entry)
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
-static void swapcache_free(swp_entry_t entry)
-{
- struct swap_info_struct *p;
-
- p = _swap_info_get(entry);
- if (p) {
- if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE))
- free_swap_slot(entry);
- }
-}
-
-#ifdef CONFIG_THP_SWAP
-static void swapcache_free_cluster(swp_entry_t entry)
+void put_swap_page(struct page *page, swp_entry_t entry)
{
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1205,42 +1217,48 @@ static void swapcache_free_cluster(swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
+ int size = swap_entry_size(hpage_nr_pages(page));
si = _swap_info_get(entry);
if (!si)
return;
- ci = lock_cluster(si, offset);
- VM_BUG_ON(!cluster_is_huge(ci));
- map = si->swap_map + offset;
- for (i = 0; i < SWAPFILE_CLUSTER; i++) {
- val = map[i];
- VM_BUG_ON(!(val & SWAP_HAS_CACHE));
- if (val == SWAP_HAS_CACHE)
- free_entries++;
- }
- if (!free_entries) {
- for (i = 0; i < SWAPFILE_CLUSTER; i++)
- map[i] &= ~SWAP_HAS_CACHE;
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (size == SWAPFILE_CLUSTER) {
+ VM_BUG_ON(!cluster_is_huge(ci));
+ map = si->swap_map + offset;
+ for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+ val = map[i];
+ VM_BUG_ON(!(val & SWAP_HAS_CACHE));
+ if (val == SWAP_HAS_CACHE)
+ free_entries++;
+ }
+ cluster_clear_huge(ci);
+ if (free_entries == SWAPFILE_CLUSTER) {
+ unlock_cluster_or_swap_info(si, ci);
+ spin_lock(&si->lock);
+ ci = lock_cluster(si, offset);
+ memset(map, 0, SWAPFILE_CLUSTER);
+ unlock_cluster(ci);
+ mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
+ swap_free_cluster(si, idx);
+ spin_unlock(&si->lock);
+ return;
+ }
}
- cluster_clear_huge(ci);
- unlock_cluster(ci);
- if (free_entries == SWAPFILE_CLUSTER) {
- spin_lock(&si->lock);
- ci = lock_cluster(si, offset);
- memset(map, 0, SWAPFILE_CLUSTER);
- unlock_cluster(ci);
- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
- swap_free_cluster(si, idx);
- spin_unlock(&si->lock);
- } else if (free_entries) {
- for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) {
- if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE))
- free_swap_slot(entry);
+ for (i = 0; i < size; i++, entry.val++) {
+ if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
+ unlock_cluster_or_swap_info(si, ci);
+ free_swap_slot(entry);
+ if (i == size - 1)
+ return;
+ lock_cluster_or_swap_info(si, offset);
}
}
+ unlock_cluster_or_swap_info(si, ci);
}
+#ifdef CONFIG_THP_SWAP
int split_swap_cluster(swp_entry_t entry)
{
struct swap_info_struct *si;
@@ -1255,19 +1273,7 @@ int split_swap_cluster(swp_entry_t entry)
unlock_cluster(ci);
return 0;
}
-#else
-static inline void swapcache_free_cluster(swp_entry_t entry)
-{
-}
-#endif /* CONFIG_THP_SWAP */
-
-void put_swap_page(struct page *page, swp_entry_t entry)
-{
- if (!PageTransHuge(page))
- swapcache_free(entry);
- else
- swapcache_free_cluster(entry);
-}
+#endif
static int swp_entry_cmp(const void *ent1, const void *ent2)
{
@@ -1409,7 +1415,6 @@ out:
return count;
}
-#ifdef CONFIG_THP_SWAP
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
swp_entry_t entry)
{
@@ -1422,12 +1427,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
ci = lock_cluster_or_swap_info(si, offset);
if (!ci || !cluster_is_huge(ci)) {
- if (map[roffset] != SWAP_HAS_CACHE)
+ if (swap_count(map[roffset]))
ret = true;
goto unlock_out;
}
for (i = 0; i < SWAPFILE_CLUSTER; i++) {
- if (map[offset + i] != SWAP_HAS_CACHE) {
+ if (swap_count(map[offset + i])) {
ret = true;
break;
}
@@ -1442,7 +1447,7 @@ static bool page_swapped(struct page *page)
swp_entry_t entry;
struct swap_info_struct *si;
- if (likely(!PageTransCompound(page)))
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
return page_swapcount(page) != 0;
page = compound_head(page);
@@ -1466,10 +1471,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
- if (likely(!PageTransCompound(page))) {
- mapcount = atomic_read(&page->_mapcount) + 1;
- if (total_mapcount)
- *total_mapcount = mapcount;
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
+ mapcount = page_trans_huge_mapcount(page, total_mapcount);
if (PageSwapCache(page))
swapcount = page_swapcount(page);
if (total_swapcount)
@@ -1516,26 +1519,6 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
return map_swapcount;
}
-#else
-#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry)
-#define page_swapped(page) (page_swapcount(page) != 0)
-
-static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
- int *total_swapcount)
-{
- int mapcount, swapcount = 0;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- mapcount = page_trans_huge_mapcount(page, total_mapcount);
- if (PageSwapCache(page))
- swapcount = page_swapcount(page);
- if (total_swapcount)
- *total_swapcount = swapcount;
- return mapcount + swapcount;
-}
-#endif
/*
* We can write to an anon page without COW if there are no other references
diff --git a/mm/util.c b/mm/util.c
index 3351659200e6..d2890a407332 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -196,7 +196,7 @@ void *vmemdup_user(const void __user *src, size_t len)
}
EXPORT_SYMBOL(vmemdup_user);
-/*
+/**
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
@@ -434,6 +434,13 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
}
EXPORT_SYMBOL(kvmalloc_node);
+/**
+ * kvfree - free memory allocated with kvmalloc
+ * @addr: pointer returned by kvmalloc
+ *
+ * If the memory is allocated from vmalloc area it is freed with vfree().
+ * Otherwise kfree() is used.
+ */
void kvfree(const void *addr)
{
if (is_vmalloc_addr(addr))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4375b1e9bd56..7e7d25504651 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -408,7 +408,8 @@ void register_shrinker_prepared(struct shrinker *shrinker)
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
#ifdef CONFIG_MEMCG_KMEM
- idr_replace(&shrinker_idr, shrinker, shrinker->id);
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+ idr_replace(&shrinker_idr, shrinker, shrinker->id);
#endif
up_write(&shrinker_rwsem);
}
@@ -902,7 +903,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
refcount = 2;
if (!page_ref_freeze(page, refcount))
goto cannot_free;
- /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+ /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_ref_unfreeze(page, refcount);
goto cannot_free;
OpenPOWER on IntegriCloud