Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig.debug     |   6
-rw-r--r--   mm/backing-dev.c     |  15
-rw-r--r--   mm/gup.c             |   4
-rw-r--r--   mm/hmm.c             |   7
-rw-r--r--   mm/huge_memory.c     |  31
-rw-r--r--   mm/hugetlb.c         |  40
-rw-r--r--   mm/internal.h        |  14
-rw-r--r--   mm/khugepaged.c      |   3
-rw-r--r--   mm/ksm.c             |   8
-rw-r--r--   mm/memcontrol.c      | 243
-rw-r--r--   mm/memory-failure.c  |  46
-rw-r--r--   mm/memory.c          | 151
-rw-r--r--   mm/memory_hotplug.c  |  16
-rw-r--r--   mm/mempolicy.c       |   5
-rw-r--r--   mm/mempool.c         |   1
-rw-r--r--   mm/migrate.c         |   7
-rw-r--r--   mm/mm_init.c         |   9
-rw-r--r--   mm/mmap.c            |   4
-rw-r--r--   mm/mmu_notifier.c    |  19
-rw-r--r--   mm/oom_kill.c        | 219
-rw-r--r--   mm/page_alloc.c      | 180
-rw-r--r--   mm/percpu.c          |  29
-rw-r--r--   mm/shmem.c           |   6
-rw-r--r--   mm/swap_slots.c      |   8
-rw-r--r--   mm/swapfile.c        | 193
-rw-r--r--   mm/util.c            |   9
-rw-r--r--   mm/vmscan.c          |   5
27 files changed, 752 insertions(+), 526 deletions(-)
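
Among the cleanups in this series (vm_fault_t return types in the fault paths, a refcount_t conversion in backing-dev, accumulated memcg statistics), the mm/memcontrol.c and mm/oom_kill.c hunks below add a user-visible cgroup v2 knob, memory.oom.group. As a rough illustration of how a workload might opt in from userspace, here is a minimal sketch; the cgroup path /sys/fs/cgroup/workload is an assumption made for the example and is not part of the patch:

	/*
	 * Illustrative sketch only: opt a cgroup (v2) into group OOM killing
	 * by writing "1" to its memory.oom.group file. The path below is a
	 * made-up example, not something defined by this patch.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/fs/cgroup/workload/memory.oom.group", O_WRONLY);

		if (fd < 0) {
			perror("open memory.oom.group");
			return 1;
		}
		/* "1": an OOM kill of any task in the cgroup kills them all */
		if (write(fd, "1", 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}

With the flag set, the OOM killer escalates a kill of any member task to the whole cgroup, sparing only tasks whose oom_score_adj is OOM_SCORE_ADJ_MIN, as implemented by oom_kill_memcg_member() in the mm/oom_kill.c hunk below.
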
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index e5e606ee5f71..9a7b8b049d04 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -46,7 +46,8 @@ config PAGE_POISONING Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages. The filling of the memory helps reduce the risk of information leaks from freed data. This does - have a potential performance impact. + have a potential performance impact if enabled with the + "page_poison=1" kernel boot option. Note that "poison" here is not the same thing as the "HWPoison" for CONFIG_MEMORY_FAILURE. This is software poisoning only. @@ -65,7 +66,7 @@ config PAGE_POISONING_NO_SANITY say N. config PAGE_POISONING_ZERO - bool "Use zero for poisoning instead of random data" + bool "Use zero for poisoning instead of debugging value" depends on PAGE_POISONING ---help--- Instead of using the existing poison value, fill the pages with @@ -75,7 +76,6 @@ config PAGE_POISONING_ZERO allocation. If unsure, say N - bool config DEBUG_PAGE_REF bool "Enable tracepoint to track down page reference manipulation" diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 2e5d3df0853d..f5981e9d6ae2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -438,10 +438,10 @@ retry: if (new_congested) { /* !found and storage for new one already allocated, insert */ congested = new_congested; - new_congested = NULL; rb_link_node(&congested->rb_node, parent, node); rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); - goto found; + spin_unlock_irqrestore(&cgwb_lock, flags); + return congested; } spin_unlock_irqrestore(&cgwb_lock, flags); @@ -451,13 +451,13 @@ retry: if (!new_congested) return NULL; - atomic_set(&new_congested->refcnt, 0); + refcount_set(&new_congested->refcnt, 1); new_congested->__bdi = bdi; new_congested->blkcg_id = blkcg_id; goto retry; found: - atomic_inc(&congested->refcnt); + refcount_inc(&congested->refcnt); spin_unlock_irqrestore(&cgwb_lock, flags); kfree(new_congested); return congested; @@ -473,11 +473,8 @@ void wb_congested_put(struct bdi_writeback_congested *congested) { unsigned long flags; - local_irq_save(flags); - if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { - local_irq_restore(flags); + if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags)) return; - } /* bdi might already have been destroyed leaving @congested unlinked */ if (congested->__bdi) { @@ -804,7 +801,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!bdi->wb_congested) return -ENOMEM; - atomic_set(&bdi->wb_congested->refcnt, 1); + refcount_set(&bdi->wb_congested->refcnt, 1); err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (err) { @@ -497,7 +497,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *nonblocking) { unsigned int fault_flags = 0; - int ret; + vm_fault_t ret; /* mlock all present pages, but do not fault in new pages */ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) @@ -818,7 +818,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, bool *unlocked) { struct vm_area_struct *vma; - int ret, major = 0; + vm_fault_t ret, major = 0; if (unlocked) fault_flags |= FAULT_FLAG_ALLOW_RETRY; @@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) up_write(&hmm->mirrors_sem); } -static void hmm_invalidate_range_start(struct mmu_notifier *mn, +static int hmm_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned 
long start, - unsigned long end) + unsigned long end, + bool blockable) { struct hmm *hmm = mm->hmm; VM_BUG_ON(!hmm); atomic_inc(&hmm->sequence); + + return 0; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 78427af91de9..08b544383d74 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -541,14 +541,14 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, - gfp_t gfp) +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, + struct page *page, gfp_t gfp) { struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; pgtable_t pgtable; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - int ret = 0; + vm_fault_t ret = 0; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -584,15 +584,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, /* Deliver the page fault to userland */ if (userfaultfd_missing(vma)) { - int ret; + vm_fault_t ret2; spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); - ret = handle_userfault(vmf, VM_UFFD_MISSING); - VM_BUG_ON(ret & VM_FAULT_FALLBACK); - return ret; + ret2 = handle_userfault(vmf, VM_UFFD_MISSING); + VM_BUG_ON(ret2 & VM_FAULT_FALLBACK); + return ret2; } entry = mk_huge_pmd(page, vma->vm_page_prot); @@ -663,7 +663,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct vm_fault *vmf) +vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; gfp_t gfp; @@ -682,7 +682,7 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) pgtable_t pgtable; struct page *zero_page; bool set; - int ret; + vm_fault_t ret; pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -1118,15 +1118,16 @@ unlock: spin_unlock(vmf->ptl); } -static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, - struct page *page) +static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, + pmd_t orig_pmd, struct page *page) { struct vm_area_struct *vma = vmf->vma; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; pgtable_t pgtable; pmd_t _pmd; - int ret = 0, i; + int i; + vm_fault_t ret = 0; struct page **pages; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -1236,7 +1237,7 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) +vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *new_page; @@ -1245,7 +1246,7 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ - int ret = 0; + vm_fault_t ret = 0; vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); @@ -1457,7 +1458,7 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) +vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) { struct vm_area_struct *vma = vmf->vma; struct anon_vma *anon_vma = NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 
47566bb0b4b1..3c21775f196b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1479,22 +1479,20 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, /* * Dissolve a given free hugepage into free buddy pages. This function does * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the - * number of free hugepages would be reduced below the number of reserved - * hugepages. + * dissolution fails because a give page is not a free hugepage, or because + * free hugepages are fully reserved. */ int dissolve_free_huge_page(struct page *page) { - int rc = 0; + int rc = -EBUSY; spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); int nid = page_to_nid(head); - if (h->free_huge_pages - h->resv_huge_pages == 0) { - rc = -EBUSY; + if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; - } /* * Move PageHWPoison flag from head page to the raw error page, * which makes any subpages rather than the error page reusable. @@ -1508,6 +1506,7 @@ int dissolve_free_huge_page(struct page *page) h->free_huge_pages_node[nid]--; h->max_huge_pages--; update_and_free_page(h, head); + rc = 0; } out: spin_unlock(&hugetlb_lock); @@ -3502,14 +3501,15 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, +static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, struct page *pagecache_page, spinlock_t *ptl) { pte_t pte; struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int ret = 0, outside_reserve = 0; + int outside_reserve = 0; + vm_fault_t ret = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ unsigned long haddr = address & huge_page_mask(h); @@ -3573,8 +3573,7 @@ retry_avoidcopy: return 0; } - ret = (PTR_ERR(new_page) == -ENOMEM) ? 
- VM_FAULT_OOM : VM_FAULT_SIGBUS; + ret = vmf_error(PTR_ERR(new_page)); goto out_release_old; } @@ -3677,12 +3676,13 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return 0; } -static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, unsigned int flags) +static vm_fault_t hugetlb_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); - int ret = VM_FAULT_SIGBUS; + vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct page *page; @@ -3745,11 +3745,7 @@ retry: page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { - ret = PTR_ERR(page); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + ret = vmf_error(PTR_ERR(page)); goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); @@ -3873,12 +3869,12 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, } #endif -int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, +vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pte_t *ptep, entry; spinlock_t *ptl; - int ret; + vm_fault_t ret; u32 hash; pgoff_t idx; struct page *page = NULL; @@ -4208,7 +4204,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, if (absent || is_swap_pte(huge_ptep_get(pte)) || ((flags & FOLL_WRITE) && !huge_pte_write(huge_ptep_get(pte)))) { - int ret; + vm_fault_t ret; unsigned int fault_flags = 0; if (pte) diff --git a/mm/internal.h b/mm/internal.h index 9e3654d70289..87256ae1bef8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,7 +38,7 @@ void page_writeback_init(void); -int do_swap_page(struct vm_fault *vmf); +vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); @@ -389,18 +389,6 @@ static inline struct page *mem_map_next(struct page *iter, return iter + 1; } -/* - * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, - * so all functions starting at paging_init should be marked __init - * in those cases. SPARSEMEM, however, allows for memory hotplug, - * and alloc_bootmem_node is not used. - */ -#ifdef CONFIG_SPARSEMEM -#define __paginginit __meminit -#else -#define __paginginit __init -#endif - /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 961cbe9062a5..a31d740e6cd1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -880,7 +880,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int referenced) { - int swapped_in = 0, ret = 0; + int swapped_in = 0; + vm_fault_t ret = 0; struct vm_fault vmf = { .vma = vma, .address = address, @@ -652,9 +652,9 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) * list_head to stay clear from the rb_parent_color union * (aligned and different than any node) and also different * from &migrate_nodes. This will verify that future list.h changes - * don't break STABLE_NODE_DUP_HEAD. + * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it. 
*/ -#if GCC_VERSION >= 40903 /* only recent gcc can handle it */ +#if defined(GCC_VERSION) && GCC_VERSION >= 40903 BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); #endif @@ -703,7 +703,7 @@ again: * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; - * however, it might mean that the page is under page_freeze_refs(). + * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * but if page is swapcache in migrate_page_move_mapping(), it might * still be our page, in which case it's essential to keep the node. @@ -714,7 +714,7 @@ again: * work here too. We have chosen the !PageSwapCache test to * optimize the common case, when the page is or is about to * be freed: PageSwapCache is cleared (under spin_lock_irq) - * in the freeze_refs section of __remove_mapping(); but Anon + * in the ref_freeze section of __remove_mapping(); but Anon * page->mapping reset to NULL later, in free_pages_prepare(). */ if (!PageSwapCache(page)) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a921890739f..4ead5a4817de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1777,6 +1777,62 @@ cleanup: } /** + * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM + * @victim: task to be killed by the OOM killer + * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM + * + * Returns a pointer to a memory cgroup, which has to be cleaned up + * by killing all belonging OOM-killable tasks. + * + * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. + */ +struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, + struct mem_cgroup *oom_domain) +{ + struct mem_cgroup *oom_group = NULL; + struct mem_cgroup *memcg; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return NULL; + + if (!oom_domain) + oom_domain = root_mem_cgroup; + + rcu_read_lock(); + + memcg = mem_cgroup_from_task(victim); + if (memcg == root_mem_cgroup) + goto out; + + /* + * Traverse the memory cgroup hierarchy from the victim task's + * cgroup up to the OOMing cgroup (or root) to find the + * highest-level memory cgroup with oom.group set. 
+ */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + if (memcg->oom_group) + oom_group = memcg; + + if (memcg == oom_domain) + break; + } + + if (oom_group) + css_get(&oom_group->css); +out: + rcu_read_unlock(); + + return oom_group; +} + +void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) +{ + pr_info("Tasks in "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_cont(" are going to be killed due to memory.oom.group set\n"); +} + +/** * lock_page_memcg - lock a page->mem_cgroup binding * @page: the page * @@ -2899,29 +2955,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) -{ - struct mem_cgroup *iter; - int i; - - memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - - for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += memcg_page_state(iter, i); - } -} +struct accumulated_stats { + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long lru_pages[NR_LRU_LISTS]; + const unsigned int *stats_array; + const unsigned int *events_array; + int stats_size; + int events_size; +}; -static void tree_events(struct mem_cgroup *memcg, unsigned long *events) +static void accumulate_memcg_tree(struct mem_cgroup *memcg, + struct accumulated_stats *acc) { - struct mem_cgroup *iter; + struct mem_cgroup *mi; int i; - memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); + for_each_mem_cgroup_tree(mi, memcg) { + for (i = 0; i < acc->stats_size; i++) + acc->stat[i] += memcg_page_state(mi, + acc->stats_array ? acc->stats_array[i] : i); - for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] += memcg_sum_events(iter, i); + for (i = 0; i < acc->events_size; i++) + acc->events[i] += memcg_sum_events(mi, + acc->events_array ? 
acc->events_array[i] : i); + + for (i = 0; i < NR_LRU_LISTS; i++) + acc->lru_pages[i] += + mem_cgroup_nr_lru_pages(mi, BIT(i)); } } @@ -3332,6 +3393,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; + struct accumulated_stats acc; BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -3364,32 +3426,27 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { - unsigned long long val = 0; + memset(&acc, 0, sizeof(acc)); + acc.stats_size = ARRAY_SIZE(memcg1_stats); + acc.stats_array = memcg1_stats; + acc.events_size = ARRAY_SIZE(memcg1_events); + acc.events_array = memcg1_events; + accumulate_memcg_tree(memcg, &acc); + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - for_each_mem_cgroup_tree(mi, memcg) - val += memcg_page_state(mi, memcg1_stats[i]) * - PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); + seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], + (u64)acc.stat[i] * PAGE_SIZE); } - for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { - unsigned long long val = 0; - - for_each_mem_cgroup_tree(mi, memcg) - val += memcg_sum_events(mi, memcg1_events[i]); - seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); - } - - for (i = 0; i < NR_LRU_LISTS; i++) { - unsigned long long val = 0; + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) + seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], + (u64)acc.events[i]); - for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); - } + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], + (u64)acc.lru_pages[i] * PAGE_SIZE); #ifdef CONFIG_DEBUG_VM { @@ -5486,8 +5543,7 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + struct accumulated_stats acc; int i; /* @@ -5501,70 +5557,97 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ - tree_stat(memcg, stat); - tree_events(memcg, events); + memset(&acc, 0, sizeof(acc)); + acc.stats_size = MEMCG_NR_STAT; + acc.events_size = NR_VM_EVENT_ITEMS; + accumulate_memcg_tree(memcg, &acc); seq_printf(m, "anon %llu\n", - (u64)stat[MEMCG_RSS] * PAGE_SIZE); + (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)stat[MEMCG_CACHE] * PAGE_SIZE); + (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); + (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(stat[NR_SLAB_RECLAIMABLE] + - stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + + acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "shmem %llu\n", - (u64)stat[NR_SHMEM] * PAGE_SIZE); + (u64)acc.stat[NR_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); + (u64)acc.stat[NR_FILE_MAPPED] 
* PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); + (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)stat[NR_WRITEBACK] * PAGE_SIZE); - - for (i = 0; i < NR_LRU_LISTS; i++) { - struct mem_cgroup *mi; - unsigned long val = 0; + (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); - for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_nr_lru_pages(mi, BIT(i)); - seq_printf(m, "%s %llu\n", - mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); - } + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], + (u64)acc.lru_pages[i] * PAGE_SIZE); seq_printf(m, "slab_reclaimable %llu\n", - (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); /* Accumulated memory events */ - seq_printf(m, "pgfault %lu\n", events[PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); + seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); - seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); - seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + - events[PGSCAN_DIRECT]); - seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + - events[PGSTEAL_DIRECT]); - seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); - seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); - seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); - seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + + acc.events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] + + acc.events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); seq_printf(m, "workingset_refault %lu\n", - stat[WORKINGSET_REFAULT]); + acc.stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", - stat[WORKINGSET_ACTIVATE]); + acc.stat[WORKINGSET_ACTIVATE]); seq_printf(m, "workingset_nodereclaim %lu\n", - stat[WORKINGSET_NODERECLAIM]); + acc.stat[WORKINGSET_NODERECLAIM]); + + return 0; +} + +static int memory_oom_group_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "%d\n", memcg->oom_group); return 0; } +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, oom_group; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtoint(buf, 0, &oom_group); + if (ret) + return ret; + + if (oom_group != 0 && oom_group != 1) + return -EINVAL; + + memcg->oom_group = oom_group; + + return nbytes; +} + static struct cftype memory_files[] = { { .name = "current", @@ -5606,6 +5689,12 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, }, + { + .name = "oom.group", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_group_show, + .write = memory_oom_group_write, + }, { } /* terminate */ }; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9d142b9b86dc..192d0bbfc9ea 
100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -57,6 +57,7 @@ #include <linux/mm_inline.h> #include <linux/kfifo.h> #include <linux/ratelimit.h> +#include <linux/page-isolation.h> #include "internal.h" #include "ras/ras_event.h" @@ -1167,7 +1168,7 @@ int memory_failure(unsigned long pfn, int flags) * R/W the page; let's pray that the page has been * used and will be freed some time later. * In fact it's dangerous to directly bump up page count from 0, - * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. + * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. */ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { if (is_free_buddy_page(p)) { @@ -1598,8 +1599,18 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { - if (PageHuge(page)) - dissolve_free_huge_page(page); + /* + * We set PG_hwpoison only when the migration source hugepage + * was successfully dissolved, because otherwise hwpoisoned + * hugepage remains on free hugepage list, then userspace will + * find it as SIGBUS by allocation failure. That's not expected + * in soft-offlining. + */ + ret = dissolve_free_huge_page(page); + if (!ret) { + if (set_hwpoison_free_buddy_page(page)) + num_poisoned_pages_inc(); + } } return ret; } @@ -1687,6 +1698,7 @@ static int __soft_offline_page(struct page *page, int flags) static int soft_offline_in_use_page(struct page *page, int flags) { int ret; + int mt; struct page *hpage = compound_head(page); if (!PageHuge(page) && PageTransHuge(hpage)) { @@ -1705,23 +1717,37 @@ static int soft_offline_in_use_page(struct page *page, int flags) put_hwpoison_page(hpage); } + /* + * Setting MIGRATE_ISOLATE here ensures that the page will be linked + * to free list immediately (not via pcplist) when released after + * successful page migration. Otherwise we can't guarantee that the + * page is really free after put_page() returns, so + * set_hwpoison_free_buddy_page() highly likely fails. 
+ */ + mt = get_pageblock_migratetype(page); + set_pageblock_migratetype(page, MIGRATE_ISOLATE); if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); else ret = __soft_offline_page(page, flags); - + set_pageblock_migratetype(page, mt); return ret; } -static void soft_offline_free_page(struct page *page) +static int soft_offline_free_page(struct page *page) { + int rc = 0; struct page *head = compound_head(page); - if (!TestSetPageHWPoison(head)) { - num_poisoned_pages_inc(); - if (PageHuge(head)) - dissolve_free_huge_page(page); + if (PageHuge(head)) + rc = dissolve_free_huge_page(page); + if (!rc) { + if (set_hwpoison_free_buddy_page(page)) + num_poisoned_pages_inc(); + else + rc = -EBUSY; } + return rc; } /** @@ -1765,7 +1791,7 @@ int soft_offline_page(struct page *page, int flags) if (ret > 0) ret = soft_offline_in_use_page(page, flags); else if (ret == 0) - soft_offline_free_page(page); + ret = soft_offline_free_page(page); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 19f47d7b9b86..83aef222f11b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -238,23 +238,13 @@ void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, __tlb_reset_range(tlb); } -static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) +static void tlb_flush_mmu_free(struct mmu_gather *tlb) { - if (!tlb->end) - return; + struct mmu_gather_batch *batch; - tlb_flush(tlb); - mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb_table_flush(tlb); #endif - __tlb_reset_range(tlb); -} - -static void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - struct mmu_gather_batch *batch; - for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; @@ -326,20 +316,31 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ #ifdef CONFIG_HAVE_RCU_TABLE_FREE -static void tlb_remove_table_smp_sync(void *arg) +/* + * See the comment near struct mmu_table_batch. + */ + +/* + * If we want tlb_remove_table() to imply TLB invalidates. + */ +static inline void tlb_table_invalidate(struct mmu_gather *tlb) { - struct mm_struct __maybe_unused *mm = arg; +#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE /* - * On most architectures this does nothing. Simply delivering the - * interrupt is enough to prevent races with software page table - * walking like that done in get_user_pages_fast. - * - * See the comment near struct mmu_table_batch. + * Invalidate page-table caches used by hardware walkers. Then we still + * need to RCU-sched wait while freeing the pages because software + * walkers can still be in-flight. */ - tlb_flush_remove_tables_local(mm); + tlb_flush_mmu_tlbonly(tlb); +#endif } -static void tlb_remove_table_one(void *table, struct mmu_gather *tlb) +static void tlb_remove_table_smp_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +static void tlb_remove_table_one(void *table) { /* * This isn't an RCU grace period and hence the page-tables cannot be @@ -348,7 +349,7 @@ static void tlb_remove_table_one(void *table, struct mmu_gather *tlb) * It is however sufficient for software page-table walkers that rely on * IRQ disabling. See the comment near struct mmu_table_batch. 
*/ - smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1); + smp_call_function(tlb_remove_table_smp_sync, NULL, 1); __tlb_remove_table(table); } @@ -369,9 +370,8 @@ void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; - tlb_flush_remove_tables(tlb->mm); - if (*batch) { + tlb_table_invalidate(tlb); call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); *batch = NULL; } @@ -381,23 +381,16 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; - /* - * When there's less then two users of this mm there cannot be a - * concurrent page-table walk. - */ - if (atomic_read(&tlb->mm->mm_users) < 2) { - __tlb_remove_table(table); - return; - } - if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { - tlb_remove_table_one(table, tlb); + tlb_table_invalidate(tlb); + tlb_remove_table_one(table); return; } (*batch)->nr = 0; } + (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); @@ -2384,9 +2377,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. */ -static int do_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) { - int ret; + vm_fault_t ret; struct page *page = vmf->page; unsigned int old_flags = vmf->flags; @@ -2490,7 +2483,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ -static int wp_page_copy(struct vm_fault *vmf) +static vm_fault_t wp_page_copy(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; @@ -2638,7 +2631,7 @@ oom: * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). */ -int finish_mkwrite_fault(struct vm_fault *vmf) +vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, @@ -2659,12 +2652,12 @@ int finish_mkwrite_fault(struct vm_fault *vmf) * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct vm_fault *vmf) +static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - int ret; + vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; @@ -2677,7 +2670,7 @@ static int wp_pfn_shared(struct vm_fault *vmf) return VM_FAULT_WRITE; } -static int wp_page_shared(struct vm_fault *vmf) +static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2685,7 +2678,7 @@ static int wp_page_shared(struct vm_fault *vmf) get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - int tmp; + vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = do_page_mkwrite(vmf); @@ -2728,7 +2721,7 @@ static int wp_page_shared(struct vm_fault *vmf) * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. 
*/ -static int do_wp_page(struct vm_fault *vmf) +static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2904,7 +2897,7 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -int do_swap_page(struct vm_fault *vmf) +vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; @@ -2913,7 +2906,7 @@ int do_swap_page(struct vm_fault *vmf) pte_t pte; int locked; int exclusive = 0; - int ret = 0; + vm_fault_t ret = 0; if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; @@ -3124,12 +3117,12 @@ out_release: * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct vm_fault *vmf) +static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; - int ret = 0; + vm_fault_t ret = 0; pte_t entry; /* File mapping without ->vm_ops ? */ @@ -3239,10 +3232,10 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct vm_fault *vmf) +static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret; + vm_fault_t ret; ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | @@ -3276,7 +3269,7 @@ static int pmd_devmap_trans_unstable(pmd_t *pmd) return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); } -static int pte_alloc_one_map(struct vm_fault *vmf) +static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -3352,13 +3345,14 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -static int do_set_pmd(struct vm_fault *vmf, struct page *page) +static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; - int i, ret; + int i; + vm_fault_t ret; if (!transhuge_vma_suitable(vma, haddr)) return VM_FAULT_FALLBACK; @@ -3408,7 +3402,7 @@ out: return ret; } #else -static int do_set_pmd(struct vm_fault *vmf, struct page *page) +static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; @@ -3429,13 +3423,13 @@ static int do_set_pmd(struct vm_fault *vmf, struct page *page) * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; - int ret; + vm_fault_t ret; if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { @@ -3494,10 +3488,10 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). */ -int finish_fault(struct vm_fault *vmf) +vm_fault_t finish_fault(struct vm_fault *vmf) { struct page *page; - int ret = 0; + vm_fault_t ret = 0; /* Did we COW the page? 
*/ if ((vmf->flags & FAULT_FLAG_WRITE) && @@ -3583,12 +3577,13 @@ late_initcall(fault_around_debugfs); * (and therefore to page order). This way it's easier to guarantee * that we don't cross page table boundaries. */ -static int do_fault_around(struct vm_fault *vmf) +static vm_fault_t do_fault_around(struct vm_fault *vmf) { unsigned long address = vmf->address, nr_pages, mask; pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; - int off, ret = 0; + int off; + vm_fault_t ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; @@ -3638,10 +3633,10 @@ out: return ret; } -static int do_read_fault(struct vm_fault *vmf) +static vm_fault_t do_read_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret = 0; + vm_fault_t ret = 0; /* * Let's call ->map_pages() first and use ->fault() as fallback @@ -3665,10 +3660,10 @@ static int do_read_fault(struct vm_fault *vmf) return ret; } -static int do_cow_fault(struct vm_fault *vmf) +static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret; + vm_fault_t ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; @@ -3704,10 +3699,10 @@ uncharge_out: return ret; } -static int do_shared_fault(struct vm_fault *vmf) +static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret, tmp; + vm_fault_t ret, tmp; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3745,10 +3740,10 @@ static int do_shared_fault(struct vm_fault *vmf) * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_fault(struct vm_fault *vmf) +static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - int ret; + vm_fault_t ret; /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) @@ -3783,7 +3778,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct vm_fault *vmf) +static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; @@ -3873,7 +3868,7 @@ out: return 0; } -static inline int create_huge_pmd(struct vm_fault *vmf) +static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); @@ -3883,7 +3878,7 @@ static inline int create_huge_pmd(struct vm_fault *vmf) } /* `inline' is required to avoid gcc 4.1.2 build error */ -static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) +static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_wp_page(vmf, orig_pmd); @@ -3902,7 +3897,7 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); } -static int create_huge_pud(struct vm_fault *vmf) +static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* No support for anonymous transparent PUD pages yet */ @@ -3914,7 +3909,7 @@ static int create_huge_pud(struct vm_fault *vmf) return VM_FAULT_FALLBACK; } -static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) +static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* No support for anonymous transparent PUD 
pages yet */ @@ -3941,7 +3936,7 @@ static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct vm_fault *vmf) +static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; @@ -4029,8 +4024,8 @@ unlock: * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags) +static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) { struct vm_fault vmf = { .vma = vma, @@ -4043,7 +4038,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; p4d_t *p4d; - int ret; + vm_fault_t ret; pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); @@ -4118,10 +4113,10 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, +vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - int ret; + vm_fault_t ret; __set_current_state(TASK_RUNNING); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4eb6e824a80c..9eea6e809a4e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -982,8 +982,6 @@ static void reset_node_present_pages(pg_data_t *pgdat) static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) { struct pglist_data *pgdat; - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); @@ -1006,8 +1004,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* we can use NODE_DATA(nid) from here */ + pgdat->node_id = nid; + pgdat->node_start_pfn = start_pfn; + /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_node(nid, zones_size, start_pfn, zholes_size); + free_area_init_core_hotplug(nid); pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* @@ -1017,18 +1018,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) build_all_zonelists(pgdat); /* - * zone->managed_pages is set to an approximate value in - * free_area_init_core(), which will cause - * /sys/device/system/node/nodeX/meminfo has wrong data. - * So reset it to 0 before any memory is onlined. - */ - reset_node_managed_pages(pgdat); - - /* * When memory is hot-added, all the memory is in offline state. So * clear all zones' present_pages because they will be updated in * online_pages() and offline_pages(). */ + reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); return pgdat; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01f1a14facc4..da858f794eb6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1784,7 +1784,7 @@ unsigned int mempolicy_slab_node(void) zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); - return z->zone ? z->zone->node : node; + return z->zone ? 
zone_to_nid(z->zone) : node; } default: @@ -2326,7 +2326,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->v.nodes); - polnid = z->zone->node; + polnid = zone_to_nid(z->zone); break; default: @@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) goto put_new; /* Create pseudo-vma that contains just the policy */ - memset(&pvma, 0, sizeof(struct vm_area_struct)); vma_init(&pvma, NULL); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ diff --git a/mm/mempool.c b/mm/mempool.c index 44f5fa98c1e7..0ef8cc8d1602 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -213,6 +213,7 @@ EXPORT_SYMBOL(mempool_init_node); /** * mempool_init - initialize a memory pool + * @pool: pointer to the memory pool that should be initialized * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. diff --git a/mm/migrate.c b/mm/migrate.c index 4a83268e23c2..d6a2e89b086a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1131,7 +1131,8 @@ out: * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work * around it. */ -#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM) +#if defined(CONFIG_ARM) && \ + defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 #define ICE_noinline noinline #else #define ICE_noinline @@ -1211,7 +1212,7 @@ out: * intentionally. Although it's rather weird, * it's how HWPoison flag works at the moment. */ - if (!test_set_page_hwpoison(page)) + if (set_hwpoison_free_buddy_page(page)) num_poisoned_pages_inc(); } } else { @@ -1330,8 +1331,6 @@ put_anon: out: if (rc != -EAGAIN) putback_active_hugepage(hpage); - if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage)) - num_poisoned_pages_inc(); /* * If migration was not successful and there's a freeing callback, use diff --git a/mm/mm_init.c b/mm/mm_init.c index 5b72266b4b03..6838a530789b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -53,13 +53,8 @@ void __init mminit_verify_zonelist(void) zone->name); /* Iterate the zonelist */ - for_each_zone_zonelist(zone, z, zonelist, zoneid) { -#ifdef CONFIG_NUMA - pr_cont("%d:%s ", zone->node, zone->name); -#else - pr_cont("0:%s ", zone->name); -#endif /* CONFIG_NUMA */ - } + for_each_zone_zonelist(zone, z, zonelist, zoneid) + pr_cont("%d:%s ", zone_to_nid(zone), zone->name); pr_cont("\n"); } } diff --git a/mm/mmap.c b/mm/mmap.c index 8d6449e74431..5f2b2b184c60 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3063,9 +3063,7 @@ void exit_mmap(struct mm_struct *mm) * which clears VM_LOCKED, otherwise the oom reaper cannot * reliably test it. 
*/ - mutex_lock(&oom_lock); - __oom_reap_task_mm(mm); - mutex_unlock(&oom_lock); + (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); down_write(&mm->mmap_sem); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index eff6b88a993f..82bb1a939c0e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool blockable) { struct mmu_notifier *mn; + int ret = 0; int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_range_start) - mn->ops->invalidate_range_start(mn, mm, start, end); + if (mn->ops->invalidate_range_start) { + int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable); + if (_ret) { + pr_info("%pS callback failed with %d in %sblockable context.\n", + mn->ops->invalidate_range_start, _ret, + !blockable ? "non-" : ""); + ret = _ret; + } + } } srcu_read_unlock(&srcu, id); + + return ret; } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7c74dcc2d26d..b5b25e4dcbbb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -400,7 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); + pr_info("Tasks state (memory values in pages):\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -416,7 +417,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), mm_pgtables_bytes(task->mm), @@ -487,9 +488,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -void __oom_reap_task_mm(struct mm_struct *mm) +bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + bool ret = true; /* * Tell all users of get_user/copy_from_user etc... that the content @@ -519,50 +521,32 @@ void __oom_reap_task_mm(struct mm_struct *mm) struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm, start, end); - mmu_notifier_invalidate_range_start(mm, start, end); + if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { + ret = false; + continue; + } unmap_page_range(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } } + + return ret; } +/* + * Reaps the address space of the give task. + * + * Returns true on success and false if none or part of the address space + * has been reclaimed and the caller should retry later. 
+ */ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { bool ret = true; - /* - * We have to make sure to not race with the victim exit path - * and cause premature new oom victim selection: - * oom_reap_task_mm exit_mm - * mmget_not_zero - * mmput - * atomic_dec_and_test - * exit_oom_victim - * [...] - * out_of_memory - * select_bad_process - * # no TIF_MEMDIE task selects new victim - * unmap_page_range # frees some memory - */ - mutex_lock(&oom_lock); - if (!down_read_trylock(&mm->mmap_sem)) { - ret = false; trace_skip_task_reaping(tsk->pid); - goto unlock_oom; - } - - /* - * If the mm has invalidate_{start,end}() notifiers that could block, - * sleep to give the oom victim some more time. - * TODO: we really want to get rid of this ugly hack and make sure that - * notifiers cannot block for unbounded amount of time - */ - if (mm_has_blockable_invalidate_notifiers(mm)) { - up_read(&mm->mmap_sem); - schedule_timeout_idle(HZ); - goto unlock_oom; + return false; } /* @@ -572,25 +556,27 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * down_write();up_write() cycle in exit_mmap(). */ if (test_bit(MMF_OOM_SKIP, &mm->flags)) { - up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); - goto unlock_oom; + goto out_unlock; } trace_start_task_reaping(tsk->pid); - __oom_reap_task_mm(mm); + /* failed to reap part of the address space. Try again later */ + ret = __oom_reap_task_mm(mm); + if (!ret) + goto out_finish; pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), K(get_mm_counter(mm, MM_FILEPAGES)), K(get_mm_counter(mm, MM_SHMEMPAGES))); +out_finish: + trace_finish_task_reaping(tsk->pid); +out_unlock: up_read(&mm->mmap_sem); - trace_finish_task_reaping(tsk->pid); -unlock_oom: - mutex_unlock(&oom_lock); return ret; } @@ -843,68 +829,12 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void oom_kill_process(struct oom_control *oc, const char *message) +static void __oom_kill_process(struct task_struct *victim) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *p; struct mm_struct *mm; - unsigned int victim_points = 0; - static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); bool can_oom_reap = true; - /* - * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just give it access to memory reserves - * so it can die quickly - */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); - return; - } - task_unlock(p); - - if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. 
- */ - read_lock(&tasklist_lock); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - read_unlock(&tasklist_lock); - p = find_lock_task_mm(victim); if (!p) { put_task_struct(victim); @@ -979,6 +909,99 @@ static void oom_kill_process(struct oom_control *oc, const char *message) #undef K /* + * Kill provided task unless it's secured by setting + * oom_score_adj to OOM_SCORE_ADJ_MIN. + */ +static int oom_kill_memcg_member(struct task_struct *task, void *unused) +{ + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + get_task_struct(task); + __oom_kill_process(task); + } + return 0; +} + +static void oom_kill_process(struct oom_control *oc, const char *message) +{ + struct task_struct *p = oc->chosen; + unsigned int points = oc->chosen_points; + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t; + struct mem_cgroup *oom_group; + unsigned int victim_points = 0; + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just give it access to memory reserves + * so it can die quickly + */ + task_lock(p); + if (task_will_free_mem(p)) { + mark_oom_victim(p); + wake_oom_reaper(p); + task_unlock(p); + put_task_struct(p); + return; + } + task_unlock(p); + + if (__ratelimit(&oom_rs)) + dump_header(oc, p); + + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest oom_badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + read_lock(&tasklist_lock); + for_each_thread(p, t) { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; + + if (process_shares_mm(child, p->mm)) + continue; + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, + oc->memcg, oc->nodemask, oc->totalpages); + if (child_points > victim_points) { + put_task_struct(victim); + victim = child; + victim_points = child_points; + get_task_struct(victim); + } + } + } + read_unlock(&tasklist_lock); + + /* + * Do we need to kill the entire memory cgroup? + * Or even one of the ancestor memory cgroups? + * Check this out before killing the victim task. + */ + oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); + + __oom_kill_process(victim); + + /* + * If necessary, kill all tasks in the selected memory cgroup. + */ + if (oom_group) { + mem_cgroup_print_oom_group(oom_group); + mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); + mem_cgroup_put(oom_group); + } +} + +/* * Determines whether the kernel must panic because of the panic_on_oom sysctl. 
*/ static void check_panic_on_oom(struct oom_control *oc, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 15ea511fb41c..e75865d58ba7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2909,10 +2909,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) if (!static_branch_likely(&vm_numa_stat_key)) return; - if (z->node != numa_node_id()) + if (zone_to_nid(z) != numa_node_id()) local_stat = NUMA_OTHER; - if (z->node == preferred_zone->node) + if (zone_to_nid(z) == zone_to_nid(preferred_zone)) __inc_numa_state(z, NUMA_HIT); else { __inc_numa_state(z, NUMA_MISS); @@ -5278,7 +5278,7 @@ int local_memory_node(int node) z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), NULL); - return z->zone->node; + return zone_to_nid(z->zone); } #endif @@ -6120,7 +6120,7 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l return usemapsize / 8; } -static void __init setup_usemap(struct pglist_data *pgdat, +static void __ref setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zone_start_pfn, unsigned long zonesize) @@ -6140,7 +6140,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -void __paginginit set_pageblock_order(void) +void __init set_pageblock_order(void) { unsigned int order; @@ -6168,14 +6168,14 @@ void __paginginit set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -void __paginginit set_pageblock_order(void) +void __init set_pageblock_order(void) { } #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ -static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, - unsigned long present_pages) +static unsigned long __init calc_memmap_size(unsigned long spanned_pages, + unsigned long present_pages) { unsigned long pages = spanned_pages; @@ -6194,39 +6194,99 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; } -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - * - * NOTE: pgdat should get zeroed by caller. 
- */ -static void __paginginit free_area_init_core(struct pglist_data *pgdat) -{ - enum zone_type j; - int nid = pgdat->node_id; - - pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING +static void pgdat_init_numabalancing(struct pglist_data *pgdat) +{ spin_lock_init(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies; +} +#else +static void pgdat_init_numabalancing(struct pglist_data *pgdat) {} #endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pgdat_init_split_queue(struct pglist_data *pgdat) +{ spin_lock_init(&pgdat->split_queue_lock); INIT_LIST_HEAD(&pgdat->split_queue); pgdat->split_queue_len = 0; +} +#else +static void pgdat_init_split_queue(struct pglist_data *pgdat) {} #endif - init_waitqueue_head(&pgdat->kswapd_wait); - init_waitqueue_head(&pgdat->pfmemalloc_wait); + #ifdef CONFIG_COMPACTION +static void pgdat_init_kcompactd(struct pglist_data *pgdat) +{ init_waitqueue_head(&pgdat->kcompactd_wait); +} +#else +static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} #endif + +static void __meminit pgdat_init_internals(struct pglist_data *pgdat) +{ + pgdat_resize_init(pgdat); + + pgdat_init_numabalancing(pgdat); + pgdat_init_split_queue(pgdat); + pgdat_init_kcompactd(pgdat); + + init_waitqueue_head(&pgdat->kswapd_wait); + init_waitqueue_head(&pgdat->pfmemalloc_wait); + pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); +} + +static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, + unsigned long remaining_pages) +{ + zone->managed_pages = remaining_pages; + zone_set_nid(zone, nid); + zone->name = zone_names[idx]; + zone->zone_pgdat = NODE_DATA(nid); + spin_lock_init(&zone->lock); + zone_seqlock_init(zone); + zone_pcp_init(zone); +} + +/* + * Set up the zone data structures + * - init pgdat internals + * - init all zones belonging to this node + * + * NOTE: this function is only called during memory hotplug + */ +#ifdef CONFIG_MEMORY_HOTPLUG +void __ref free_area_init_core_hotplug(int nid) +{ + enum zone_type z; + pg_data_t *pgdat = NODE_DATA(nid); + + pgdat_init_internals(pgdat); + for (z = 0; z < MAX_NR_ZONES; z++) + zone_init_internals(&pgdat->node_zones[z], z, nid, 0); +} +#endif + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + * + * NOTE: pgdat should get zeroed by caller. + * NOTE: this function is only called during early init. + */ +static void __init free_area_init_core(struct pglist_data *pgdat) +{ + enum zone_type j; + int nid = pgdat->node_id; + pgdat_init_internals(pgdat); pgdat->per_cpu_nodestats = &boot_nodestats; for (j = 0; j < MAX_NR_ZONES; j++) { @@ -6274,15 +6334,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. 
*/ - zone->managed_pages = freesize; -#ifdef CONFIG_NUMA - zone->node = nid; -#endif - zone->name = zone_names[j]; - zone->zone_pgdat = pgdat; - spin_lock_init(&zone->lock); - zone_seqlock_init(zone); - zone_pcp_init(zone); + zone_init_internals(zone, j, nid, freesize); if (!size) continue; @@ -6342,8 +6394,24 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } #endif /* CONFIG_FLAT_NODE_MEM_MAP */ -void __paginginit free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, unsigned long *zholes_size) +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void pgdat_set_deferred_range(pg_data_t *pgdat) +{ + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. + */ + pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, + pgdat->node_spanned_pages); + pgdat->first_deferred_pfn = ULONG_MAX; +} +#else +static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} +#endif + +void __init free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, + unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; @@ -6367,16 +6435,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, zones_size, zholes_size); alloc_node_mem_map(pgdat); + pgdat_set_deferred_range(pgdat); -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - /* - * We start only with one section of pages, more pages are added as - * needed until the rest of deferred pages are initialized. - */ - pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, - pgdat->node_spanned_pages); - pgdat->first_deferred_pfn = ULONG_MAX; -#endif free_area_init_core(pgdat); } @@ -6388,7 +6448,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. */ -void __paginginit zero_resv_unavail(void) +void __init zero_resv_unavail(void) { phys_addr_t start, end; unsigned long pfn; @@ -8036,3 +8096,33 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } + +#ifdef CONFIG_MEMORY_FAILURE +/* + * Set PG_hwpoison flag if a given page is confirmed to be a free page. This + * test is performed under the zone lock to prevent a race against page + * allocation. + */ +bool set_hwpoison_free_buddy_page(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + unsigned int order; + bool hwpoisoned = false; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct page *page_head = page - (pfn & ((1 << order) - 1)); + + if (PageBuddy(page_head) && page_order(page_head) >= order) { + if (!TestSetPageHWPoison(page)) + hwpoisoned = true; + break; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + return hwpoisoned; +} +#endif diff --git a/mm/percpu.c b/mm/percpu.c index 0b6480979ac7..a749d4d96e3e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -170,6 +170,14 @@ static LIST_HEAD(pcpu_map_extend_chunks); int pcpu_nr_empty_pop_pages; /* + * The number of populated pages in use by the allocator, protected by + * pcpu_lock. This number is kept per a unit per chunk (i.e. 
when a page gets
+ * allocated/deallocated, it is allocated/deallocated in all units of a chunk
+ * and increments/decrements this count by 1).
+ */
+static unsigned long pcpu_nr_populated;
+
+/*
  * Balance work is used to populate or destroy chunks asynchronously. We
  * try to keep the number of populated free pages between
  * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
@@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
 	bitmap_set(chunk->populated, page_start, nr);
 	chunk->nr_populated += nr;
+	pcpu_nr_populated += nr;
 
 	if (!for_alloc) {
 		chunk->nr_empty_pop_pages += nr;
@@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
 	chunk->nr_populated -= nr;
 	chunk->nr_empty_pop_pages -= nr;
 	pcpu_nr_empty_pop_pages -= nr;
+	pcpu_nr_populated -= nr;
 }
 
 /*
@@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
+	/* include all regions of the first chunk */
+	pcpu_nr_populated += PFN_DOWN(size_sum);
+
 	pcpu_stats_chunk_alloc();
 	trace_percpu_create_chunk(base_addr);
 
@@ -2746,6 +2759,22 @@ void __init setup_per_cpu_areas(void)
 #endif	/* CONFIG_SMP */
 
 /*
+ * pcpu_nr_pages - calculate total number of populated backing pages
+ *
+ * This reflects the number of pages populated to back chunks. Metadata is
+ * excluded in the number exposed in meminfo as the number of backing pages
+ * scales with the number of cpus and can quickly outweigh the memory used for
+ * metadata. It also keeps this calculation nice and simple.
+ *
+ * RETURNS:
+ * Total number of populated backing pages in use by the allocator.
+ */
+unsigned long pcpu_nr_pages(void)
+{
+	return pcpu_nr_populated * pcpu_nr_units;
+}
+
+/*
  * Percpu allocator is initialized early during boot when neither slab or
  * workqueue is available. Plug async management until everything is up
  * and running.
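The hunk above only introduces the pcpu_nr_populated counter and the pcpu_nr_pages() accessor; it does not show a consumer. Below is a minimal sketch of how a meminfo-style reader might report the value, assuming pcpu_nr_pages() is exposed via <linux/percpu.h>; the helper name report_percpu_pages() and the "Percpu:" label are illustrative and not part of this patch.

#include <linux/percpu.h>
#include <linux/seq_file.h>

/* Sketch only: pcpu_nr_pages() already sums pages across all units of
 * every chunk, so converting to kilobytes is a simple shift. */
static void report_percpu_pages(struct seq_file *m)
{
	seq_printf(m, "Percpu:         %8lu kB\n",
		   pcpu_nr_pages() << (PAGE_SHIFT - 10));
}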
diff --git a/mm/shmem.c b/mm/shmem.c
index c48c79018a7c..0376c124b043 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -124,7 +124,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 		struct page **pagep, enum sgp_type sgp,
 		gfp_t gfp, struct vm_area_struct *vma,
-		struct vm_fault *vmf, int *fault_type);
+		struct vm_fault *vmf, vm_fault_t *fault_type);
 
 int shmem_getpage(struct inode *inode, pgoff_t index,
 		struct page **pagep, enum sgp_type sgp)
@@ -1421,7 +1421,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
 		struct shmem_inode_info *info, pgoff_t index)
 {
 	/* Create a pseudo vma that just contains the policy */
-	memset(vma, 0, sizeof(*vma));
 	vma_init(vma, NULL);
 	/* Bias interleave by inode number to distribute better across nodes */
 	vma->vm_pgoff = index + info->vfs_inode.i_ino;
@@ -1621,7 +1620,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
  */
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
-	struct vm_area_struct *vma, struct vm_fault *vmf, int *fault_type)
+	struct vm_area_struct *vma, struct vm_fault *vmf,
+	vm_fault_t *fault_type)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 008ccb22fee6..63a7b4563a57 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
 	cache->cur = 0;
 	if (swap_slot_cache_active)
-		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
-					   cache->slots);
+		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
+					   cache->slots, 1);
 
 	return cache->nr;
 }
@@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page)
 
 	if (PageTransHuge(page)) {
 		if (IS_ENABLED(CONFIG_THP_SWAP))
-			get_swap_pages(1, true, &entry);
+			get_swap_pages(1, &entry, HPAGE_PMD_NR);
 		goto out;
 	}
 
@@ -350,7 +350,7 @@ repeat:
 			goto out;
 	}
 
-	get_swap_pages(1, false, &entry);
+	get_swap_pages(1, &entry, 1);
 out:
 	if (mem_cgroup_try_charge_swap(page, entry)) {
 		put_swap_page(page, entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8837b22c848d..d954b71c4f9c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -204,8 +204,16 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 
 #ifdef CONFIG_THP_SWAP
 #define SWAPFILE_CLUSTER	HPAGE_PMD_NR
+
+#define swap_entry_size(size)	(size)
 #else
 #define SWAPFILE_CLUSTER	256
+
+/*
+ * Define swap_entry_size() as constant to let compiler to optimize
+ * out some code if !CONFIG_THP_SWAP
+ */
+#define swap_entry_size(size)	1
 #endif
 #define LATENCY_LIMIT		256
 
@@ -269,7 +277,9 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
 
 static inline bool cluster_is_huge(struct swap_cluster_info *info)
 {
-	return info->flags & CLUSTER_FLAG_HUGE;
+	if (IS_ENABLED(CONFIG_THP_SWAP))
+		return info->flags & CLUSTER_FLAG_HUGE;
+	return false;
 }
 
 static inline void cluster_clear_huge(struct swap_cluster_info *info)
@@ -296,13 +306,18 @@ static inline void unlock_cluster(struct swap_cluster_info *ci)
 		spin_unlock(&ci->lock);
 }
 
+/*
+ * Determine the locking method in use for this device. Return
+ * swap_cluster_info if SSD-style cluster-based locking is in place.
+ */ static inline struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, - unsigned long offset) + struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; + /* Try to use fine-grained SSD-style locking if available: */ ci = lock_cluster(si, offset); + /* Otherwise, fall back to traditional, coarse locking: */ if (!ci) spin_lock(&si->lock); @@ -863,7 +878,6 @@ no_page: return n_ret; } -#ifdef CONFIG_THP_SWAP static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) { unsigned long idx; @@ -871,6 +885,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) unsigned long offset, i; unsigned char *map; + /* + * Should not even be attempting cluster allocations when huge + * page swap is disabled. Warn and fail the allocation. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP)) { + VM_WARN_ON_ONCE(1); + return 0; + } + if (cluster_list_empty(&si->free_clusters)) return 0; @@ -901,13 +924,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) unlock_cluster(ci); swap_range_free(si, offset, SWAPFILE_CLUSTER); } -#else -static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) -{ - VM_WARN_ON_ONCE(1); - return 0; -} -#endif /* CONFIG_THP_SWAP */ static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) @@ -924,18 +940,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) { - unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1; + unsigned long size = swap_entry_size(entry_size); struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; /* Only single cluster request supported */ - WARN_ON_ONCE(n_goal > 1 && cluster); + WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); - avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages; + avail_pgs = atomic_long_read(&nr_swap_pages) / size; if (avail_pgs <= 0) goto noswap; @@ -945,7 +961,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) if (n_goal > avail_pgs) n_goal = avail_pgs; - atomic_long_sub(n_goal * nr_pages, &nr_swap_pages); + atomic_long_sub(n_goal * size, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -972,14 +988,14 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - if (cluster) { + if (size == SWAPFILE_CLUSTER) { if (!(si->flags & SWP_FILE)) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries); spin_unlock(&si->lock); - if (n_ret || cluster) + if (n_ret || size == SWAPFILE_CLUSTER) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); @@ -1005,7 +1021,7 @@ nextsi: check_out: if (n_ret < n_goal) - atomic_long_add((long)(n_goal - n_ret) * nr_pages, + atomic_long_add((long)(n_goal - n_ret) * size, &nr_swap_pages); noswap: return n_ret; @@ -1107,16 +1123,13 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, return p; } -static unsigned char __swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) +static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, + unsigned long offset, + unsigned char usage) { - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; - ci = lock_cluster_or_swap_info(p, offset); - count = p->swap_map[offset]; 
has_cache = count & SWAP_HAS_CACHE; @@ -1144,6 +1157,17 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, usage = count | has_cache; p->swap_map[offset] = usage ? : SWAP_HAS_CACHE; + return usage; +} + +static unsigned char __swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + ci = lock_cluster_or_swap_info(p, offset); + usage = __swap_entry_free_locked(p, offset, usage); unlock_cluster_or_swap_info(p, ci); return usage; @@ -1184,19 +1208,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -static void swapcache_free(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - if (p) { - if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE)) - free_swap_slot(entry); - } -} - -#ifdef CONFIG_THP_SWAP -static void swapcache_free_cluster(swp_entry_t entry) +void put_swap_page(struct page *page, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1205,42 +1217,48 @@ static void swapcache_free_cluster(swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; + int size = swap_entry_size(hpage_nr_pages(page)); si = _swap_info_get(entry); if (!si) return; - ci = lock_cluster(si, offset); - VM_BUG_ON(!cluster_is_huge(ci)); - map = si->swap_map + offset; - for (i = 0; i < SWAPFILE_CLUSTER; i++) { - val = map[i]; - VM_BUG_ON(!(val & SWAP_HAS_CACHE)); - if (val == SWAP_HAS_CACHE) - free_entries++; - } - if (!free_entries) { - for (i = 0; i < SWAPFILE_CLUSTER; i++) - map[i] &= ~SWAP_HAS_CACHE; + ci = lock_cluster_or_swap_info(si, offset); + if (size == SWAPFILE_CLUSTER) { + VM_BUG_ON(!cluster_is_huge(ci)); + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } + cluster_clear_huge(ci); + if (free_entries == SWAPFILE_CLUSTER) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); + ci = lock_cluster(si, offset); + memset(map, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); + return; + } } - cluster_clear_huge(ci); - unlock_cluster(ci); - if (free_entries == SWAPFILE_CLUSTER) { - spin_lock(&si->lock); - ci = lock_cluster(si, offset); - memset(map, 0, SWAPFILE_CLUSTER); - unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); - } else if (free_entries) { - for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) { - if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE)) - free_swap_slot(entry); + for (i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { + unlock_cluster_or_swap_info(si, ci); + free_swap_slot(entry); + if (i == size - 1) + return; + lock_cluster_or_swap_info(si, offset); } } + unlock_cluster_or_swap_info(si, ci); } +#ifdef CONFIG_THP_SWAP int split_swap_cluster(swp_entry_t entry) { struct swap_info_struct *si; @@ -1255,19 +1273,7 @@ int split_swap_cluster(swp_entry_t entry) unlock_cluster(ci); return 0; } -#else -static inline void swapcache_free_cluster(swp_entry_t entry) -{ -} -#endif /* CONFIG_THP_SWAP */ - -void put_swap_page(struct page *page, swp_entry_t entry) -{ - if (!PageTransHuge(page)) - swapcache_free(entry); - else - 
swapcache_free_cluster(entry); -} +#endif static int swp_entry_cmp(const void *ent1, const void *ent2) { @@ -1409,7 +1415,6 @@ out: return count; } -#ifdef CONFIG_THP_SWAP static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry) { @@ -1422,12 +1427,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, ci = lock_cluster_or_swap_info(si, offset); if (!ci || !cluster_is_huge(ci)) { - if (map[roffset] != SWAP_HAS_CACHE) + if (swap_count(map[roffset])) ret = true; goto unlock_out; } for (i = 0; i < SWAPFILE_CLUSTER; i++) { - if (map[offset + i] != SWAP_HAS_CACHE) { + if (swap_count(map[offset + i])) { ret = true; break; } @@ -1442,7 +1447,7 @@ static bool page_swapped(struct page *page) swp_entry_t entry; struct swap_info_struct *si; - if (likely(!PageTransCompound(page))) + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) return page_swapcount(page) != 0; page = compound_head(page); @@ -1466,10 +1471,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); - if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { + mapcount = page_trans_huge_mapcount(page, total_mapcount); if (PageSwapCache(page)) swapcount = page_swapcount(page); if (total_swapcount) @@ -1516,26 +1519,6 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, return map_swapcount; } -#else -#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) -#define page_swapped(page) (page_swapcount(page) != 0) - -static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, - int *total_swapcount) -{ - int mapcount, swapcount = 0; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - mapcount = page_trans_huge_mapcount(page, total_mapcount); - if (PageSwapCache(page)) - swapcount = page_swapcount(page); - if (total_swapcount) - *total_swapcount = swapcount; - return mapcount + swapcount; -} -#endif /* * We can write to an anon page without COW if there are no other references diff --git a/mm/util.c b/mm/util.c index 3351659200e6..d2890a407332 100644 --- a/mm/util.c +++ b/mm/util.c @@ -196,7 +196,7 @@ void *vmemdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(vmemdup_user); -/* +/** * strndup_user - duplicate an existing string from user space * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. @@ -434,6 +434,13 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) } EXPORT_SYMBOL(kvmalloc_node); +/** + * kvfree - free memory allocated with kvmalloc + * @addr: pointer returned by kvmalloc + * + * If the memory is allocated from vmalloc area it is freed with vfree(). + * Otherwise kfree() is used. 
+ */
 void kvfree(const void *addr)
 {
 	if (is_vmalloc_addr(addr))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4375b1e9bd56..7e7d25504651 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -408,7 +408,8 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 #ifdef CONFIG_MEMCG_KMEM
-	idr_replace(&shrinker_idr, shrinker, shrinker->id);
+	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+		idr_replace(&shrinker_idr, shrinker, shrinker->id);
 #endif
 	up_write(&shrinker_rwsem);
 }
@@ -902,7 +903,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		refcount = 2;
 	if (!page_ref_freeze(page, refcount))
 		goto cannot_free;
-	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+	/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
 	if (unlikely(PageDirty(page))) {
 		page_ref_unfreeze(page, refcount);
 		goto cannot_free;
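The kvfree() kernel-doc added in mm/util.c above pairs with kvmalloc(); a minimal usage sketch follows. Only kvmalloc_array(), __GFP_ZERO and kvfree() are existing kernel APIs here; the table-allocation helpers are illustrative names, not part of this patch.

#include <linux/mm.h>
#include <linux/slab.h>

/* Sketch only: kvmalloc_array() may be satisfied by the slab allocator or
 * fall back to vmalloc for large sizes; kvfree() picks the matching release
 * path, which is exactly what the new kernel-doc above describes. */
static u32 *alloc_index_table(size_t nents)
{
	return kvmalloc_array(nents, sizeof(u32), GFP_KERNEL | __GFP_ZERO);
}

static void free_index_table(u32 *table)
{
	kvfree(table);		/* vfree() or kfree(), as appropriate */
}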