diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c | 83 | ||||
-rw-r--r-- | mm/filemap.c | 8 | ||||
-rw-r--r-- | mm/frame_vector.c | 5 | ||||
-rw-r--r-- | mm/gup.c | 25 | ||||
-rw-r--r-- | mm/huge_memory.c | 36 | ||||
-rw-r--r-- | mm/hugetlb.c | 5 | ||||
-rw-r--r-- | mm/internal.h | 12 | ||||
-rw-r--r-- | mm/kasan/kasan.c | 8 | ||||
-rw-r--r-- | mm/khugepaged.c | 11 | ||||
-rw-r--r-- | mm/kmemcheck.c | 2 | ||||
-rw-r--r-- | mm/ksm.c | 3 | ||||
-rw-r--r-- | mm/memblock.c | 23 | ||||
-rw-r--r-- | mm/memcontrol.c | 2 | ||||
-rw-r--r-- | mm/memory-failure.c | 20 | ||||
-rw-r--r-- | mm/memory.c | 78 | ||||
-rw-r--r-- | mm/mlock.c | 5 | ||||
-rw-r--r-- | mm/mmap.c | 160 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 14 | ||||
-rw-r--r-- | mm/nommu.c | 8 | ||||
-rw-r--r-- | mm/page_alloc.c | 199 | ||||
-rw-r--r-- | mm/page_isolation.c | 5 | ||||
-rw-r--r-- | mm/rmap.c | 4 | ||||
-rw-r--r-- | mm/slab.c | 6 | ||||
-rw-r--r-- | mm/slab.h | 4 | ||||
-rw-r--r-- | mm/slab_common.c | 6 | ||||
-rw-r--r-- | mm/slob.c | 6 | ||||
-rw-r--r-- | mm/slub.c | 18 | ||||
-rw-r--r-- | mm/swap_cgroup.c | 3 | ||||
-rw-r--r-- | mm/swap_slots.c | 19 | ||||
-rw-r--r-- | mm/swap_state.c | 2 | ||||
-rw-r--r-- | mm/swapfile.c | 10 | ||||
-rw-r--r-- | mm/truncate.c | 21 | ||||
-rw-r--r-- | mm/util.c | 61 | ||||
-rw-r--r-- | mm/vmalloc.c | 30 | ||||
-rw-r--r-- | mm/vmpressure.c | 6 | ||||
-rw-r--r-- | mm/vmscan.c | 38 | ||||
-rw-r--r-- | mm/vmstat.c | 2 |
37 files changed, 604 insertions, 344 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index 09c5282ebdd2..613c59e928cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,11 +89,6 @@ static void map_pages(struct list_head *list) list_splice(&tmp_list, list); } -static inline bool migrate_async_suitable(int migratetype) -{ - return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; -} - #ifdef CONFIG_COMPACTION int PageMovable(struct page *page) @@ -988,6 +983,22 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION +static bool suitable_migration_source(struct compact_control *cc, + struct page *page) +{ + int block_mt; + + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) + return true; + + block_mt = get_pageblock_migratetype(page); + + if (cc->migratetype == MIGRATE_MOVABLE) + return is_migrate_movable(block_mt); + else + return block_mt == cc->migratetype; +} + /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct compact_control *cc, struct page *page) @@ -1007,7 +1018,7 @@ static bool suitable_migration_target(struct compact_control *cc, return true; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(get_pageblock_migratetype(page))) + if (is_migrate_movable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ @@ -1242,8 +1253,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * Async compaction is optimistic to see if the minimum amount * of work satisfies the allocation. */ - if (cc->mode == MIGRATE_ASYNC && - !migrate_async_suitable(get_pageblock_migratetype(page))) + if (!suitable_migration_source(cc, page)) continue; /* Perform the isolation */ @@ -1276,11 +1286,11 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result __compact_finished(struct zone *zone, + struct compact_control *cc) { unsigned int order; - unsigned long watermark; + const int migratetype = cc->migratetype; if (cc->contended || fatal_signal_pending(current)) return COMPACT_CONTENDED; @@ -1308,12 +1318,16 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; - /* Compaction run is not finished if the watermark is not met */ - watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; - - if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, - cc->alloc_flags)) - return COMPACT_CONTINUE; + if (cc->finishing_block) { + /* + * We have finished the pageblock, but better check again that + * we really succeeded. + */ + if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + cc->finishing_block = false; + else + return COMPACT_CONTINUE; + } /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { @@ -1335,20 +1349,40 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ * other migratetype buddy lists. */ if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) - return COMPACT_SUCCESS; + true, &can_steal) != -1) { + + /* movable pages are OK in any pageblock */ + if (migratetype == MIGRATE_MOVABLE) + return COMPACT_SUCCESS; + + /* + * We are stealing for a non-movable allocation. Make + * sure we finish compacting the current pageblock + * first so it is as free as possible and we won't + * have to steal another one soon. This only applies + * to sync compaction, as async compaction operates + * on pageblocks of the same migratetype. + */ + if (cc->mode == MIGRATE_ASYNC || + IS_ALIGNED(cc->migrate_pfn, + pageblock_nr_pages)) { + return COMPACT_SUCCESS; + } + + cc->finishing_block = true; + return COMPACT_CONTINUE; + } } return COMPACT_NO_SUITABLE_PAGE; } static enum compact_result compact_finished(struct zone *zone, - struct compact_control *cc, - const int migratetype) + struct compact_control *cc) { int ret; - ret = __compact_finished(zone, cc, migratetype); + ret = __compact_finished(zone, cc); trace_mm_compaction_finished(zone, cc->order, ret); if (ret == COMPACT_NO_SUITABLE_PAGE) ret = COMPACT_CONTINUE; @@ -1481,9 +1515,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); - const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ @@ -1533,8 +1567,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro migrate_prep_local(); - while ((ret = compact_finished(zone, cc, migratetype)) == - COMPACT_CONTINUE) { + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { int err; switch (isolate_migratepages(zone, cc)) { diff --git a/mm/filemap.c b/mm/filemap.c index 681da61080bc..6f1be573a5e6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2050,7 +2050,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += retval; count -= retval; } - iov_iter_revert(iter, iov_iter_count(iter) - count); + iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter @@ -2791,12 +2791,6 @@ ssize_t generic_perform_write(struct file *file, ssize_t written = 0; unsigned int flags = 0; - /* - * Copies from kernel address space cannot fail (NFSD is a big user). - */ - if (!iter_is_iovec(i)) - flags |= AOP_FLAG_UNINTERRUPTIBLE; - do { struct page *page; unsigned long offset; /* Offset into pagecache page */ diff --git a/mm/frame_vector.c b/mm/frame_vector.c index db77dcb38afd..72ebec18629c 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -200,10 +200,7 @@ struct frame_vector *frame_vector_create(unsigned int nr_frames) * Avoid higher order allocations, use vmalloc instead. It should * be rare anyway. */ - if (size <= PAGE_SIZE) - vec = kmalloc(size, GFP_KERNEL); - else - vec = vmalloc(size); + vec = kvmalloc(size, GFP_KERNEL); if (!vec) return NULL; vec->nr_allocated = nr_frames; @@ -387,11 +387,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, /* mlock all present pages, but do not fault in new pages */ if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK) return -ENOENT; - /* For mm_populate(), just skip the stack guard page. */ - if ((*flags & FOLL_POPULATE) && - (stack_guard_page_start(vma, address) || - stack_guard_page_end(vma, address + PAGE_SIZE))) - return -ENOENT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) @@ -407,12 +402,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, ret = handle_mm_fault(vma, address, fault_flags); if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; - if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) - return -EFAULT; + int err = vm_fault_to_errno(ret, *flags); + + if (err) + return err; BUG(); } @@ -723,12 +716,10 @@ retry: ret = handle_mm_fault(vma, address, fault_flags); major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return -EHWPOISON; - if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) - return -EFAULT; + int err = vm_fault_to_errno(ret, 0); + + if (err) + return err; BUG(); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b787c4cfda0e..88c6167f194d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -715,7 +715,8 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write) + pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, + pgtable_t pgtable) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; @@ -729,6 +730,12 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); } + + if (pgtable) { + pgtable_trans_huge_deposit(mm, pmd, pgtable); + atomic_long_inc(&mm->nr_ptes); + } + set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); spin_unlock(ptl); @@ -738,6 +745,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, bool write) { pgprot_t pgprot = vma->vm_page_prot; + pgtable_t pgtable = NULL; /* * If we had pmd_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that @@ -752,9 +760,15 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm, addr); + if (!pgtable) + return VM_FAULT_OOM; + } + track_pfn_insert(vma, &pgprot, pfn); - insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); + insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); @@ -1412,8 +1426,11 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) */ if (unlikely(pmd_trans_migrating(*vmf->pmd))) { page = pmd_page(*vmf->pmd); + if (!get_page_unless_zero(page)) + goto out_unlock; spin_unlock(vmf->ptl); wait_on_page_locked(page); + put_page(page); goto out; } @@ -1445,9 +1462,12 @@ int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { + page_nid = -1; + if (!get_page_unless_zero(page)) + goto out_unlock; spin_unlock(vmf->ptl); wait_on_page_locked(page); - page_nid = -1; + put_page(page); goto out; } @@ -1611,12 +1631,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (vma_is_dax(vma)) { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else if (is_huge_zero_pmd(orig_pmd)) { - pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); - atomic_long_dec(&tlb->mm->nr_ptes); + zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { @@ -1625,10 +1646,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) { - pgtable_t pgtable; - pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); - pte_free(tlb->mm, pgtable); - atomic_long_dec(&tlb->mm->nr_ptes); + zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { if (arch_needs_pgtable_deposit()) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e5828875f7bb..3eedb187e549 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4170,6 +4170,11 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, } ret = hugetlb_fault(mm, vma, vaddr, fault_flags); if (ret & VM_FAULT_ERROR) { + int err = vm_fault_to_errno(ret, flags); + + if (err) + return err; + remainder = 0; break; } diff --git a/mm/internal.h b/mm/internal.h index 04d08ef91224..0e4f558412fb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -183,6 +183,7 @@ extern int user_min_free_kbytes; struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ + struct zone *zone; unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long total_migrate_scanned; @@ -190,17 +191,18 @@ struct compact_control { unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ + const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + int order; /* order a direct compactor needs */ + int migratetype; /* migratetype of direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ + const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ - int order; /* order a direct compactor needs */ - const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ - struct zone *zone; bool contended; /* Signal lock or sched contention */ + bool finishing_block; /* Finishing current pageblock */ }; unsigned long diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 9348d27088c1..c81549d5c833 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -413,7 +413,7 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, *size += sizeof(struct kasan_alloc_meta); /* Add free meta. */ - if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || + if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || cache->object_size < sizeof(struct kasan_free_meta)) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); @@ -561,7 +561,7 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return; kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); @@ -572,7 +572,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) s8 shadow_byte; /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); @@ -691,7 +691,7 @@ int kasan_module_alloc(void *addr, size_t size) ret = __vmalloc_node_range(shadow_size, 1, shadow_start, shadow_start + shadow_size, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7cb9c88bb4a3..945fd1ca49b5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -612,7 +612,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, spinlock_t *ptl) { pte_t *_pte; - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, page++, address += PAGE_SIZE) { pte_t pteval = *_pte; struct page *src_page; @@ -651,9 +652,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, spin_unlock(ptl); free_page_and_swap_cache(src_page); } - - address += PAGE_SIZE; - page++; + cond_resched(); } } @@ -907,8 +906,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return false; } /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) + if (mm_find_pmd(mm, address) != pmd) { + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; + } } if (ret & VM_FAULT_ERROR) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 5bf191756a4a..2d5959c5f7c5 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) { /* TODO: RCU freeing is unsupported for now; hide false positives. */ - if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) + if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) kmemcheck_mark_freed(object, size); } @@ -1028,8 +1028,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, goto out; if (PageTransCompound(page)) { - err = split_huge_page(page); - if (err) + if (split_huge_page(page)) goto out_unlock; } diff --git a/mm/memblock.c b/mm/memblock.c index b049c9b2dba8..7b8a5db76a2f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1739,6 +1739,29 @@ static void __init_memblock memblock_dump(struct memblock_type *type) } } +extern unsigned long __init_memblock +memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) +{ + struct memblock_region *rgn; + unsigned long size = 0; + int idx; + + for_each_memblock_type((&memblock.reserved), rgn) { + phys_addr_t start, end; + + if (rgn->base + rgn->size < start_addr) + continue; + if (rgn->base > end_addr) + continue; + + start = rgn->base; + end = start + rgn->size; + size += end - start; + } + + return size; +} + void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ff73899af61a..94172089f52f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5528,7 +5528,7 @@ static void uncharge_list(struct list_head *page_list) next = page->lru.next; VM_BUG_ON_PAGE(PageLRU(page), page); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page); if (!page->mem_cgroup) continue; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 73066b80d14a..ecc183fd94f3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -539,6 +539,13 @@ static int delete_from_lru_cache(struct page *p) */ ClearPageActive(p); ClearPageUnevictable(p); + + /* + * Poisoned page might never drop its ref count to 0 so we have + * to uncharge it manually from its memcg. + */ + mem_cgroup_uncharge(p); + /* * drop the page count elevated by isolate_lru_page() */ @@ -1177,7 +1184,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * page_remove_rmap() in try_to_unmap_one(). So to determine page status * correctly, we save a copy of the page flags at this time. */ - page_flags = p->flags; + if (PageHuge(p)) + page_flags = hpage->flags; + else + page_flags = p->flags; /* * unpoison always clear PG_hwpoison inside page lock @@ -1588,12 +1598,8 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", pfn, ret, page->flags, &page->flags); - /* - * We know that soft_offline_huge_page() tries to migrate - * only one hugepage pointed to by hpage, so we need not - * run through the pagelist here. - */ - putback_active_hugepage(hpage); + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); if (ret > 0) ret = -EIO; } else { diff --git a/mm/memory.c b/mm/memory.c index 6ff5d729ded0..bb11c474857e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2855,40 +2855,6 @@ out_release: } /* - * This is like a special single-page "expand_{down|up}wards()", - * except we must first make sure that 'address{-|+}PAGE_SIZE' - * doesn't hit another vma. - */ -static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) -{ - address &= PAGE_MASK; - if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { - struct vm_area_struct *prev = vma->vm_prev; - - /* - * Is there a mapping abutting this one below? - * - * That's only ok if it's the same stack mapping - * that has gotten split.. - */ - if (prev && prev->vm_end == address) - return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; - - return expand_downwards(vma, address - PAGE_SIZE); - } - if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { - struct vm_area_struct *next = vma->vm_next; - - /* As VM_GROWSDOWN but s/below/above/ */ - if (next && next->vm_start == address + PAGE_SIZE) - return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; - - return expand_upwards(vma, address + PAGE_SIZE); - } - return 0; -} - -/* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. @@ -2904,10 +2870,6 @@ static int do_anonymous_page(struct vm_fault *vmf) if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; - /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, vmf->address) < 0) - return VM_FAULT_SIGSEGV; - /* * Use pte_alloc() instead of pte_alloc_map(). We can't run * pte_offset_map() on pmds where a huge pmd might be created @@ -3029,6 +2991,17 @@ static int __do_fault(struct vm_fault *vmf) return ret; } +/* + * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. + * If we check pmd_trans_unstable() first we will trip the bad_pmd() check + * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly + * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. + */ +static int pmd_devmap_trans_unstable(pmd_t *pmd) +{ + return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); +} + static int pte_alloc_one_map(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -3052,18 +3025,27 @@ static int pte_alloc_one_map(struct vm_fault *vmf) map_pte: /* * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd - * didn't become pmd_trans_huge under us and then back to pmd_none, as - * a result of MADV_DONTNEED running immediately after a huge pmd fault - * in a different thread of this mm, in turn leading to a misleading - * pmd_trans_huge() retval. All we have to ensure is that it is a - * regular pmd that we can walk with pte_offset_map() and we can do that - * through an atomic read in C, which is what pmd_trans_unstable() - * provides. + * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of + * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge + * under us and then back to pmd_none, as a result of MADV_DONTNEED + * running immediately after a huge pmd fault in a different thread of + * this mm, in turn leading to a misleading pmd_trans_huge() retval. + * All we have to ensure is that it is a regular pmd that we can walk + * with pte_offset_map() and we can do that through an atomic read in + * C, which is what pmd_trans_unstable() provides. */ - if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) + if (pmd_devmap_trans_unstable(vmf->pmd)) return VM_FAULT_NOPAGE; + /* + * At this point we know that our vmf->pmd points to a page of ptes + * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge() + * for the duration of the fault. If a racing MADV_DONTNEED runs and + * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still + * be valid and we will re-check to make sure the vmf->pte isn't + * pte_none() under vmf->ptl protection when we return to + * alloc_set_pte(). + */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); return 0; @@ -3690,7 +3672,7 @@ static int handle_pte_fault(struct vm_fault *vmf) vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ - if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) + if (pmd_devmap_trans_unstable(vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge diff --git a/mm/mlock.c b/mm/mlock.c index c483c5c20b4b..b562b5523a65 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -284,7 +284,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) { int i; int nr = pagevec_count(pvec); - int delta_munlocked; + int delta_munlocked = -nr; struct pagevec pvec_putback; int pgrescued = 0; @@ -304,6 +304,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) continue; else __munlock_isolation_failed(page); + } else { + delta_munlocked++; } /* @@ -315,7 +317,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; } - delta_munlocked = -nr + pagevec_count(&pvec_putback); __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); spin_unlock_irq(zone_lru_lock(zone)); diff --git a/mm/mmap.c b/mm/mmap.c index f82741e199c0..a5e3dcd75e79 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -183,6 +183,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) unsigned long retval; unsigned long newbrk, oldbrk; struct mm_struct *mm = current->mm; + struct vm_area_struct *next; unsigned long min_brk; bool populate; LIST_HEAD(uf); @@ -229,7 +230,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) } /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) + next = find_vma(mm, oldbrk); + if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; /* Ok, looks good - let it rip. */ @@ -253,10 +255,22 @@ out: static long vma_compute_subtree_gap(struct vm_area_struct *vma) { - unsigned long max, subtree_gap; - max = vma->vm_start; - if (vma->vm_prev) - max -= vma->vm_prev->vm_end; + unsigned long max, prev_end, subtree_gap; + + /* + * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we + * allow two stack_guard_gaps between them here, and when choosing + * an unmapped area; whereas when expanding we only require one. + * That's a little inconsistent, but keeps the code here simpler. + */ + max = vm_start_gap(vma); + if (vma->vm_prev) { + prev_end = vm_end_gap(vma->vm_prev); + if (max > prev_end) + max -= prev_end; + else + max = 0; + } if (vma->vm_rb.rb_left) { subtree_gap = rb_entry(vma->vm_rb.rb_left, struct vm_area_struct, vm_rb)->rb_subtree_gap; @@ -352,7 +366,7 @@ static void validate_mm(struct mm_struct *mm) anon_vma_unlock_read(anon_vma); } - highest_address = vma->vm_end; + highest_address = vm_end_gap(vma); vma = vma->vm_next; i++; } @@ -541,7 +555,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, if (vma->vm_next) vma_gap_update(vma->vm_next); else - mm->highest_vm_end = vma->vm_end; + mm->highest_vm_end = vm_end_gap(vma); /* * vma->vm_prev wasn't known when we followed the rbtree to find the @@ -856,7 +870,7 @@ again: vma_gap_update(vma); if (end_changed) { if (!next) - mm->highest_vm_end = end; + mm->highest_vm_end = vm_end_gap(vma); else if (!adjust_next) vma_gap_update(next); } @@ -941,7 +955,7 @@ again: * mm->highest_vm_end doesn't need any update * in remove_next == 1 case. */ - VM_WARN_ON(mm->highest_vm_end != end); + VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); } } if (insert && file) @@ -1787,7 +1801,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info) while (true) { /* Visit left subtree if it looks promising */ - gap_end = vma->vm_start; + gap_end = vm_start_gap(vma); if (gap_end >= low_limit && vma->vm_rb.rb_left) { struct vm_area_struct *left = rb_entry(vma->vm_rb.rb_left, @@ -1798,12 +1812,13 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info) } } - gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; check_current: /* Check if current node has a suitable gap */ if (gap_start > high_limit) return -ENOMEM; - if (gap_end >= low_limit && gap_end - gap_start >= length) + if (gap_end >= low_limit && + gap_end > gap_start && gap_end - gap_start >= length) goto found; /* Visit right subtree if it looks promising */ @@ -1825,8 +1840,8 @@ check_current: vma = rb_entry(rb_parent(prev), struct vm_area_struct, vm_rb); if (prev == vma->vm_rb.rb_left) { - gap_start = vma->vm_prev->vm_end; - gap_end = vma->vm_start; + gap_start = vm_end_gap(vma->vm_prev); + gap_end = vm_start_gap(vma); goto check_current; } } @@ -1890,7 +1905,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) while (true) { /* Visit right subtree if it looks promising */ - gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0; if (gap_start <= high_limit && vma->vm_rb.rb_right) { struct vm_area_struct *right = rb_entry(vma->vm_rb.rb_right, @@ -1903,10 +1918,11 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) check_current: /* Check if current node has a suitable gap */ - gap_end = vma->vm_start; + gap_end = vm_start_gap(vma); if (gap_end < low_limit) return -ENOMEM; - if (gap_start <= high_limit && gap_end - gap_start >= length) + if (gap_start <= high_limit && + gap_end > gap_start && gap_end - gap_start >= length) goto found; /* Visit left subtree if it looks promising */ @@ -1929,7 +1945,7 @@ check_current: struct vm_area_struct, vm_rb); if (prev == vma->vm_rb.rb_right) { gap_start = vma->vm_prev ? - vma->vm_prev->vm_end : 0; + vm_end_gap(vma->vm_prev) : 0; goto check_current; } } @@ -1967,7 +1983,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info; if (len > TASK_SIZE - mmap_min_addr) @@ -1978,9 +1994,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); + vma = find_vma_prev(mm, addr, &prev); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) return addr; } @@ -2003,7 +2020,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, const unsigned long len, const unsigned long pgoff, const unsigned long flags) { - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; unsigned long addr = addr0; struct vm_unmapped_area_info info; @@ -2018,9 +2035,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); + vma = find_vma_prev(mm, addr, &prev); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) return addr; } @@ -2155,21 +2173,19 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, * update accounting. This is shared with both the * grow-up and grow-down cases. */ -static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) +static int acct_stack_growth(struct vm_area_struct *vma, + unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; struct rlimit *rlim = current->signal->rlim; - unsigned long new_start, actual_size; + unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ - actual_size = size; - if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) - actual_size -= PAGE_SIZE; - if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) + if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) return -ENOMEM; /* mlock limit tests */ @@ -2207,16 +2223,32 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *next; + unsigned long gap_addr; int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* Guard against wrapping around to address 0. */ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else + /* Guard against exceeding limits of the address space. */ + address &= PAGE_MASK; + if (address >= TASK_SIZE) return -ENOMEM; + address += PAGE_SIZE; + + /* Enforce stack_guard_gap */ + gap_addr = address + stack_guard_gap; + + /* Guard against overflow */ + if (gap_addr < address || gap_addr > TASK_SIZE) + gap_addr = TASK_SIZE; + + next = vma->vm_next; + if (next && next->vm_start < gap_addr) { + if (!(next->vm_flags & VM_GROWSUP)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) @@ -2261,7 +2293,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_next) vma_gap_update(vma->vm_next); else - mm->highest_vm_end = address; + mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2282,6 +2314,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + struct vm_area_struct *prev; + unsigned long gap_addr; int error; address &= PAGE_MASK; @@ -2289,6 +2323,17 @@ int expand_downwards(struct vm_area_struct *vma, if (error) return error; + /* Enforce stack_guard_gap */ + gap_addr = address - stack_guard_gap; + if (gap_addr > address) + return -ENOMEM; + prev = vma->vm_prev; + if (prev && prev->vm_end > gap_addr) { + if (!(prev->vm_flags & VM_GROWSDOWN)) + return -ENOMEM; + /* Check that both stack segments have the same anon_vma? */ + } + /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; @@ -2343,28 +2388,25 @@ int expand_downwards(struct vm_area_struct *vma, return error; } -/* - * Note how expand_stack() refuses to expand the stack all the way to - * abut the next virtual mapping, *unless* that mapping itself is also - * a stack mapping. We want to leave room for a guard page, after all - * (the guard page itself is not added here, that is done by the - * actual page faulting logic) - * - * This matches the behavior of the guard page logic (see mm/memory.c: - * check_stack_guard_page()), which only allows the guard page to be - * removed under these circumstances. - */ +/* enforced gap between the expanding stack and other mappings. */ +unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT; + +static int __init cmdline_parse_stack_guard_gap(char *p) +{ + unsigned long val; + char *endptr; + + val = simple_strtoul(p, &endptr, 10); + if (!*endptr) + stack_guard_gap = val << PAGE_SHIFT; + + return 0; +} +__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap); + #ifdef CONFIG_STACK_GROWSUP int expand_stack(struct vm_area_struct *vma, unsigned long address) { - struct vm_area_struct *next; - - address &= PAGE_MASK; - next = vma->vm_next; - if (next && next->vm_start == address + PAGE_SIZE) { - if (!(next->vm_flags & VM_GROWSUP)) - return -ENOMEM; - } return expand_upwards(vma, address); } @@ -2386,14 +2428,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) #else int expand_stack(struct vm_area_struct *vma, unsigned long address) { - struct vm_area_struct *prev; - - address &= PAGE_MASK; - prev = vma->vm_prev; - if (prev && prev->vm_end == address) { - if (!(prev->vm_flags & VM_GROWSDOWN)) - return -ENOMEM; - } return expand_downwards(vma, address); } @@ -2491,7 +2525,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_prev = prev; vma_gap_update(vma); } else - mm->highest_vm_end = prev ? prev->vm_end : 0; + mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; tail_vma->vm_next = NULL; /* Kill the cache */ diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index a7652acd2ab9..54ca54562928 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -21,7 +21,7 @@ #include <linux/slab.h> /* global SRCU for all MMs */ -static struct srcu_struct srcu; +DEFINE_STATIC_SRCU(srcu); /* * This function allows mmu_notifier::release callback to delay a call to @@ -252,12 +252,6 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); - /* - * Verify that mmu_notifier_init() already run and the global srcu is - * initialized. - */ - BUG_ON(!srcu.per_cpu_ref); - ret = -ENOMEM; mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (unlikely(!mmu_notifier_mm)) @@ -406,9 +400,3 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); - -static int __init mmu_notifier_init(void) -{ - return init_srcu_struct(&srcu); -} -subsys_initcall(mmu_notifier_init); diff --git a/mm/nommu.c b/mm/nommu.c index 2d131b97a851..fc184f597d59 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -237,12 +237,16 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) } EXPORT_SYMBOL(__vmalloc); +void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) +{ + return __vmalloc(size, flags, PAGE_KERNEL); +} + void *vmalloc_user(unsigned long size) { void *ret; - ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); if (ret) { struct vm_area_struct *vma; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2c25de46c58f..2302f250d6b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -292,6 +292,26 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static inline void reset_deferred_meminit(pg_data_t *pgdat) { + unsigned long max_initialise; + unsigned long reserved_lowmem; + + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); + + /* + * Compensate the all the memblock reservations (e.g. crash kernel) + * from the initial estimation to make sure we will initialize enough + * memory to boot. + */ + reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, + pgdat->node_start_pfn + max_initialise); + max_initialise += reserved_lowmem; + + pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -314,20 +334,11 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { - unsigned long max_initialise; - /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. - */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - (*nr_initialised)++; - if ((*nr_initialised > max_initialise) && + if ((*nr_initialised > pgdat->static_init_size) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -1832,9 +1843,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -int move_freepages(struct zone *zone, +static int move_freepages(struct zone *zone, struct page *start_page, struct page *end_page, - int migratetype) + int migratetype, int *num_movable) { struct page *page; unsigned int order; @@ -1851,6 +1862,9 @@ int move_freepages(struct zone *zone, VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif + if (num_movable) + *num_movable = 0; + for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; @@ -1861,6 +1875,15 @@ int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); if (!PageBuddy(page)) { + /* + * We assume that pages that could be isolated for + * migration are movable. But we don't actually try + * isolating, as that would be expensive. + */ + if (num_movable && + (PageLRU(page) || __PageMovable(page))) + (*num_movable)++; + page++; continue; } @@ -1876,7 +1899,7 @@ int move_freepages(struct zone *zone, } int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) + int migratetype, int *num_movable) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -1893,7 +1916,8 @@ int move_freepages_block(struct zone *zone, struct page *page, if (!zone_spans_pfn(zone, end_pfn)) return 0; - return move_freepages(zone, start_page, end_page, migratetype); + return move_freepages(zone, start_page, end_page, migratetype, + num_movable); } static void change_pageblock_range(struct page *pageblock_page, @@ -1943,28 +1967,79 @@ static bool can_steal_fallback(unsigned int order, int start_mt) /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this - * pageblock and check whether half of pages are moved or not. If half of - * pages are moved, we can change migratetype of pageblock and permanently - * use it's pages as requested migratetype in the future. + * pageblock to our migratetype and determine how many already-allocated pages + * are there in the pageblock with a compatible migratetype. If at least half + * of pages are free or compatible, we can change migratetype of the pageblock + * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type) + int start_type, bool whole_block) { unsigned int current_order = page_order(page); - int pages; + struct free_area *area; + int free_pages, movable_pages, alike_pages; + int old_block_type; + + old_block_type = get_pageblock_migratetype(page); + + /* + * This can happen due to races and we want to prevent broken + * highatomic accounting. + */ + if (is_migrate_highatomic(old_block_type)) + goto single_page; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); - return; + goto single_page; } - pages = move_freepages_block(zone, page, start_type); + /* We are not allowed to try stealing from the whole block */ + if (!whole_block) + goto single_page; - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || + free_pages = move_freepages_block(zone, page, start_type, + &movable_pages); + /* + * Determine how many pages are compatible with our allocation. + * For movable allocation, it's the number of movable pages which + * we just obtained. For other types it's a bit more tricky. + */ + if (start_type == MIGRATE_MOVABLE) { + alike_pages = movable_pages; + } else { + /* + * If we are falling back a RECLAIMABLE or UNMOVABLE allocation + * to MOVABLE pageblock, consider all non-movable pages as + * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or + * vice versa, be conservative since we can't distinguish the + * exact migratetype of non-movable pages. + */ + if (old_block_type == MIGRATE_MOVABLE) + alike_pages = pageblock_nr_pages + - (free_pages + movable_pages); + else + alike_pages = 0; + } + + /* moving whole block can fail due to zone boundary conditions */ + if (!free_pages) + goto single_page; + + /* + * If a sufficient number of pages in the block are either free or of + * comparable migratability as our allocation, claim the whole block. + */ + if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); + + return; + +single_page: + area = &zone->free_area[current_order]; + list_move(&page->lru, &area->free_list[start_type]); } /* @@ -2034,7 +2109,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, && !is_migrate_cma(mt)) { zone->nr_reserved_highatomic += pageblock_nr_pages; set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); - move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); } out_unlock: @@ -2111,7 +2186,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype, + NULL); if (ret) { spin_unlock_irqrestore(&zone->lock, flags); return ret; @@ -2123,8 +2199,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, return false; } -/* Remove an element from the buddy allocator from the fallback list */ -static inline struct page * +/* + * Try finding a free buddy page on the fallback list and put it on the free + * list of requested migratetype, possibly along with other pages from the same + * block, depending on fragmentation avoidance heuristics. Returns true if + * fallback was found so that __rmqueue_smallest() can grab it. + */ +static inline bool __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; @@ -2145,32 +2226,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && !is_migrate_highatomic_page(page)) - steal_suitable_fallback(zone, page, start_migratetype); - - /* Remove the page from the freelists */ - area->nr_free--; - list_del(&page->lru); - rmv_page_order(page); - expand(zone, page, order, current_order, area, - start_migratetype); - /* - * The pcppage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * find_suitable_fallback(). This is OK as long as it does not - * differ for MIGRATE_CMA pageblocks. Those can be used as - * fallback only via special __rmqueue_cma_fallback() function - */ - set_pcppage_migratetype(page, start_migratetype); + steal_suitable_fallback(zone, page, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return page; + return true; } - return NULL; + return false; } /* @@ -2182,13 +2248,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); - if (!page) - page = __rmqueue_fallback(zone, order, migratetype); + if (!page && __rmqueue_fallback(zone, order, migratetype)) + goto retry; } trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -3227,14 +3294,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned int noreclaim_flag; if (!order) return NULL; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3381,12 +3449,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, { struct reclaim_state reclaim_state; int progress; + unsigned int noreclaim_flag; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3396,7 +3465,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); @@ -3609,6 +3678,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; @@ -3676,12 +3746,17 @@ retry_cpuset: /* * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. Don't try - * that for allocations that are allowed to ignore watermarks, as the - * ALLOC_NO_WATERMARKS attempt didn't yet happen. + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + * Don't try this for allocations that are allowed to ignore + * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && - !gfp_pfmemalloc_allowed(gfp_mask)) { + if (can_direct_reclaim && + (costly_order || + (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) + && !gfp_pfmemalloc_allowed(gfp_mask)) { page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, INIT_COMPACT_PRIORITY, @@ -3693,7 +3768,7 @@ retry_cpuset: * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations */ - if (gfp_mask & __GFP_NORETRY) { + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If compaction is deferred for high-order allocations, * it is because sync compaction recently failed. If @@ -3774,7 +3849,7 @@ retry: * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_REPEAT)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -3806,7 +3881,9 @@ retry: goto got_pg; /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE)) + if (test_thread_flag(TIF_MEMDIE) && + (alloc_flags == ALLOC_NO_WATERMARKS || + (gfp_mask & __GFP_NOMEMALLOC))) goto nopage; /* Retry as long as the OOM killer is making progress */ @@ -6072,7 +6149,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); - reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; pgdat->per_cpu_nodestats = NULL; @@ -6094,6 +6170,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif + reset_deferred_meminit(pgdat); free_area_init_core(pgdat); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 7927bbb54a4e..5092e4ef00c8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -66,7 +66,8 @@ out: set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, + NULL); __mod_zone_freepage_state(zone, -nr_pages, migratetype); } @@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * pageblock scanning for freepage moving. */ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); diff --git a/mm/rmap.c b/mm/rmap.c index 3ff241f714eb..d405f0e0ee96 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -430,7 +430,7 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, anon_vma_ctor); anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC|SLAB_ACCOUNT); @@ -481,7 +481,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) * If this page is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: - * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() + * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ if (!page_mapped(page)) { diff --git a/mm/slab.c b/mm/slab.c index 1880d482a0cb..2a31ee3c5814 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1728,7 +1728,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) freelist = page->freelist; slab_destroy_debugcheck(cachep, page); - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&page->rcu_head, kmem_rcu_free); else kmem_freepages(cachep, page); @@ -1924,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, cachep->num = 0; - if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) return false; left = calculate_slab_order(cachep, size, @@ -2030,7 +2030,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2 * sizeof(unsigned long long))) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; - if (!(flags & SLAB_DESTROY_BY_RCU)) + if (!(flags & SLAB_TYPESAFE_BY_RCU)) flags |= SLAB_POISON; #endif #endif diff --git a/mm/slab.h b/mm/slab.h index 65e7c3fcac72..9cfcf099709c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -126,7 +126,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) #if defined(CONFIG_DEBUG_SLAB) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) @@ -415,7 +415,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * back there or track user information then we can * only use the space before that information. */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) return s->inuse; /* * Else we can use all the padding etc for the allocation diff --git a/mm/slab_common.c b/mm/slab_common.c index 09d0e849b07f..01a0fe2eb332 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -39,7 +39,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, * Set of flags that will prevent slab merging */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ @@ -500,7 +500,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) struct kmem_cache *s, *s2; /* - * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the + * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the * @slab_caches_to_rcu_destroy list. The slab pages are freed * through RCU and and the associated kmem_cache are dereferenced * while freeing the pages, so the kmem_caches should be freed only @@ -537,7 +537,7 @@ static int shutdown_cache(struct kmem_cache *s) memcg_unlink_cache(s); list_del(&s->list); - if (s->flags & SLAB_DESTROY_BY_RCU) { + if (s->flags & SLAB_TYPESAFE_BY_RCU) { list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { diff --git a/mm/slob.c b/mm/slob.c index eac04d4357ec..1bae78d71096 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -126,7 +126,7 @@ static inline void clear_slob_page_free(struct page *sp) /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which - * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free + * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free * the block using call_rcu. */ struct slob_rcu { @@ -524,7 +524,7 @@ EXPORT_SYMBOL(ksize); int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - if (flags & SLAB_DESTROY_BY_RCU) { + if (flags & SLAB_TYPESAFE_BY_RCU) { /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } @@ -598,7 +598,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); - if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { + if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); slob_rcu->size = c->size; diff --git a/mm/slub.c b/mm/slub.c index 7f4bc7027ed5..7449593fca72 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1687,7 +1687,7 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct page *page) { - if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { struct rcu_head *head; if (need_reserve_slab_rcu) { @@ -2963,7 +2963,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, * slab_free_freelist_hook() could have put the items into quarantine. * If so, no need to free them. */ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) + if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) return; do_slab_free(s, page, head, tail, cnt, addr); } @@ -3433,7 +3433,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the slab may touch the object after free or before allocation * then we should never poison the object itself. */ - if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && !s->ctor) s->flags |= __OBJECT_POISON; else @@ -3455,7 +3455,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->inuse = size; - if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || + if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor)) { /* * Relocate free pointer after the object if it is not @@ -3537,7 +3537,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; - if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) s->reserved = sizeof(struct rcu_head); if (!calculate_sizes(s, -1)) @@ -5042,7 +5042,7 @@ SLAB_ATTR_RO(cache_dma); static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); } SLAB_ATTR_RO(destroy_by_rcu); @@ -5512,6 +5512,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) char mbuf[64]; char *buf; struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); + ssize_t len; if (!attr || !attr->store || !attr->show) continue; @@ -5536,8 +5537,9 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) buf = buffer; } - attr->show(root_cache, buf); - attr->store(s, buf, strlen(buf)); + len = attr->show(root_cache, buf); + if (len > 0) + attr->store(s, buf, len); } if (buffer) diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index ac6318a064d3..3405b4ee1757 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -48,6 +48,9 @@ static int swap_cgroup_prepare(int type) if (!page) goto not_enough_page; ctrl->map[idx] = page; + + if (!(idx % SWAP_CLUSTER_MAX)) + cond_resched(); } return 0; not_enough_page: diff --git a/mm/swap_slots.c b/mm/swap_slots.c index aa1c415f4abd..58f6c78f1dad 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -31,6 +31,7 @@ #include <linux/cpumask.h> #include <linux/vmalloc.h> #include <linux/mutex.h> +#include <linux/mm.h> #ifdef CONFIG_SWAP @@ -119,16 +120,18 @@ static int alloc_swap_slot_cache(unsigned int cpu) /* * Do allocation outside swap_slots_cache_mutex - * as vzalloc could trigger reclaim and get_swap_page, + * as kvzalloc could trigger reclaim and get_swap_page, * which can lock swap_slots_cache_mutex. */ - slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + slots = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE, + GFP_KERNEL); if (!slots) return -ENOMEM; - slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + slots_ret = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE, + GFP_KERNEL); if (!slots_ret) { - vfree(slots); + kvfree(slots); return -ENOMEM; } @@ -152,9 +155,9 @@ static int alloc_swap_slot_cache(unsigned int cpu) out: mutex_unlock(&swap_slots_cache_mutex); if (slots) - vfree(slots); + kvfree(slots); if (slots_ret) - vfree(slots_ret); + kvfree(slots_ret); return 0; } @@ -171,7 +174,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, cache->cur = 0; cache->nr = 0; if (free_slots && cache->slots) { - vfree(cache->slots); + kvfree(cache->slots); cache->slots = NULL; } mutex_unlock(&cache->alloc_lock); @@ -186,7 +189,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, } spin_unlock_irq(&cache->free_lock); if (slots) - vfree(slots); + kvfree(slots); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index 7bfb9bd1ca21..539b8885e3d1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -523,7 +523,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) unsigned int i, nr; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); - spaces = vzalloc(sizeof(struct address_space) * nr); + spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL); if (!spaces) return -ENOMEM; for (i = 0; i < nr; i++) { diff --git a/mm/swapfile.c b/mm/swapfile.c index b86b2aca3fb9..4f6cba1b6632 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2270,8 +2270,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; vfree(swap_map); - vfree(cluster_info); - vfree(frontswap_map); + kvfree(cluster_info); + kvfree(frontswap_map); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); exit_swap_address_space(p->type); @@ -2794,7 +2794,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->cluster_next = 1 + (prandom_u32() % p->highest_bit); nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info)); + cluster_info = kvzalloc(nr_cluster * sizeof(*cluster_info), + GFP_KERNEL); if (!cluster_info) { error = -ENOMEM; goto bad_swap; @@ -2827,7 +2828,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } /* frontswap enabled? set up bit-per-page map for frontswap */ if (IS_ENABLED(CONFIG_FRONTSWAP)) - frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); + frontswap_map = kvzalloc(BITS_TO_LONGS(maxpages) * sizeof(long), + GFP_KERNEL); if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { /* diff --git a/mm/truncate.c b/mm/truncate.c index 83a059e8cd1d..6479ed2afc53 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -67,17 +67,14 @@ static void truncate_exceptional_entry(struct address_space *mapping, /* * Invalidate exceptional entry if easily possible. This handles exceptional - * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and - * clean entries. + * entries for invalidate_inode_pages(). */ static int invalidate_exceptional_entry(struct address_space *mapping, pgoff_t index, void *entry) { - /* Handled by shmem itself */ - if (shmem_mapping(mapping)) + /* Handled by shmem itself, or for DAX we do nothing. */ + if (shmem_mapping(mapping) || dax_mapping(mapping)) return 1; - if (dax_mapping(mapping)) - return dax_invalidate_mapping_entry(mapping, index); clear_shadow_entry(mapping, index, entry); return 1; } @@ -689,7 +686,17 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cond_resched(); index++; } - + /* + * For DAX we invalidate page tables after invalidating radix tree. We + * could invalidate page tables while invalidating each entry however + * that would be expensive. And doing range unmapping before doesn't + * work as we have no cheap way to find whether radix tree entry didn't + * get remapped later. + */ + if (dax_mapping(mapping)) { + unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT, + (loff_t)(end - start + 1) << PAGE_SHIFT, 0); + } out: cleancache_invalidate_inode(mapping); return ret; diff --git a/mm/util.c b/mm/util.c index 656dc5e37a87..26be6407abd7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -329,6 +329,67 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +/** + * kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT + * is supported only for large (>32kB) allocations, and it should be used only if + * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks. + * + * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. + */ +void *kvmalloc_node(size_t size, gfp_t flags, int node) +{ + gfp_t kmalloc_flags = flags; + void *ret; + + /* + * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) + * so the given set of flags has to be compatible. + */ + WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); + + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. + */ + if (size > PAGE_SIZE) { + kmalloc_flags |= __GFP_NOWARN; + + /* + * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly + * requests because there is no other way to tell the allocator + * that we want to fail rather than retry endlessly. + */ + if (!(kmalloc_flags & __GFP_REPEAT) || + (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + kmalloc_flags |= __GFP_NORETRY; + } + + ret = kmalloc_node(size, kmalloc_flags, node); + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + if (ret || size <= PAGE_SIZE) + return ret; + + return __vmalloc_node_flags_caller(size, node, flags, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kvmalloc_node); + void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b52aeed3f58e..34a1c3e46ed7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -521,7 +521,7 @@ overflow: } } - if (printk_ratelimit()) + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", size); kfree(va); @@ -1658,7 +1658,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; + const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1786,6 +1786,13 @@ fail: * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. + * + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT + * and __GFP_NOFAIL are not supported + * + * Any use of gfp flags outside of GFP_KERNEL should be consulted + * with mm people. + * */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, @@ -1809,6 +1816,13 @@ static inline void *__vmalloc_node_flags(unsigned long size, node, __builtin_return_address(0)); } + +void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, + void *caller) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -1821,7 +1835,7 @@ static inline void *__vmalloc_node_flags(unsigned long size, void *vmalloc(unsigned long size) { return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_HIGHMEM); + GFP_KERNEL); } EXPORT_SYMBOL(vmalloc); @@ -1838,7 +1852,7 @@ EXPORT_SYMBOL(vmalloc); void *vzalloc(unsigned long size) { return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -1855,7 +1869,7 @@ void *vmalloc_user(unsigned long size) void *ret; ret = __vmalloc_node(size, SHMLBA, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); if (ret) { @@ -1879,7 +1893,7 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -1899,7 +1913,7 @@ EXPORT_SYMBOL(vmalloc_node); void *vzalloc_node(unsigned long size, int node) { return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc_node); @@ -1921,7 +1935,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 6063581f705c..ce0618bfa8d0 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -115,9 +115,9 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, unsigned long pressure = 0; /* - * reclaimed can be greater than scanned in cases - * like THP, where the scanned is 1 and reclaimed - * could be 512 + * reclaimed can be greater than scanned for things such as reclaimed + * slab pages. shrink_node() just adds reclaimed pages without a + * related increment to scanned pages. */ if (reclaimed >= scanned) goto out; diff --git a/mm/vmscan.c b/mm/vmscan.c index 4e7ed65842af..8ad39bbc79e6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1449,7 +1449,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * * Appropriate locks must be held before calling this function. * - * @nr_to_scan: The number of pages to look through on the list. + * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. @@ -1469,11 +1469,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; unsigned long skipped = 0; - unsigned long scan, nr_pages; + unsigned long scan, total_scan, nr_pages; LIST_HEAD(pages_skipped); - for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && - !list_empty(src); scan++) { + scan = 0; + for (total_scan = 0; + scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); + total_scan++) { struct page *page; page = lru_to_page(src); @@ -1487,6 +1489,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, continue; } + /* + * Do not count skipped pages because that makes the function + * return with no isolated pages if the LRU mostly contains + * ineligible pages. This causes the VM to not reclaim any + * pages, triggering a premature OOM. + */ + scan++; switch (__isolate_lru_page(page, mode)) { case 0: nr_pages = hpage_nr_pages(page); @@ -1524,9 +1533,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, skipped += nr_skipped[zid]; } } - *nr_scanned = scan; + *nr_scanned = total_scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, - scan, skipped, nr_taken, mode, lru); + total_scan, skipped, nr_taken, mode, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } @@ -3036,6 +3045,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, struct zonelist *zonelist; unsigned long nr_reclaimed; int nid; + unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | @@ -3062,9 +3072,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3589,8 +3599,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; + unsigned int noreclaim_flag; - p->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3599,7 +3610,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; } @@ -3764,6 +3775,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in struct task_struct *p = current; struct reclaim_state reclaim_state; int classzone_idx = gfp_zone(gfp_mask); + unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), @@ -3781,7 +3793,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * and we also need to be able to write out pages for RECLAIM_WRITE * and RECLAIM_UNMAP. */ - p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + noreclaim_flag = memalloc_noreclaim_save(); + p->flags |= PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3797,7 +3810,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; - current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + current->flags &= ~PF_SWAPWRITE; + memalloc_noreclaim_restore(noreclaim_flag); lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } diff --git a/mm/vmstat.c b/mm/vmstat.c index f5fa1bd1eb16..76f73670200a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1359,8 +1359,6 @@ static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone) return zone == compare; } - /* The zone must be somewhere! */ - WARN_ON_ONCE(1); return false; } |