diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 8 | ||||
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/backing-dev.c | 8 | ||||
-rw-r--r-- | mm/dmapool.c | 2 | ||||
-rw-r--r-- | mm/gup.c | 60 | ||||
-rw-r--r-- | mm/huge_memory.c | 79 | ||||
-rw-r--r-- | mm/hugetlb.c | 18 | ||||
-rw-r--r-- | mm/internal.h | 15 | ||||
-rw-r--r-- | mm/kasan/Makefile | 2 | ||||
-rw-r--r-- | mm/kasan/kasan_init.c | 152 | ||||
-rw-r--r-- | mm/maccess.c | 41 | ||||
-rw-r--r-- | mm/madvise.c | 9 | ||||
-rw-r--r-- | mm/memblock.c | 3 | ||||
-rw-r--r-- | mm/memcontrol.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 26 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 10 | ||||
-rw-r--r-- | mm/mempolicy.c | 4 | ||||
-rw-r--r-- | mm/migrate.c | 18 | ||||
-rw-r--r-- | mm/mlock.c | 3 | ||||
-rw-r--r-- | mm/mmap.c | 44 | ||||
-rw-r--r-- | mm/mprotect.c | 3 | ||||
-rw-r--r-- | mm/mremap.c | 50 | ||||
-rw-r--r-- | mm/nommu.c | 12 | ||||
-rw-r--r-- | mm/page_alloc.c | 9 | ||||
-rw-r--r-- | mm/page_io.c | 20 | ||||
-rw-r--r-- | mm/percpu.c | 5 | ||||
-rw-r--r-- | mm/rmap.c | 118 | ||||
-rw-r--r-- | mm/slab.c | 17 | ||||
-rw-r--r-- | mm/slab.h | 11 | ||||
-rw-r--r-- | mm/slab_common.c | 23 | ||||
-rw-r--r-- | mm/slob.c | 13 | ||||
-rw-r--r-- | mm/slub.c | 206 | ||||
-rw-r--r-- | mm/swapfile.c | 25 | ||||
-rw-r--r-- | mm/userfaultfd.c | 308 | ||||
-rw-r--r-- | mm/vmscan.c | 30 |
35 files changed, 1135 insertions, 226 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index e79de2bd12cd..d4e6495a720f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -299,15 +299,9 @@ config BOUNCE # On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often # have more than 4GB of memory, but we don't currently use the IOTLB to present # a 32-bit address to OHCI. So we need to use a bounce pool instead. -# -# We also use the bounce pool to provide stable page writes for jbd. jbd -# initiates buffer writeback without locking the page or setting PG_writeback, -# and fixing that behavior (a second time; jbd2 doesn't have this problem) is -# a major rework effort. Instead, use the bounce buffer to snapshot pages -# (until jbd goes away). The only jbd user is ext3. config NEED_BOUNCE_POOL bool - default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD) + default y if TILE && USB_OHCI_HCD config NR_QUICK int diff --git a/mm/Makefile b/mm/Makefile index 98c4eaeabdcb..b424d5e5b6ff 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_USERFAULTFD) += userfaultfd.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dac5bf59309d..ee8d7fd07be3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -55,13 +55,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); - list_for_each_entry(inode, &wb->b_dirty, i_wb_list) + list_for_each_entry(inode, &wb->b_dirty, i_io_list) nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_wb_list) + list_for_each_entry(inode, &wb->b_io, i_io_list) nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_wb_list) + list_for_each_entry(inode, &wb->b_more_io, i_io_list) nr_more_io++; - list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) 
if (inode->i_state & I_DIRTY_TIME) nr_dirty_time++; spin_unlock(&wb->list_lock); diff --git a/mm/dmapool.c b/mm/dmapool.c index fd5fe4342e93..59d10d16f0a5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) return page; } -static inline int is_page_busy(struct dma_page *page) +static inline bool is_page_busy(struct dma_page *page) { return page->in_use != 0; } @@ -12,7 +12,9 @@ #include <linux/sched.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> + #include <asm/pgtable.h> +#include <asm/tlbflush.h> #include "internal.h" @@ -32,6 +34,30 @@ static struct page *no_page_table(struct vm_area_struct *vma, return NULL; } +static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, unsigned int flags) +{ + /* No page to get reference */ + if (flags & FOLL_GET) + return -EFAULT; + + if (flags & FOLL_TOUCH) { + pte_t entry = *pte; + + if (flags & FOLL_WRITE) + entry = pte_mkdirty(entry); + entry = pte_mkyoung(entry); + + if (!pte_same(*pte, entry)) { + set_pte_at(vma->vm_mm, address, pte, entry); + update_mmu_cache(vma, address, pte); + } + } + + /* Proper page table entry exists, but no corresponding struct page */ + return -EEXIST; +} + static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { @@ -73,10 +99,21 @@ retry: page = vm_normal_page(vma, address, pte); if (unlikely(!page)) { - if ((flags & FOLL_DUMP) || - !is_zero_pfn(pte_pfn(pte))) - goto bad_page; - page = pte_page(pte); + if (flags & FOLL_DUMP) { + /* Avoid special (like zero) pages in core dumps */ + page = ERR_PTR(-EFAULT); + goto out; + } + + if (is_zero_pfn(pte_pfn(pte))) { + page = pte_page(pte); + } else { + int ret; + + ret = follow_pfn_pte(vma, address, ptep, flags); + page = ERR_PTR(ret); + goto out; + } } if (flags & FOLL_GET) @@ -114,12 +151,9 @@ retry: unlock_page(page); } } +out: pte_unmap_unlock(ptep, 
ptl); return page; -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) @@ -489,9 +523,15 @@ retry: goto next_page; } BUG(); - } - if (IS_ERR(page)) + } else if (PTR_ERR(page) == -EEXIST) { + /* + * Proper page table entry exists, but no corresponding + * struct page. + */ + goto next_page; + } else if (IS_ERR(page)) { return i ? i : PTR_ERR(page); + } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 097c7a4bfbd9..279a818a39b1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -23,6 +23,7 @@ #include <linux/pagemap.h> #include <linux/migrate.h> #include <linux/hashtable.h> +#include <linux/userfaultfd_k.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -716,21 +717,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - struct page *page, gfp_t gfp) + unsigned long address, pmd_t *pmd, + struct page *page, gfp_t gfp, + unsigned int flags) { struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; + unsigned long haddr = address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) - return VM_FAULT_OOM; + if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { mem_cgroup_cancel_charge(page, memcg); + put_page(page); return VM_FAULT_OOM; } @@ -750,6 +757,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pte_free(mm, pgtable); } else { pmd_t entry; + + /* Deliver the page fault to userland */ + if (userfaultfd_missing(vma)) { + int ret; + + spin_unlock(ptl); + mem_cgroup_cancel_charge(page, memcg); + put_page(page); + pte_free(mm, pgtable); + ret = 
handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; + } + entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); @@ -760,6 +782,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&mm->nr_ptes); spin_unlock(ptl); + count_vm_event(THP_FAULT_ALLOC); } return 0; @@ -771,19 +794,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) } /* Caller must hold page table lock. */ -static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, +static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) { pmd_t entry; - if (!pmd_none(*pmd)) - return false; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); atomic_long_inc(&mm->nr_ptes); - return true; } int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -806,6 +826,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pgtable_t pgtable; struct page *zero_page; bool set; + int ret; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -816,14 +837,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } ptl = pmd_lock(mm, pmd); - set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, - zero_page); - spin_unlock(ptl); + ret = 0; + set = false; + if (pmd_none(*pmd)) { + if (userfaultfd_missing(vma)) { + spin_unlock(ptl); + ret = handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + } else { + set_huge_zero_page(pgtable, mm, vma, + haddr, pmd, + zero_page); + spin_unlock(ptl); + set = true; 
+ } + } else + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); } - return 0; + return ret; } gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); @@ -831,14 +866,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } - - count_vm_event(THP_FAULT_ALLOC); - return 0; + return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, + flags); } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -873,16 +902,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_huge_zero_pmd(pmd)) { struct page *zero_page; - bool set; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. 
*/ zero_page = get_huge_zero_page(); - set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); - BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ ret = 0; goto out_unlock; } @@ -2133,7 +2160,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out; @@ -2586,7 +2614,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out_unmap; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c3087089d8..51ae41d0fbc0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -616,7 +616,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_reserves(struct vm_area_struct *vma, long chg) +static bool vma_has_reserves(struct vm_area_struct *vma, long chg) { if (vma->vm_flags & VM_NORESERVE) { /* @@ -629,23 +629,23 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg) * properly, so add work-around here. */ if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return 1; + return true; else - return 0; + return false; } /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) - return 1; + return true; /* * Only the process that called mmap() has reserves for * private mappings. 
*/ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 1; + return true; - return 0; + return false; } static void enqueue_huge_page(struct hstate *h, struct page *page) @@ -3779,7 +3779,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, return saddr; } -static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) +static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) { unsigned long base = addr & PUD_MASK; unsigned long end = base + PUD_SIZE; @@ -3789,8 +3789,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) */ if (vma->vm_flags & VM_MAYSHARE && vma->vm_start <= base && end <= vma->vm_end) - return 1; - return 0; + return true; + return false; } /* diff --git a/mm/internal.h b/mm/internal.h index 36b23f1e2ca6..1195dd2d6a2b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -426,4 +426,19 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #define ALLOC_FAIR 0x100 /* fair zone allocation */ +enum ttu_flags; +struct tlbflush_unmap_batch; + +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +void try_to_unmap_flush(void); +void try_to_unmap_flush_dirty(void); +#else +static inline void try_to_unmap_flush(void) +{ +} +static inline void try_to_unmap_flush_dirty(void) +{ +} + +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index bd837b8c2f41..64710148941e 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -5,4 +5,4 @@ CFLAGS_REMOVE_kasan.o = -pg # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -obj-y := kasan.o report.o +obj-y := kasan.o report.o kasan_init.o diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c new file mode 100644 index 000000000000..3f9a41cf0ac6 --- /dev/null +++ b/mm/kasan/kasan_init.c @@ -0,0 +1,152 @@ +/* + 
* This file contains some kasan initialization code. + * + * Copyright (c) 2015 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/pfn.h> + +#include <asm/page.h> +#include <asm/pgalloc.h> + +/* + * This page serves two purposes: + * - It used as early shadow memory. The entire shadow region populated + * with this page, before we will be able to setup normal shadow memory. + * - Latter it reused it as zero shadow to cover large ranges of memory + * that allowed to access, but not handled by kasan (vmalloc/vmemmap ...). + */ +unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; + +#if CONFIG_PGTABLE_LEVELS > 3 +pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +#endif +#if CONFIG_PGTABLE_LEVELS > 2 +pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +#endif +pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; + +static __init void *early_alloc(size_t size, int node) +{ + return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, node); +} + +static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + pte_t zero_pte; + + zero_pte = pfn_pte(PFN_DOWN(__pa(kasan_zero_page)), PAGE_KERNEL); + zero_pte = pte_wrprotect(zero_pte); + + while (addr + PAGE_SIZE <= end) { + set_pte_at(&init_mm, addr, pte, zero_pte); + addr += PAGE_SIZE; + pte = pte_offset_kernel(pmd, addr); + } +} + +static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, + unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, addr); + unsigned long next; + + do { + next = 
pmd_addr_end(addr, end); + + if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { + pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + continue; + } + + if (pmd_none(*pmd)) { + pmd_populate_kernel(&init_mm, pmd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + zero_pte_populate(pmd, addr, next); + } while (pmd++, addr = next, addr != end); +} + +static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + pud_t *pud = pud_offset(pgd, addr); + unsigned long next; + + do { + next = pud_addr_end(addr, end); + if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { + pmd_t *pmd; + + pud_populate(&init_mm, pud, kasan_zero_pmd); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + continue; + } + + if (pud_none(*pud)) { + pud_populate(&init_mm, pud, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + zero_pmd_populate(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +/** + * kasan_populate_zero_shadow - populate shadow memory region with + * kasan_zero_page + * @shadow_start - start of the memory range to populate + * @shadow_end - end of the memory range to populate + */ +void __init kasan_populate_zero_shadow(const void *shadow_start, + const void *shadow_end) +{ + unsigned long addr = (unsigned long)shadow_start; + unsigned long end = (unsigned long)shadow_end; + pgd_t *pgd = pgd_offset_k(addr); + unsigned long next; + + do { + next = pgd_addr_end(addr, end); + + if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + pud_t *pud; + pmd_t *pmd; + + /* + * kasan_zero_pud should be populated with pmds + * at this moment. + * [pud,pmd]_populate*() below needed only for + * 3,2 - level page tables where we don't have + * puds,pmds, so pgd_populate(), pud_populate() + * is noops. 
+ */ + pgd_populate(&init_mm, pgd, kasan_zero_pud); + pud = pud_offset(pgd, addr); + pud_populate(&init_mm, pud, kasan_zero_pmd); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + continue; + } + + if (pgd_none(*pgd)) { + pgd_populate(&init_mm, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + zero_pud_populate(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} diff --git a/mm/maccess.c b/mm/maccess.c index d53adf9ba84b..34fe24759ed1 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -60,3 +60,44 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) return ret ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(probe_kernel_write); + +/** + * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @src: Unsafe address. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from unsafe address to kernel buffer. + * + * On success, returns the length of the string INCLUDING the trailing NUL. + * + * If access fails, returns -EFAULT (some data may have been copied + * and the trailing NUL added). + * + * If @count is smaller than the length of the string, copies @count-1 bytes, + * sets the last byte of @dst buffer to NUL and returns @count. + */ +long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) +{ + mm_segment_t old_fs = get_fs(); + const void *src = unsafe_addr; + long ret; + + if (unlikely(count <= 0)) + return 0; + + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(dst++, + (const void __user __force *)src++, 1); + } while (dst[-1] && ret == 0 && src - unsafe_addr < count); + + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + return ret < 0 ? 
ret : src - unsafe_addr; +} diff --git a/mm/madvise.c b/mm/madvise.c index 64bb8a22110c..ce3a4222c7e7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -103,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; @@ -385,7 +386,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, } } -static int +static bool madvise_behavior_valid(int behavior) { switch (behavior) { @@ -407,10 +408,10 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: - return 1; + return true; default: - return 0; + return false; } } diff --git a/mm/memblock.c b/mm/memblock.c index 87108e77e476..95ce68c6da8a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -566,6 +566,9 @@ repeat: * area, insert that portion. */ if (rbase > base) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + WARN_ON(nid != memblock_get_region_node(rgn)); +#endif nr_new++; if (insert) memblock_insert_region(type, i++, base, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index acb93c554f6e..1af057575ce9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5965,7 +5965,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); - /* Caller disabled preemption with mapping->tree_lock */ + /* + * Interrupts should be disabled here because the caller holds the + * mapping->tree_lock lock which is taken with interrupts-off. It is + * important here to have the interrupts disabled because it is the + * only synchronisation we have for udpating the per-CPU variables. 
+ */ + VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, -1); memcg_check_events(memcg, page); } diff --git a/mm/memory.c b/mm/memory.c index 388dcf9aa283..bb04d8f2f86c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -61,6 +61,7 @@ #include <linux/string.h> #include <linux/dma-debug.h> #include <linux/debugfs.h> +#include <linux/userfaultfd_k.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task) #ifdef HAVE_GENERIC_MMU_GATHER -static int tlb_next_batch(struct mmu_gather *tlb) +static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; batch = tlb->active; if (batch->next) { tlb->active = batch->next; - return 1; + return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) - return 0; + return false; batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) - return 0; + return false; tlb->batch_count++; batch->next = NULL; @@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb) tlb->active->next = batch; tlb->active = batch; - return 1; + return true; } /* tlb_gather_mmu @@ -2685,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto unlock; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } goto setpte; } @@ -2713,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + mem_cgroup_cancel_charge(page, memcg); + page_cache_release(page); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } + 
inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); mem_cgroup_commit_charge(page, memcg, false); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6da82bcb0a8b..8fd97dac538a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1248,6 +1248,14 @@ int __ref add_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); + /* + * Add new range to memblock so that when hotadd_new_pgdat() is called + * to allocate new pgdat, get_pfn_range_for_nid() will be able to find + * this new range and calculate total pages correctly. The range will + * be removed at hot-remove time. + */ + memblock_add_node(start, size, nid); + new_node = !node_online(nid); if (new_node) { pgdat = hotadd_new_pgdat(nid, start); @@ -1277,7 +1285,6 @@ int __ref add_memory(int nid, u64 start, u64 size) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); - memblock_add_node(start, size, nid); goto out; @@ -1286,6 +1293,7 @@ error: if (new_pgdat) rollback_node_hotadd(nid, pgdat); release_memory_resource(res); + memblock_remove(start, size); out: mem_hotplug_done(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 99d4c1d0b858..a7f1e0d1d6b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol); + vma->anon_vma, vma->vm_file, pgoff, + new_pol, vma->vm_userfaultfd_ctx); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/migrate.c b/mm/migrate.c index eb4267107d1f..5c08cab5419e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1226,7 +1226,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); + /* FOLL_DUMP to 
ignore special (like zero) pages */ + page = follow_page(vma, pp->addr, + FOLL_GET | FOLL_SPLIT | FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1236,10 +1238,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - /* Use PageReserved to check for zero page */ - if (PageReserved(page)) - goto put_and_set; - pp->page = page; err = page_to_nid(page); @@ -1396,18 +1394,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (!vma || addr < vma->vm_start) goto set_status; - page = follow_page(vma, addr, 0); + /* FOLL_DUMP to ignore special (like zero) pages */ + page = follow_page(vma, addr, FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) goto set_status; - err = -ENOENT; - /* Use PageReserved to check for zero page */ - if (!page || PageReserved(page)) - goto set_status; - - err = page_to_nid(page); + err = page ? page_to_nid(page) : -ENOENT; set_status: *status = err; diff --git a/mm/mlock.c b/mm/mlock.c index 6fd2cf15e868..25936680064f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index aa632ade2be7..82db4fc0a9d3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -41,6 +41,7 @@ #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> +#include <linux/userfaultfd_k.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end); * per-vma resources, so we don't attempt to merge those. 
*/ static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags) + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (vma->vm_ops && vma->vm_ops->close) return 0; + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) + return 0; return 1; } @@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, */ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, */ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy) + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { 
pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff)) { + anon_vma, file, pgoff, + vm_userfaultfd_ctx)) { /* * OK, it can. Can we now merge in the successor as well? */ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen) && + anon_vma, file, + pgoff+pglen, + vm_userfaultfd_ctx) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen)) { + anon_vma, file, pgoff+pglen, + vm_userfaultfd_ctx)) { if (prev && addr < prev->vm_end) /* case 4 */ err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); @@ -1268,7 +1281,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, * mounted, in which case we dont add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) - if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) + if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; if (!(flags & MAP_FIXED)) @@ -1337,7 +1350,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; @@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* * Can we just expand an old mapping? 
*/ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, - NULL); + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index e7d6f1171ecb..ef5be8eaab00 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*pprev) { vma = *pprev; goto success; diff --git a/mm/mremap.c b/mm/mremap.c index a7c93eceb1c8..5a71cce8c6ea 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -276,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, need_rmap_locks); if (moved_len < old_len) { + err = -ENOMEM; + } else if (vma->vm_ops && vma->vm_ops->mremap) { + err = vma->vm_ops->mremap(new_vma); + } + + if (unlikely(err)) { /* * On error, move entries back from new area to old, * which will 
succeed since page tables still there, @@ -286,16 +292,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, vma = new_vma; old_len = new_len; old_addr = new_addr; - new_addr = -ENOMEM; + new_addr = err; } else { - if (vma->vm_file && vma->vm_file->f_op->mremap) { - err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); - if (err < 0) { - move_page_tables(new_vma, new_addr, vma, - old_addr, moved_len, true); - return err; - } - } arch_remap(mm, old_addr, old_addr + old_len, new_addr, new_addr + new_len); } @@ -348,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = find_vma(mm, addr); + unsigned long pgoff; if (!vma || vma->vm_start > addr) return ERR_PTR(-EFAULT); @@ -359,17 +358,17 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (old_len > vma->vm_end - addr) return ERR_PTR(-EFAULT); + if (new_len == old_len) + return vma; + /* Need to be careful about a growing mapping */ - if (new_len > old_len) { - unsigned long pgoff; - - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - return ERR_PTR(-EFAULT); - pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - pgoff += vma->vm_pgoff; - if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - return ERR_PTR(-EINVAL); - } + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + return ERR_PTR(-EINVAL); + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) + return ERR_PTR(-EFAULT); if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; @@ -408,13 +407,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) goto out; - /* Check if the location we're moving into overlaps the - * old location at all, and fail if it does. 
- */ - if ((new_addr <= addr) && (new_addr+new_len) > addr) - goto out; - - if ((addr <= new_addr) && (addr+old_len) > new_addr) + /* Ensure the old/new locations do not overlap */ + if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; ret = do_munmap(mm, new_addr, new_len); @@ -580,8 +574,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); } out: - if (ret & ~PAGE_MASK) + if (ret & ~PAGE_MASK) { vm_unacct_memory(charged); + locked = 0; + } up_write(&current->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); diff --git a/mm/nommu.c b/mm/nommu.c index 58ea3643b9e9..1cc0709fcaa5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -324,12 +324,12 @@ long vwrite(char *buf, char *addr, unsigned long count) } /* - * vmalloc - allocate virtually continguos memory + * vmalloc - allocate virtually contiguous memory * * @size: allocation size * * Allocate enough pages to cover @size from the page level - * allocator and map them into continguos kernel virtual space. + * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. @@ -341,12 +341,12 @@ void *vmalloc(unsigned long size) EXPORT_SYMBOL(vmalloc); /* - * vzalloc - allocate virtually continguos memory with zero fill + * vzalloc - allocate virtually contiguous memory with zero fill * * @size: allocation size * * Allocate enough pages to cover @size from the page level - * allocator and map them into continguos kernel virtual space. + * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero.
* * For tight control over page level allocator and protection flags @@ -420,7 +420,7 @@ void *vmalloc_exec(unsigned long size) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the - * page level allocator and map them into continguos kernel virtual space. + * page level allocator and map them into contiguous kernel virtual space. */ void *vmalloc_32(unsigned long size) { @@ -1035,7 +1035,7 @@ static int validate_mmap_request(struct file *file, /* handle executable mappings and implied executable * mappings */ - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (path_noexec(&file->f_path)) { if (prot & PROT_EXEC) return -EPERM; } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df959b7d6085..5b5240b7f642 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1343,12 +1343,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_owner(page, order, gfp_flags); /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to + * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking * steps that will free more memory. The caller should avoid the page * being used for !PFMEMALLOC purposes. 
*/ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + if (alloc_flags & ALLOC_NO_WATERMARKS) + set_page_pfmemalloc(page); + else + clear_page_pfmemalloc(page); return 0; } @@ -3345,7 +3348,7 @@ refill: atomic_add(size - 1, &page->_count); /* reset page count bias and offset to start of new frag */ - nc->pfmemalloc = page->pfmemalloc; + nc->pfmemalloc = page_is_pfmemalloc(page); nc->pagecnt_bias = size; nc->offset = size; } diff --git a/mm/page_io.c b/mm/page_io.c index 520baa4b04d7..b995a5ba5e8f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -33,22 +33,19 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, if (bio) { bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; - bio->bi_io_vec[0].bv_page = page; - bio->bi_io_vec[0].bv_len = PAGE_SIZE; - bio->bi_io_vec[0].bv_offset = 0; - bio->bi_vcnt = 1; - bio->bi_iter.bi_size = PAGE_SIZE; bio->bi_end_io = end_io; + + bio_add_page(bio, page, PAGE_SIZE, 0); + BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE); } return bio; } -void end_swap_bio_write(struct bio *bio, int err) +void end_swap_bio_write(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (!uptodate) { + if (bio->bi_error) { SetPageError(page); /* * We failed to write the page out to swap-space. 
@@ -69,12 +66,11 @@ void end_swap_bio_write(struct bio *bio, int err) bio_put(bio); } -static void end_swap_bio_read(struct bio *bio, int err) +static void end_swap_bio_read(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (!uptodate) { + if (bio->bi_error) { SetPageError(page); ClearPageUptodate(page); printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", @@ -254,7 +250,7 @@ static sector_t swap_page_sector(struct page *page) } int __swap_writepage(struct page *page, struct writeback_control *wbc, - void (*end_write_func)(struct bio *, int)) + bio_end_io_t end_write_func) { struct bio *bio; int ret, rw = WRITE; diff --git a/mm/percpu.c b/mm/percpu.c index 2dd74487a0af..a63b4d82a141 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1668,9 +1668,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, schunk->map[1] = ai->static_size; schunk->map_used = 1; if (schunk->free_size) - schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size); - else - schunk->map[1] |= 1; + schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size; + schunk->map[schunk->map_used] |= 1; /* init dynamic chunk if necessary */ if (dyn_size) { diff --git a/mm/rmap.c b/mm/rmap.c index 171b68768df1..0db38e7d0a72 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -62,6 +62,8 @@ #include <asm/tlbflush.h> +#include <trace/events/tlb.h> + #include "internal.h" static struct kmem_cache *anon_vma_cachep; @@ -583,6 +585,107 @@ vma_address(struct page *page, struct vm_area_struct *vma) return address; } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void percpu_flush_tlb_batch_pages(void *data) +{ + /* + * All TLB entries are flushed on the assumption that it is + * cheaper to flush all TLBs and let them be refilled than + * flushing individual PFNs. Note that we do not track mm's + * to flush as that might simply be multiple full TLB flushes + * for no gain. 
+ */ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + flush_tlb_local(); +} + +/* + * Flush TLB entries for recently unmapped pages from remote CPUs. It is + * important if a PTE was dirty when it was unmapped that it's flushed + * before any IO is initiated on the page to prevent lost writes. Similarly, + * it must be flushed before freeing to prevent data leakage. + */ +void try_to_unmap_flush(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; + int cpu; + + if (!tlb_ubc->flush_required) + return; + + cpu = get_cpu(); + + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL); + + if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) + percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask); + + if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) { + smp_call_function_many(&tlb_ubc->cpumask, + percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true); + } + cpumask_clear(&tlb_ubc->cpumask); + tlb_ubc->flush_required = false; + tlb_ubc->writable = false; + put_cpu(); +} + +/* Flush iff there are potentially writable TLB entries that can race with IO */ +void try_to_unmap_flush_dirty(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; + + if (tlb_ubc->writable) + try_to_unmap_flush(); +} + +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page, bool writable) +{ + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; + + cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + tlb_ubc->flush_required = true; + + /* + * If the PTE was dirty then it's best to assume it's writable. The + * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() + * before the page is queued for IO. + */ + if (writable) + tlb_ubc->writable = true; +} + +/* + * Returns true if the TLB flush should be deferred to the end of a batch of + * unmap operations to reduce IPIs.
+ */ +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + bool should_defer = false; + + if (!(flags & TTU_BATCH_FLUSH)) + return false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} +#else +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page, bool writable) +{ +} + +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + return false; +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * At what user virtual address is page expected in vma? * Caller should check the page is actually part of the vma. @@ -1220,7 +1323,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially a remote + * CPU could still be writing to the page. If the entry was + * previously clean then the architecture must guarantee that + * a clear->dirty transition on a cached TLB entry is written + * through and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pte); + + set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval)); + } else { + pteval = ptep_clear_flush(vma, address, pte); + } /* Move the dirty bit to the physical page now the pte is gone. 
*/ if (pte_dirty(pteval)) diff --git a/mm/slab.c b/mm/slab.c index 200e22412a16..60c936938b84 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1603,7 +1603,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, } /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (unlikely(page->pfmemalloc)) + if (page_is_pfmemalloc(page)) pfmemalloc_active = true; nr_pages = (1 << cachep->gfporder); @@ -1614,7 +1614,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + #ifdef CONFIG_TRACING void * kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) diff --git a/mm/slab.h b/mm/slab.h index 8da63e4e470f..a3a967d7d7c2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the objecct listed + * may be allocated or freed using these operations. 
+ */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + #ifdef CONFIG_MEMCG_KMEM /* * Iterate over all memcg caches of the given root cache. The caller must hold @@ -321,7 +330,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __func__, cachep->name, s->name); + __func__, s->name, cachep->name); WARN_ON_ONCE(1); return s; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 86831105a09f..c26829fe4e37 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -104,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) } #endif +void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) + kmem_cache_free(s, p[i]); +} + +bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, + void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) { + void *x = p[i] = kmem_cache_alloc(s, flags); + if (!x) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + } + return true; +} + #ifdef CONFIG_MEMCG_KMEM void slab_init_memcg_params(struct kmem_cache *s) { diff --git a/mm/slob.c b/mm/slob.c index 4765f65019c7..165bbd3cd606 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/mm/slub.c b/mm/slub.c index 816df0016555..084184e706c6 100644 --- 
a/mm/slub.c +++ b/mm/slub.c @@ -1306,6 +1306,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) kasan_slab_free(s, x); } +static void setup_object(struct kmem_cache *s, struct page *page, + void *object) +{ + setup_object_debug(s, page, object); + if (unlikely(s->ctor)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); + } +} + /* * Slab allocation and freeing */ @@ -1336,6 +1347,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + void *start, *p; + int idx, order; flags &= gfp_allowed_mask; @@ -1349,6 +1362,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * so we fall-back to the minimum order allocation. */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT; page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1359,13 +1374,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Try a lower order alloc if possible */ page = alloc_slab_page(s, alloc_gfp, node, oo); - - if (page) - stat(s, ORDER_FALLBACK); + if (unlikely(!page)) + goto out; + stat(s, ORDER_FALLBACK); } - if (kmemcheck_enabled && page - && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { + if (kmemcheck_enabled && + !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); @@ -1380,54 +1395,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } - if (flags & __GFP_WAIT) - local_irq_disable(); - if (!page) - return NULL; - page->objects = oo_objects(oo); - mod_zone_page_state(page_zone(page), - (s->flags & 
SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << oo_order(oo)); - - return page; -} - -static void setup_object(struct kmem_cache *s, struct page *page, - void *object) -{ - setup_object_debug(s, page, object); - if (unlikely(s->ctor)) { - kasan_unpoison_object_data(s, object); - s->ctor(object); - kasan_poison_object_data(s, object); - } -} - -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) -{ - struct page *page; - void *start; - void *p; - int order; - int idx; - - if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); - BUG(); - } - - page = allocate_slab(s, - flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); - if (!page) - goto out; order = compound_order(page); - inc_slabs_node(s, page_to_nid(page), page->objects); page->slab_cache = s; __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); start = page_address(page); @@ -1448,10 +1421,34 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->freelist = start; page->inuse = page->objects; page->frozen = 1; + out: + if (flags & __GFP_WAIT) + local_irq_disable(); + if (!page) + return NULL; + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << oo_order(oo)); + + inc_slabs_node(s, page_to_nid(page), page->objects); + return page; } +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + if (unlikely(flags & GFP_SLAB_BUG_MASK)) { + pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); + BUG(); + } + + return allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); +} + static void __free_slab(struct kmem_cache *s, struct page *page) { int order = compound_order(page); @@ -2712,7 +2709,7 @@ redo: * Determine the currently cpus per cpu slab. * The cpu may change afterward. 
However that does not matter since * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succedd. + * during the cmpxchg then the free will succeed. */ do { tid = this_cpu_read(s->cpu_slab->tid); @@ -2750,6 +2747,113 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); +/* Note that interrupts must be enabled when calling this function. */ +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct kmem_cache_cpu *c; + struct page *page; + int i; + + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = p[i]; + + BUG_ON(!object); + /* kmem cache debug support */ + s = cache_from_obj(s, object); + if (unlikely(!s)) + goto exit; + slab_free_hook(s, object); + + page = virt_to_head_page(object); + + if (c->page == page) { + /* Fastpath: local CPU free */ + set_freepointer(s, object, c->freelist); + c->freelist = object; + } else { + c->tid = next_tid(c->tid); + local_irq_enable(); + /* Slowpath: overhead locked cmpxchg_double_slab */ + __slab_free(s, page, object, _RET_IP_); + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + } + } +exit: + c->tid = next_tid(c->tid); + local_irq_enable(); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +/* Note that interrupts must be enabled when calling this function. */ +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + struct kmem_cache_cpu *c; + int i; + + /* + * Drain objects in the per cpu slab, while disabling local + * IRQs, which protects against PREEMPT and interrupts + * handlers invoking normal fastpath. 
+ */ + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = c->freelist; + + if (unlikely(!object)) { + local_irq_enable(); + /* + * Invoking slow path likely have side-effect + * of re-populating per CPU c->freelist + */ + p[i] = __slab_alloc(s, flags, NUMA_NO_NODE, + _RET_IP_, c); + if (unlikely(!p[i])) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + continue; /* goto for-loop */ + } + + /* kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); + if (unlikely(!s)) { + __kmem_cache_free_bulk(s, i, p); + c->tid = next_tid(c->tid); + local_irq_enable(); + return false; + } + + c->freelist = get_freepointer(s, object); + p[i] = object; + + /* kmem_cache debug support */ + slab_post_alloc_hook(s, flags, object); + } + c->tid = next_tid(c->tid); + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ + if (unlikely(flags & __GFP_ZERO)) { + int j; + + for (j = 0; j < i; j++) + memset(p[j], 0, s->object_size); + } + + return true; +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + + /* * Object placement in a slab is made very easy because we always start at * offset 0. 
If we tune the size of the object to the alignment then we can @@ -5181,7 +5285,7 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) - goto out_put_kobj; + goto out; err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) @@ -5208,8 +5312,6 @@ out: return err; out_del_kobj: kobject_del(&s->kobj); -out_put_kobj: - kobject_put(&s->kobj); goto out; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 41e4581af7c5..aebc2dd6e649 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2143,11 +2143,10 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) if (S_ISBLK(inode->i_mode)) { p->bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(p->bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - sys_swapon); + FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); if (error < 0) { p->bdev = NULL; - return -EINVAL; + return error; } p->old_block_size = block_size(p->bdev); error = set_blocksize(p->bdev, PAGE_SIZE); @@ -2348,7 +2347,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; - int i; int prio; int error; union swap_header *swap_header; @@ -2388,19 +2386,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->swap_file = swap_file; mapping = swap_file->f_mapping; - - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = swap_info[i]; - - if (q == p || !q->swap_file) - continue; - if (mapping == q->swap_file->f_mapping) { - error = -EBUSY; - goto bad_swap; - } - } - inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ error = claim_swapfile(p, inode); if (unlikely(error)) @@ -2433,6 +2420,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + int cpu; + p->flags |= 
SWP_SOLIDSTATE; /* * select a random position to start with to help wear leveling @@ -2451,9 +2440,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } - for_each_possible_cpu(i) { + for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; - cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c new file mode 100644 index 000000000000..77fee9325a57 --- /dev/null +++ b/mm/userfaultfd.c @@ -0,0 +1,308 @@ +/* + * mm/userfaultfd.c + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/userfaultfd_k.h> +#include <linux/mmu_notifier.h> +#include <asm/tlbflush.h> +#include "internal.h" + +static int mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct mem_cgroup *memcg; + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + void *page_kaddr; + int ret; + struct page *page; + + if (!*pagep) { + ret = -ENOMEM; + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); + if (!page) + goto out; + + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + ret = -EFAULT; + *pagep = page; + /* don't free the page */ + goto out; + } + } else { + page = *pagep; + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceeding stores to the page contents become visible before + * the set_pte_at() write. 
+ */ + __SetPageUptodate(page); + + ret = -ENOMEM; + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) + goto out_release; + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_release_uncharge_unlock; + + inc_mm_counter(dst_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, dst_vma, dst_addr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, dst_vma); + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + pte_unmap_unlock(dst_pte, ptl); + ret = 0; +out: + return ret; +out_release_uncharge_unlock: + pte_unmap_unlock(dst_pte, ptl); + mem_cgroup_cancel_charge(page, memcg); +out_release: + page_cache_release(page); + goto out; +} + +static int mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + int ret; + + _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + dst_vma->vm_page_prot)); + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_unlock; + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); + return ret; +} + +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (pud) + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a 
trans_huge_pmd. + */ + pmd = pmd_alloc(mm, pud, address); + return pmd; +} + +static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage) +{ + struct vm_area_struct *dst_vma; + ssize_t err; + pmd_t *dst_pmd; + unsigned long src_addr, dst_addr; + long copied; + struct page *page; + + /* + * Sanitize the command parameters: + */ + BUG_ON(dst_start & ~PAGE_MASK); + BUG_ON(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + BUG_ON(src_start + len <= src_start); + BUG_ON(dst_start + len <= dst_start); + + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + page = NULL; +retry: + down_read(&dst_mm->mmap_sem); + + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + err = -EINVAL; + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + goto out_unlock; + + /* + * Be strict and only allow __mcopy_atomic on userfaultfd + * registered ranges to prevent userland errors going + * unnoticed. As far as the VM consistency is concerned, it + * would be perfectly safe to remove this check, but there's + * no useful usage for __mcopy_atomic ouside of userfaultfd + * registered ranges. This is after all why these are ioctls + * belonging to the userfaultfd and not syscalls. + */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + goto out_unlock; + + /* + * FIXME: only allow copying on anonymous vmas, tmpfs should + * be added. + */ + if (dst_vma->vm_ops) + goto out_unlock; + + /* + * Ensure the dst_vma has a anon_vma or this page + * would get a NULL anon_vma when moved in the + * dst_vma. 
+ */ + err = -ENOMEM; + if (unlikely(anon_vma_prepare(dst_vma))) + goto out_unlock; + + while (src_addr < src_start + len) { + pmd_t dst_pmdval; + + BUG_ON(dst_addr >= dst_start + len); + + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); + if (unlikely(!dst_pmd)) { + err = -ENOMEM; + break; + } + + dst_pmdval = pmd_read_atomic(dst_pmd); + /* + * If the dst_pmd is mapped as THP don't + * override it and just be strict. + */ + if (unlikely(pmd_trans_huge(dst_pmdval))) { + err = -EEXIST; + break; + } + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, + dst_addr))) { + err = -ENOMEM; + break; + } + /* If an huge pmd materialized from under us fail */ + if (unlikely(pmd_trans_huge(*dst_pmd))) { + err = -EFAULT; + break; + } + + BUG_ON(pmd_none(*dst_pmd)); + BUG_ON(pmd_trans_huge(*dst_pmd)); + + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, &page); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, + dst_addr); + + cond_resched(); + + if (unlikely(err == -EFAULT)) { + void *page_kaddr; + + up_read(&dst_mm->mmap_sem); + BUG_ON(!page); + + page_kaddr = kmap(page); + err = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap(page); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + goto retry; + } else + BUG_ON(page); + + if (!err) { + dst_addr += PAGE_SIZE; + src_addr += PAGE_SIZE; + copied += PAGE_SIZE; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out_unlock: + up_read(&dst_mm->mmap_sem); +out: + if (page) + page_cache_release(page); + BUG_ON(copied < 0); + BUG_ON(err > 0); + BUG_ON(!copied && !err); + return copied ? 
copied : err; +} + +ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long src_start, unsigned long len) +{ + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); +} + +ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, + unsigned long len) +{ + return __mcopy_atomic(dst_mm, start, 0, len, true); +} diff --git a/mm/vmscan.c b/mm/vmscan.c index 8286938c70de..b1139039122a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + switch (try_to_unmap(page, + ttu_flags|TTU_BATCH_FLUSH)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1097,7 +1098,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!sc->may_writepage) goto keep_locked; - /* Page is dirty, try to write it out here */ + /* + * Page is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after IO + * starts and then write it out here. + */ + try_to_unmap_flush_dirty(); switch (pageout(page, mapping, sc)) { case PAGE_KEEP: goto keep_locked; @@ -1208,6 +1214,7 @@ keep: } mem_cgroup_uncharge_list(&free_pages); + try_to_unmap_flush(); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); @@ -2151,6 +2158,23 @@ out: } } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void init_tlb_ubc(void) +{ + /* + * This deliberately does not clear the cpumask as it's expensive + * and unnecessary. If there happens to be data in there then the + * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and + * then will be cleared. + */ + current->tlb_ubc.flush_required = false; +} +#else +static inline void init_tlb_ubc(void) +{ +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
*/ @@ -2185,6 +2209,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); + init_tlb_ubc(); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { |