diff options
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 213 |
1 files changed, 183 insertions, 30 deletions
diff --git a/mm/memory.c b/mm/memory.c index 90cea22001ef..d0f0bef3be48 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -60,6 +60,7 @@ #include <linux/migrate.h> #include <linux/string.h> #include <linux/dma-debug.h> +#include <linux/debugfs.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -1320,9 +1321,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When + * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file + * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. @@ -1705,15 +1706,6 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); - /* - * Require read or write permissions. - * If FOLL_FORCE is set, we only require the "MAY" flags. - */ - vm_flags = (gup_flags & FOLL_WRITE) ? - (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= (gup_flags & FOLL_FORCE) ? - (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - /* * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault * would be called on PROT_NONE ranges. We must never invoke @@ -1741,7 +1733,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) - return i ? : -EFAULT; + goto efault; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); else @@ -1751,12 +1743,12 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, pg); if (pmd_none(*pmd)) - return i ? : -EFAULT; + goto efault; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, pg); if (pte_none(*pte)) { pte_unmap(pte); - return i ? : -EFAULT; + goto efault; } vma = get_gate_vma(mm); if (pages) { @@ -1769,7 +1761,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, page = pte_page(*pte); else { pte_unmap(pte); - return i ? : -EFAULT; + goto efault; } } pages[i] = page; @@ -1780,10 +1772,42 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, goto next_page; } - if (!vma || - (vma->vm_flags & (VM_IO | VM_PFNMAP)) || - !(vm_flags & vma->vm_flags)) - return i ? : -EFAULT; + if (!vma) + goto efault; + vm_flags = vma->vm_flags; + if (vm_flags & (VM_IO | VM_PFNMAP)) + goto efault; + + if (gup_flags & FOLL_WRITE) { + if (!(vm_flags & VM_WRITE)) { + if (!(gup_flags & FOLL_FORCE)) + goto efault; + /* + * We used to let the write,force case do COW + * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so + * ptrace could set a breakpoint in a read-only + * mapping of an executable, without corrupting + * the file (yet only when that file had been + * opened for writing!). Anon pages in shared + * mappings are surprising: now just reject it. + */ + if (!is_cow_mapping(vm_flags)) { + WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + goto efault; + } + } + } else { + if (!(vm_flags & VM_READ)) { + if (!(gup_flags & FOLL_FORCE)) + goto efault; + /* + * Is there actually any vma we can reach here + * which does not have VM_MAYREAD set? + */ + if (!(vm_flags & VM_MAYREAD)) + goto efault; + } + } if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, @@ -1837,7 +1861,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; } if (ret & VM_FAULT_SIGBUS) - return i ? i : -EFAULT; + goto efault; BUG(); } @@ -1895,6 +1919,8 @@ next_page: } while (nr_pages && start < vma->vm_end); } while (nr_pages); return i; +efault: + return i ? : -EFAULT; } EXPORT_SYMBOL(__get_user_pages); @@ -1962,9 +1988,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, * @start: starting user address * @nr_pages: number of pages from start to pin * @write: whether pages will be written to by the caller - * @force: whether to force write access even if user mapping is - * readonly. This will result in the page being COWed even - * in MAP_SHARED mappings. You do not want this. + * @force: whether to force access even when user mapping is currently + * protected (but never forces write access to shared mapping). * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. @@ -2757,7 +2782,7 @@ reuse: */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + set_page_dirty_balance(dirty_page); /* file_update_time outside page_lock */ if (vma->vm_file) file_update_time(vma->vm_file); @@ -2803,7 +2828,7 @@ gotten: } __SetPageUptodate(new_page); - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) goto oom_free_new; mmun_start = address & PAGE_MASK; @@ -3256,7 +3281,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -3318,7 +3343,22 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, return ret; } -static void do_set_pte(struct vm_area_struct *vma, unsigned long address, +/** + * do_set_pte - setup new PTE entry for given page and add reverse page mapping. + * + * @vma: virtual memory area + * @address: user virtual address + * @page: page to map + * @pte: pointer to target page table entry + * @write: true, if new entry is writable + * @anon: true, if it's anonymous page + * + * Caller must hold page table lock relevant for @pte. + * + * Target users are page handler itself and implementations of + * vm_ops->map_pages. + */ +void do_set_pte(struct vm_area_struct *vma, unsigned long address, struct page *page, pte_t *pte, bool write, bool anon) { pte_t entry; @@ -3342,6 +3382,105 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } +#define FAULT_AROUND_ORDER 4 + +#ifdef CONFIG_DEBUG_FS +static unsigned int fault_around_order = FAULT_AROUND_ORDER; + +static int fault_around_order_get(void *data, u64 *val) +{ + *val = fault_around_order; + return 0; +} + +static int fault_around_order_set(void *data, u64 val) +{ + BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); + if (1UL << val > PTRS_PER_PTE) + return -EINVAL; + fault_around_order = val; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, + fault_around_order_get, fault_around_order_set, "%llu\n"); + +static int __init fault_around_debugfs(void) +{ + void *ret; + + ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, + &fault_around_order_fops); + if (!ret) + pr_warn("Failed to create fault_around_order in debugfs"); + return 0; +} +late_initcall(fault_around_debugfs); + +static inline unsigned long fault_around_pages(void) +{ + return 1UL << fault_around_order; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); +} +#else +static inline unsigned long fault_around_pages(void) +{ + unsigned long nr_pages; + + nr_pages = 1UL << FAULT_AROUND_ORDER; + BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); + return nr_pages; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); +} +#endif + +static void do_fault_around(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pgoff_t pgoff, unsigned int flags) +{ + unsigned long start_addr; + pgoff_t max_pgoff; + struct vm_fault vmf; + int off; + + start_addr = max(address & fault_around_mask(), vma->vm_start); + off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + pte -= off; + pgoff -= off; + + /* + * max_pgoff is either end of page table or end of vma + * or fault_around_pages() from pgoff, depending what is neast. + */ + max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + PTRS_PER_PTE - 1; + max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, + pgoff + fault_around_pages() - 1); + + /* Check if it makes any sense to call ->map_pages */ + while (!pte_none(*pte)) { + if (++pgoff > max_pgoff) + return; + start_addr += PAGE_SIZE; + if (start_addr >= vma->vm_end) + return; + pte++; + } + + vmf.virtual_address = (void __user *) start_addr; + vmf.pte = pte; + vmf.pgoff = pgoff; + vmf.max_pgoff = max_pgoff; + vmf.flags = flags; + vma->vm_ops->map_pages(vma, &vmf); +} + static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) @@ -3349,7 +3488,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *fault_page; spinlock_t *ptl; pte_t *pte; - int ret; + int ret = 0; + + /* + * Let's call ->map_pages() first and use ->fault() as fallback + * if page by the offset is not ready to be mapped (cold cache or + * something). + */ + if (vma->vm_ops->map_pages) { + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + do_fault_around(vma, address, pte, pgoff, flags); + if (!pte_same(*pte, orig_pte)) + goto unlock_out; + pte_unmap_unlock(pte, ptl); + } ret = __do_fault(vma, address, pgoff, flags, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3363,8 +3515,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } do_set_pte(vma, address, fault_page, pte, false, false); - pte_unmap_unlock(pte, ptl); unlock_page(fault_page); +unlock_out: + pte_unmap_unlock(pte, ptl); return ret; } @@ -3384,7 +3537,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) { + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { page_cache_release(new_page); return VM_FAULT_OOM; } |