From 78d683e838a60ec4ba4591cca4364cba84a9e626 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 May 2014 15:58:32 -0700 Subject: mm, fs: Add vm_ops->name as an alternative to arch_vma_name arch_vma_name sucks. It's a silly hack, and it's annoying to implement correctly. In fact, AFAICS, even the straightforward x86 implementation is incorrect (I suspect that it breaks if the vdso mapping is split or gets remapped). This adds a new vm_ops->name operation that can replace it. The followup patches will remove all uses of arch_vma_name on x86, fixing a couple of annoyances in the process. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/2eee21791bb36a0a408c5c2bdb382a9e6a41ca4a.1400538962.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index bf9811e1321a..63f8d4efe303 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -239,6 +239,12 @@ struct vm_operations_struct { */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); + + /* Called by the /proc/PID/maps code to ask the vma whether it + * has a special name. Returning non-NULL will also cause this + * vma to be dumped unconditionally. */ + const char *(*name)(struct vm_area_struct *vma); + #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy -- cgit v1.2.3 From a62c34bd2a8a3f159945becd57401e478818d51c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 19 May 2014 15:58:33 -0700 Subject: x86, mm: Improve _install_special_mapping and fix x86 vdso naming Using arch_vma_name to give special mappings a name is awkward. x86 currently implements it by comparing the start address of the vma to the expected address of the vdso. This requires tracking the start address of special mappings and is probably buggy if a special vma is split or moved. Improve _install_special_mapping to just name the vma directly. Use it to give the x86 vvar area a name, which should make CRIU's life easier. As a side effect, the vvar area will show up in core dumps. This could be considered weird and is fixable. [hpa: I say we accept this as-is but be prepared to deal with knocking out the vvars from core dumps if this becomes a problem.] Cc: Cyrill Gorcunov Cc: Pavel Emelyanov Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/276b39b6b645fb11e345457b503f17b83c2c6fd0.1400538962.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vdso.h | 6 ++- arch/x86/mm/init_64.c | 3 -- arch/x86/vdso/vdso2c.h | 5 ++- arch/x86/vdso/vdso32-setup.c | 7 ---- arch/x86/vdso/vma.c | 25 ++++++++----- include/linux/mm.h | 4 +- include/linux/mm_types.h | 6 +++ mm/mmap.c | 89 +++++++++++++++++++++++++++++--------------- 8 files changed, 94 insertions(+), 51 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index d0a2c909c72d..30be253dd283 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -7,10 +7,14 @@ #ifndef __ASSEMBLER__ +#include + struct vdso_image { void *data; unsigned long size; /* Always a multiple of PAGE_SIZE */ - struct page **pages; /* Big enough for data/size page pointers */ + + /* text_mapping.pages is big enough for data/size page pointers */ + struct vm_special_mapping text_mapping; unsigned long alt, alt_len; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6f881842116c..9deb59b0baea 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1223,9 +1223,6 @@ int in_gate_area_no_mm(unsigned long addr) const char *arch_vma_name(struct vm_area_struct *vma) { - if (vma->vm_mm && vma->vm_start == - (long __force)vma->vm_mm->context.vdso) - return "[vdso]"; if (vma == &gate_vma) return "[vsyscall]"; return NULL; diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index ed2e894e89ab..3dcc61e796e9 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -136,7 +136,10 @@ static int GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) fprintf(outfile, "const struct vdso_image %s = {\n", name); fprintf(outfile, "\t.data = raw_data,\n"); fprintf(outfile, "\t.size = %lu,\n", data_size); - fprintf(outfile, "\t.pages = pages,\n"); + fprintf(outfile, "\t.text_mapping = {\n"); + fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); + fprintf(outfile, "\t\t.pages = pages,\n"); + fprintf(outfile, "\t},\n"); if (alt_sec) { fprintf(outfile, "\t.alt = %lu,\n", (unsigned long)alt_sec->sh_offset); diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index c3ed708e50f4..e4f7781ee162 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -119,13 +119,6 @@ __initcall(ia32_binfmt_init); #else /* CONFIG_X86_32 */ -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) - return "[vdso]"; - return NULL; -} - struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 8ad0081df7a8..e1513c47872a 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -30,7 +30,8 @@ void __init init_vdso_image(const struct vdso_image *image) BUG_ON(image->size % PAGE_SIZE != 0); for (i = 0; i < npages; i++) - image->pages[i] = virt_to_page(image->data + i*PAGE_SIZE); + image->text_mapping.pages[i] = + virt_to_page(image->data + i*PAGE_SIZE); apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + @@ -91,6 +92,10 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) unsigned long addr; int ret = 0; static struct page *no_pages[] = {NULL}; + static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .pages = no_pages, + }; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -112,21 +117,23 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) /* * MAYWRITE to allow gdb to COW and set breakpoints */ - ret = install_special_mapping(mm, - addr, - image->size, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - image->pages); + vma = _install_special_mapping(mm, + addr, + image->size, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + &image->text_mapping); - if (ret) + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto up_fail; + } vma = _install_special_mapping(mm, addr + image->size, image->sym_end_mapping - image->size, VM_READ, - no_pages); + &vvar_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/include/linux/mm.h b/include/linux/mm.h index 63f8d4efe303..05aab09803e6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1782,7 +1782,9 @@ extern struct file *get_mm_exe_file(struct mm_struct *mm); extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long flags, struct page **pages); + unsigned long flags, + const struct vm_special_mapping *spec); +/* This is an obsolete alternative to _install_special_mapping. */ extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8967e20cbe57..22c6f4e16d10 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -510,4 +510,10 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm) } #endif +struct vm_special_mapping +{ + const char *name; + struct page **pages; +}; + #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/mmap.c b/mm/mmap.c index b1202cf81f4b..52bbc9514d9d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2872,6 +2872,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) return 1; } +static int special_mapping_fault(struct vm_area_struct *vma, + struct vm_fault *vmf); + +/* + * Having a close hook prevents vma merging regardless of flags. + */ +static void special_mapping_close(struct vm_area_struct *vma) +{ +} + +static const char *special_mapping_name(struct vm_area_struct *vma) +{ + return ((struct vm_special_mapping *)vma->vm_private_data)->name; +} + +static const struct vm_operations_struct special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, + .name = special_mapping_name, +}; + +static const struct vm_operations_struct legacy_special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, +}; static int special_mapping_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -2887,7 +2912,13 @@ static int special_mapping_fault(struct vm_area_struct *vma, */ pgoff = vmf->pgoff - vma->vm_pgoff; - for (pages = vma->vm_private_data; pgoff && *pages; ++pages) + if (vma->vm_ops == &legacy_special_mapping_vmops) + pages = vma->vm_private_data; + else + pages = ((struct vm_special_mapping *)vma->vm_private_data)-> + pages; + + for (; pgoff && *pages; ++pages) pgoff--; if (*pages) { @@ -2900,30 +2931,11 @@ static int special_mapping_fault(struct vm_area_struct *vma, return VM_FAULT_SIGBUS; } -/* - * Having a close hook prevents vma merging regardless of flags. - */ -static void special_mapping_close(struct vm_area_struct *vma) -{ -} - -static const struct vm_operations_struct special_mapping_vmops = { - .close = special_mapping_close, - .fault = special_mapping_fault, -}; - -/* - * Called with mm->mmap_sem held for writing. - * Insert a new vma covering the given region, with the given flags. - * Its pages are supplied by the given array of struct page *. - * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. - * The region past the last page supplied will always produce SIGBUS. - * The array pointer and the pages it points to are assumed to stay alive - * for as long as this mapping might exist. - */ -struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long vm_flags, struct page **pages) +static struct vm_area_struct *__install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_operations_struct *ops, + void *priv) { int ret; struct vm_area_struct *vma; @@ -2940,8 +2952,8 @@ struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - vma->vm_ops = &special_mapping_vmops; - vma->vm_private_data = pages; + vma->vm_ops = ops; + vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) @@ -2958,12 +2970,31 @@ out: return ERR_PTR(ret); } +/* + * Called with mm->mmap_sem held for writing. + * Insert a new vma covering the given region, with the given flags. + * Its pages are supplied by the given array of struct page *. + * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. + * The region past the last page supplied will always produce SIGBUS. + * The array pointer and the pages it points to are assumed to stay alive + * for as long as this mapping might exist. + */ +struct vm_area_struct *_install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_special_mapping *spec) +{ + return __install_special_mapping(mm, addr, len, vm_flags, + &special_mapping_vmops, (void *)spec); +} + int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { - struct vm_area_struct *vma = _install_special_mapping(mm, - addr, len, vm_flags, pages); + struct vm_area_struct *vma = __install_special_mapping( + mm, addr, len, vm_flags, &legacy_special_mapping_vmops, + (void *)pages); if (IS_ERR(vma)) return PTR_ERR(vma); -- cgit v1.2.3 From d2ee40eae98d8a41ff27dcdd13b1b656c4c1ad00 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:08:02 -0700 Subject: mm: introdule compound_head_by_tail() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, in put_compound_page(), we have ====== if (likely(!PageTail(page))) { <------ (1) if (put_page_testzero(page)) { /* ¦* By the time all refcounts have been released ¦* split_huge_page cannot run anymore from under us. ¦*/ if (PageHead(page)) __put_compound_page(page); else __put_single_page(page); } return; } /* __split_huge_page_refcount can run under us */ page_head = compound_head(page); <------------ (2) ====== if at (1) , we fail the check, this means page is *likely* a tail page. Then at (2), as compoud_head(page) is inlined, it is : ====== static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) { <----------- (3) struct page *head = page->first_page; smp_rmb(); if (likely(PageTail(page))) return head; } return page; } ====== here, the (3) unlikely in the case is a negative hint, because it is *likely* a tail page. So the check (3) in this case is not good, so I introduce a helper for this case. So this patch introduces compound_head_by_tail() which deals with a possible tail page(though it could be spilt by a racy thread), and make compound_head() a wrapper on it. This patch has no functional change, and it reduces the object size slightly: text data bss dec hex filename 11003 1328 16 12347 303b mm/swap.o.orig 10971 1328 16 12315 301b mm/swap.o.patched I've ran "perf top -e branch-miss" to observe branch-miss in this case. As Michael points out, it's a slow path, so only very few times this case happens. But I grep'ed the code base, and found there still are some other call sites could be benifited from this helper. And given that it only bloating up the source by only 5 lines, but with a reduced object size. I still believe this helper deserves to exsit. Signed-off-by: Jianyu Zhan Cc: Kirill A. Shutemov Cc: Rik van Riel Cc: Jiang Liu Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Sasha Levin Cc: Wanpeng Li Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 29 +++++++++++++++++------------ mm/swap.c | 2 +- 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index d6777060449f..368600628d14 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -407,20 +407,25 @@ static inline void compound_unlock_irqrestore(struct page *page, #endif } +static inline struct page *compound_head_by_tail(struct page *tail) +{ + struct page *head = tail->first_page; + + /* + * page->first_page may be a dangling pointer to an old + * compound page, so recheck that it is still a tail + * page before returning. + */ + smp_rmb(); + if (likely(PageTail(tail))) + return head; + return tail; +} + static inline struct page *compound_head(struct page *page) { - if (unlikely(PageTail(page))) { - struct page *head = page->first_page; - - /* - * page->first_page may be a dangling pointer to an old - * compound page, so recheck that it is still a tail - * page before returning. - */ - smp_rmb(); - if (likely(PageTail(page))) - return head; - } + if (unlikely(PageTail(page))) + return compound_head_by_tail(page); return page; } diff --git a/mm/swap.c b/mm/swap.c index d089c5a0cf98..c8d6df556ce6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -253,7 +253,7 @@ static void put_compound_page(struct page *page) * Case 3 is possible, as we may race with * __split_huge_page_refcount tearing down a THP page. */ - page_head = compound_head(page); + page_head = compound_head_by_tail(page); if (!__compound_tail_refcounted(page_head)) put_unrefcounted_compound_page(page_head, page); else -- cgit v1.2.3