From 6879eb2deed7171a81b2f904c9ad14b9648689a7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:17 -0400 Subject: x86-64: Fix alignment of jiffies variable It's declared __attribute__((aligned(16)) but it's explicitly not aligned. This is probably harmless but it's a bit embarrassing. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/5f3bc5542e9aaa9382d53f153f54373165cdef89.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/vvar.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 341b3559452b..a4eaca4a6133 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -45,7 +45,7 @@ /* DECLARE_VVAR(offset, type, name) */ DECLARE_VVAR(0, volatile unsigned long, jiffies) -DECLARE_VVAR(8, int, vgetcpu_mode) +DECLARE_VVAR(16, int, vgetcpu_mode) DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) #undef DECLARE_VVAR -- cgit v1.2.1 From 8b4777a4b50cb0c84c1152eac85d24415fb6ff7d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:18 -0400 Subject: x86-64: Document some of entry_64.S Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/fc134867cc550977cc996866129e11a16ba0f9ea.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8a445a0c989e..72c4a777bb91 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -9,6 +9,8 @@ /* * entry.S contains the system-call and fault low-level handling routines. * + * Some of this is documented in Documentation/x86/entry_64.txt + * * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. * -- cgit v1.2.1 From 9fd67b4ed0714ab718f1f9bd14c344af336a6df7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:19 -0400 Subject: x86-64: Give vvars their own page Move vvars out of the vsyscall page into their own page and mark it NX. Without this patch, an attacker who can force a daemon to call some fixed address could wait until the time contains, say, 0xCD80, and then execute the current time. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/b1460f81dc4463d66ea3f2b5ce240f58d48effec.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/fixmap.h | 1 + arch/x86/include/asm/pgtable_types.h | 2 ++ arch/x86/include/asm/vvar.h | 22 ++++++++++------------ arch/x86/kernel/vmlinux.lds.S | 28 +++++++++++++++++----------- arch/x86/kernel/vsyscall_64.c | 5 +++++ 5 files changed, 35 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 4729b2b63117..460c74e4852c 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -78,6 +78,7 @@ enum fixed_addresses { VSYSCALL_LAST_PAGE, VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, + VVAR_PAGE, VSYSCALL_HPET, #endif FIX_DBGP_BASE, diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index d56187c6b838..6a29aed65902 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -108,6 +108,7 @@ #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) #define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) +#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -130,6 +131,7 @@ #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) #define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) +#define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index a4eaca4a6133..de656ac2af41 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -10,15 +10,14 @@ * In normal kernel code, they are used like any other variable. * In user code, they are accessed through the VVAR macro. * - * Each of these variables lives in the vsyscall page, and each - * one needs a unique offset within the little piece of the page - * reserved for vvars. Specify that offset in DECLARE_VVAR. - * (There are 896 bytes available. If you mess up, the linker will - * catch it.) + * These variables live in a page of kernel data that has an extra RO + * mapping for userspace. Each variable needs a unique offset within + * that page; specify that offset with the DECLARE_VVAR macro. (If + * you mess up, the linker will catch it.) */ -/* Offset of vars within vsyscall page */ -#define VSYSCALL_VARS_OFFSET (3072 + 128) +/* Base address of vvars. This is not ABI. */ +#define VVAR_ADDRESS (-10*1024*1024 - 4096) #if defined(__VVAR_KERNEL_LDS) @@ -26,17 +25,17 @@ * right place. */ #define DECLARE_VVAR(offset, type, name) \ - EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset) + EMIT_VVAR(name, offset) #else #define DECLARE_VVAR(offset, type, name) \ static type const * const vvaraddr_ ## name = \ - (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset)); + (void *)(VVAR_ADDRESS + (offset)); #define DEFINE_VVAR(type, name) \ - type __vvar_ ## name \ - __attribute__((section(".vsyscall_var_" #name), aligned(16))) + type name \ + __attribute__((section(".vvar_" #name), aligned(16))) #define VVAR(name) (*vvaraddr_ ## name) @@ -49,4 +48,3 @@ DECLARE_VVAR(16, int, vgetcpu_mode) DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) #undef DECLARE_VVAR -#undef VSYSCALL_VARS_OFFSET diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 89aed99aafce..98b378dc7f86 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -161,12 +161,6 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) -#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \ - ADDR(.vsyscall_0) + offset \ - : AT(VLOAD(.vsyscall_var_ ## x)) { \ - *(.vsyscall_var_ ## x) \ - } \ - x = VVIRT(.vsyscall_var_ ## x); . = ALIGN(4096); __vsyscall_0 = .; @@ -192,19 +186,31 @@ SECTIONS *(.vsyscall_3) } -#define __VVAR_KERNEL_LDS -#include -#undef __VVAR_KERNEL_LDS - - . = __vsyscall_0 + PAGE_SIZE; + . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); #undef VSYSCALL_ADDR #undef VLOAD_OFFSET #undef VLOAD #undef VVIRT_OFFSET #undef VVIRT + + __vvar_page = .; + + .vvar : AT(ADDR(.vvar) - LOAD_OFFSET) { + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) \ + . = offset; \ + *(.vvar_ ## name) +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS #undef EMIT_VVAR + } :data + + . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE); + #endif /* CONFIG_X86_64 */ /* Init code and data - will be freed after init */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3e682184d76c..3cf1cef75a6a 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -284,9 +284,14 @@ void __init map_vsyscall(void) { extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + extern char __vvar_page; + unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); + __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != + (unsigned long)VVAR_ADDRESS); } static int __init vsyscall_init(void) -- cgit v1.2.1 From 0d7b8547fb67d5c2a7d954c56b3715b0e708be4a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:20 -0400 Subject: x86-64: Remove kernel.vsyscall64 sysctl It's unnecessary overhead in code that's supposed to be highly optimized. Removing it allows us to remove one of the two syscall instructions in the vsyscall page. The only sensible use for it is for UML users, and it doesn't fully address inconsistent vsyscall results on UML. The real fix for UML is to stop using vsyscalls entirely. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/973ae803fe76f712da4b2740e66dccf452d3b1e4.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/vgtod.h | 1 - arch/x86/kernel/vsyscall_64.c | 34 +------------------------- arch/x86/vdso/vclock_gettime.c | 55 ++++++++++++++++-------------------------- 3 files changed, 22 insertions(+), 68 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 646b4c1ca695..aa5add855a91 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -11,7 +11,6 @@ struct vsyscall_gtod_data { time_t wall_time_sec; u32 wall_time_nsec; - int sysctl_enabled; struct timezone sys_tz; struct { /* extract of a clocksource struct */ cycle_t (*vread)(void); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 3cf1cef75a6a..9b2f3f51bc91 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -53,7 +53,6 @@ DEFINE_VVAR(int, vgetcpu_mode); DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), - .sysctl_enabled = 1, }; void update_vsyscall_tz(void) @@ -103,15 +102,6 @@ static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) return ret; } -static __always_inline long time_syscall(long *t) -{ - long secs; - asm volatile("syscall" - : "=a" (secs) - : "0" (__NR_time),"D" (t) : __syscall_clobber); - return secs; -} - static __always_inline void do_vgettimeofday(struct timeval * tv) { cycle_t now, base, mask, cycle_delta; @@ -122,8 +112,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); vread = VVAR(vsyscall_gtod_data).clock.vread; - if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled || - !vread)) { + if (unlikely(!vread)) { gettimeofday(tv,NULL); return; } @@ -165,8 +154,6 @@ time_t __vsyscall(1) vtime(time_t *t) { unsigned seq; time_t result; - if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) - return time_syscall(t); do { seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); @@ -227,22 +214,6 @@ static long __vsyscall(3) venosys_1(void) return -ENOSYS; } -#ifdef CONFIG_SYSCTL -static ctl_table kernel_table2[] = { - { .procname = "vsyscall64", - .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec }, - {} -}; - -static ctl_table kernel_root_table2[] = { - { .procname = "kernel", .mode = 0555, - .child = kernel_table2 }, - {} -}; -#endif - /* Assume __initcall executes before all user space. Hopefully kmod doesn't violate that. We'll find out if it does. */ static void __cpuinit vsyscall_set_cpu(int cpu) @@ -301,9 +272,6 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); -#ifdef CONFIG_SYSCTL - register_sysctl_table(kernel_root_table2); -#endif on_each_cpu(cpu_vsyscall_init, NULL, 1); /* notifier priority > KVM */ hotcpu_notifier(cpu_vsyscall_notifier, 30); diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index a724905fdae7..cf54813ac527 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -116,21 +116,21 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { - if (likely(gtod->sysctl_enabled)) - switch (clock) { - case CLOCK_REALTIME: - if (likely(gtod->clock.vread)) - return do_realtime(ts); - break; - case CLOCK_MONOTONIC: - if (likely(gtod->clock.vread)) - return do_monotonic(ts); - break; - case CLOCK_REALTIME_COARSE: - return do_realtime_coarse(ts); - case CLOCK_MONOTONIC_COARSE: - return do_monotonic_coarse(ts); - } + switch (clock) { + case CLOCK_REALTIME: + if (likely(gtod->clock.vread)) + return do_realtime(ts); + break; + case CLOCK_MONOTONIC: + if (likely(gtod->clock.vread)) + return do_monotonic(ts); + break; + case CLOCK_REALTIME_COARSE: + return do_realtime_coarse(ts); + case CLOCK_MONOTONIC_COARSE: + return do_monotonic_coarse(ts); + } + return vdso_fallback_gettime(clock, ts); } int clock_gettime(clockid_t, struct timespec *) @@ -139,7 +139,7 @@ int clock_gettime(clockid_t, struct timespec *) notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; - if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { + if (likely(gtod->clock.vread)) { if (likely(tv != NULL)) { BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != offsetof(struct timespec, tv_nsec) || @@ -161,27 +161,14 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) int gettimeofday(struct timeval *, struct timezone *) __attribute__((weak, alias("__vdso_gettimeofday"))); -/* This will break when the xtime seconds get inaccurate, but that is - * unlikely */ - -static __always_inline long time_syscall(long *t) -{ - long secs; - asm volatile("syscall" - : "=a" (secs) - : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory"); - return secs; -} - +/* + * This will break when the xtime seconds get inaccurate, but that is + * unlikely + */ notrace time_t __vdso_time(time_t *t) { - time_t result; - - if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) - return time_syscall(t); - /* This is atomic on x86_64 so we don't need any locks. */ - result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); + time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); if (t) *t = result; -- cgit v1.2.1 From d319bb79afa4039bda6f85661d6bf0c13299ce93 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:21 -0400 Subject: x86-64: Map the HPET NX Currently the HPET mapping is a user-accessible syscall instruction at a fixed address some of the time. A sufficiently determined hacker might be able to guess when. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/ab41b525a4ca346b1ca1145d16fb8d181861a8aa.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_types.h | 4 ++-- arch/x86/kernel/hpet.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 6a29aed65902..013286a10c2c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -107,8 +107,8 @@ #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD) #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) -#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) +#define __PAGE_KERNEL_VVAR_NOCACHE (__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -130,8 +130,8 @@ #define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE) #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) +#define PAGE_KERNEL_VVAR_NOCACHE __pgprot(__PAGE_KERNEL_VVAR_NOCACHE) #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 6781765b3a0d..e9f5605e4748 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -71,7 +71,7 @@ static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); #ifdef CONFIG_X86_64 - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE); #endif } -- cgit v1.2.1 From bb5fe2f78eadf5a52d8dcbf9a57728fd107af97b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:22 -0400 Subject: x86-64: Remove vsyscall number 3 (venosys) It just segfaults since April 2008 (a4928cff), so I'm pretty sure that nothing uses it. And having an empty section makes the linker script a bit fragile. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/4a4abcf47ecadc269f2391a313576fe6d06acef7.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 4 ---- arch/x86/kernel/vsyscall_64.c | 5 ----- 2 files changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 98b378dc7f86..4f90082fd640 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -182,10 +182,6 @@ SECTIONS *(.vsyscall_2) } - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { - *(.vsyscall_3) - } - . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); #undef VSYSCALL_ADDR diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9b2f3f51bc91..70a5f6eebd6c 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -209,11 +209,6 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) return 0; } -static long __vsyscall(3) venosys_1(void) -{ - return -ENOSYS; -} - /* Assume __initcall executes before all user space. Hopefully kmod doesn't violate that. We'll find out if it does. */ static void __cpuinit vsyscall_set_cpu(int cpu) -- cgit v1.2.1 From 5dfcea629a08b4684a019cd0cb59d0c9129a6c02 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:23 -0400 Subject: x86-64: Fill unused parts of the vsyscall page with 0xcc Jumping to 0x00 might do something depending on the following bytes. Jumping to 0xcc is a trap. So fill the unused parts of the vsyscall page with 0xcc to make it useless for exploits to jump there. Signed-off-by: Andy Lutomirski Cc: Jesper Juhl Cc: Borislav Petkov Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/ed54bfcfbe50a9070d20ec1edbe0d149e22a4568.1307292171.git.luto@mit.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4f90082fd640..80174719910c 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -166,22 +166,20 @@ SECTIONS __vsyscall_0 = .; . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { + .vsyscall : AT(VLOAD(.vsyscall)) { *(.vsyscall_0) - } :user - . = ALIGN(L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + . = ALIGN(L1_CACHE_BYTES); *(.vsyscall_fn) - } - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + . = 1024; *(.vsyscall_1) - } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + + . = 2048; *(.vsyscall_2) - } + . = 4096; /* Pad the whole page. */ + } :user =0xcc . = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE); #undef VSYSCALL_ADDR -- cgit v1.2.1 From 5cec93c216db77c45f7ce970d46283bcb1933884 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 5 Jun 2011 13:50:24 -0400 Subject: x86-64: Emulate legacy vsyscalls There's a fair amount of code in the vsyscall page. It contains a syscall instruction (in the gettimeofday fallback) and who knows what will happen if an exploit jumps into the middle of some other code. Reduce the risk by replacing the vsyscalls with short magic incantations that cause the kernel to emulate the real vsyscalls. These incantations are useless if entered in the middle. This causes vsyscalls to be a little more expensive than real syscalls. Fortunately sensible programs don't use them. The only exception is time() which is still called by glibc through the vsyscall - but calling time() millions of times per second is not sensible. glibc has this fixed in the development tree. This patch is not perfect: the vread_tsc and vread_hpet functions are still at a fixed address. Fixing that might involve making alternative patching work in the vDSO. Signed-off-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Jesper Juhl Cc: Borislav Petkov Cc: Arjan van de Ven Cc: Jan Beulich Cc: richard -rw- weinberger Cc: Mikael Pettersson Cc: Andi Kleen Cc: Brian Gerst Cc: Louis Rilling Cc: Valdis.Kletnieks@vt.edu Cc: pageexec@freemail.hu Link: http://lkml.kernel.org/r/e64e1b3c64858820d12c48fa739efbd1485e79d5.1307292171.git.luto@mit.edu [ Removed the CONFIG option - it's simpler to just do it unconditionally. Tidied up the code as well. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 6 +- arch/x86/include/asm/traps.h | 4 + arch/x86/include/asm/vsyscall.h | 12 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/entry_64.S | 2 + arch/x86/kernel/traps.c | 6 + arch/x86/kernel/vsyscall_64.c | 261 +++++++++++++++++-------------------- arch/x86/kernel/vsyscall_emu_64.S | 27 ++++ 8 files changed, 179 insertions(+), 140 deletions(-) create mode 100644 arch/x86/kernel/vsyscall_emu_64.S (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 6e976ee3b3ef..a563c509edcb 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -17,7 +17,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts + * Vector 204 : legacy x86_64 vsyscall emulation + * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. @@ -50,6 +51,9 @@ #ifdef CONFIG_X86_32 # define SYSCALL_VECTOR 0x80 #endif +#ifdef CONFIG_X86_64 +# define VSYSCALL_EMU_VECTOR 0xcc +#endif /* * Vectors 0x30-0x3f are used for ISA interrupts. diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0310da67307f..2bae0a513b40 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_TRAPS_H #define _ASM_X86_TRAPS_H +#include + #include #include /* TRAP_TRACE, ... */ @@ -38,6 +40,7 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); #endif /* CONFIG_X86_MCE */ asmlinkage void simd_coprocessor_error(void); +asmlinkage void emulate_vsyscall(void); dotraplinkage void do_divide_error(struct pt_regs *, long); dotraplinkage void do_debug(struct pt_regs *, long); @@ -64,6 +67,7 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long); dotraplinkage void do_machine_check(struct pt_regs *, long); #endif dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); +dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d55597351f6a..bb710cb0cdc1 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -31,6 +31,18 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); +/* Emulation */ + +static inline bool is_vsyscall_entry(unsigned long addr) +{ + return (addr & ~0xC00UL) == VSYSCALL_START; +} + +static inline int vsyscall_entry_nr(unsigned long addr) +{ + return (addr & 0xC00UL) >> 10; +} + #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 90b06d4daee2..cc0469a65120 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,6 +44,7 @@ obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o +obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 72c4a777bb91..e949793d6b93 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1123,6 +1123,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug zeroentry coprocessor_error do_coprocessor_error errorentry alignment_check do_alignment_check zeroentry simd_coprocessor_error do_simd_coprocessor_error +zeroentry emulate_vsyscall do_emulate_vsyscall + /* Reload gs selector with exception handling */ /* edi: new selector */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b9b67166f9de..fbc097a085ca 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -872,6 +872,12 @@ void __init trap_init(void) set_bit(SYSCALL_VECTOR, used_vectors); #endif +#ifdef CONFIG_X86_64 + BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors)); + set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall); + set_bit(VSYSCALL_EMU_VECTOR, used_vectors); +#endif + /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 70a5f6eebd6c..10cd8ac3395a 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -2,6 +2,8 @@ * Copyright (C) 2001 Andrea Arcangeli SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * + * [ NOTE: this mechanism is now deprecated in favor of the vDSO. ] + * * Thanks to hpa@transmeta.com for some useful hint. * Special thanks to Ingo Molnar for his early experience with * a different vsyscall implementation for Linux/IA32 and for the name. @@ -11,10 +13,9 @@ * vsyscalls. One vsyscall can reserve more than 1 slot to avoid * jumping out of line if necessary. We cannot add more with this * mechanism because older kernels won't return -ENOSYS. - * If we want more than four we need a vDSO. * - * Note: the concept clashes with user mode linux. If you use UML and - * want per guest time just set the kernel.vsyscall64 sysctl to 0. + * Note: the concept clashes with user mode linux. UML users should + * use the vDSO. */ /* Disable profiling for userspace code: */ @@ -32,6 +33,8 @@ #include #include #include +#include +#include #include #include @@ -44,10 +47,7 @@ #include #include #include - -#define __vsyscall(nr) \ - __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace -#define __syscall_clobber "r11","cx","memory" +#include DEFINE_VVAR(int, vgetcpu_mode); DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = @@ -71,146 +71,129 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, unsigned long flags; write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ - vsyscall_gtod_data.clock.vread = clock->vread; - vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; - vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = mult; - vsyscall_gtod_data.clock.shift = clock->shift; - vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = *wtm; - vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; + vsyscall_gtod_data.clock.mask = clock->mask; + vsyscall_gtod_data.clock.mult = mult; + vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + vsyscall_gtod_data.wall_to_monotonic = *wtm; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -/* RED-PEN may want to readd seq locking, but then the variable should be - * write-once. - */ -static __always_inline void do_get_tz(struct timezone * tz) +static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) { - *tz = VVAR(vsyscall_gtod_data).sys_tz; -} + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + struct task_struct *tsk; -static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - int ret; - asm volatile("syscall" - : "=a" (ret) - : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) - : __syscall_clobber ); - return ret; -} + if (!show_unhandled_signals || !__ratelimit(&rs)) + return; -static __always_inline void do_vgettimeofday(struct timeval * tv) -{ - cycle_t now, base, mask, cycle_delta; - unsigned seq; - unsigned long mult, shift, nsec; - cycle_t (*vread)(void); - do { - seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); - - vread = VVAR(vsyscall_gtod_data).clock.vread; - if (unlikely(!vread)) { - gettimeofday(tv,NULL); - return; - } - - now = vread(); - base = VVAR(vsyscall_gtod_data).clock.cycle_last; - mask = VVAR(vsyscall_gtod_data).clock.mask; - mult = VVAR(vsyscall_gtod_data).clock.mult; - shift = VVAR(vsyscall_gtod_data).clock.shift; - - tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec; - nsec = VVAR(vsyscall_gtod_data).wall_time_nsec; - } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); - - /* calculate interval: */ - cycle_delta = (now - base) & mask; - /* convert to nsecs: */ - nsec += (cycle_delta * mult) >> shift; - - while (nsec >= NSEC_PER_SEC) { - tv->tv_sec += 1; - nsec -= NSEC_PER_SEC; - } - tv->tv_usec = nsec / NSEC_PER_USEC; -} + tsk = current; -int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) -{ - if (tv) - do_vgettimeofday(tv); - if (tz) - do_get_tz(tz); - return 0; + printk("%s%s[%d] %s ip:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, tsk->comm, task_pid_nr(tsk), + message, regs->ip - 2, regs->sp, regs->ax, regs->si, regs->di); } -/* This will break when the xtime seconds get inaccurate, but that is - * unlikely */ -time_t __vsyscall(1) vtime(time_t *t) +void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) { - unsigned seq; - time_t result; + const char *vsyscall_name; + struct task_struct *tsk; + unsigned long caller; + int vsyscall_nr; + long ret; + + /* Kernel code must never get here. */ + BUG_ON(!user_mode(regs)); + + local_irq_enable(); + + /* + * x86-ism here: regs->ip points to the instruction after the int 0xcc, + * and int 0xcc is two bytes long. + */ + if (!is_vsyscall_entry(regs->ip - 2)) { + warn_bad_vsyscall(KERN_WARNING, regs, "illegal int 0xcc (exploit attempt?)"); + goto sigsegv; + } + vsyscall_nr = vsyscall_entry_nr(regs->ip - 2); - do { - seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { + warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); + goto sigsegv; + } - result = VVAR(vsyscall_gtod_data).wall_time_sec; + tsk = current; + if (seccomp_mode(&tsk->seccomp)) + do_exit(SIGKILL); + + switch (vsyscall_nr) { + case 0: + vsyscall_name = "gettimeofday"; + ret = sys_gettimeofday( + (struct timeval __user *)regs->di, + (struct timezone __user *)regs->si); + break; + + case 1: + vsyscall_name = "time"; + ret = sys_time((time_t __user *)regs->di); + break; + + case 2: + vsyscall_name = "getcpu"; + ret = sys_getcpu((unsigned __user *)regs->di, + (unsigned __user *)regs->si, + 0); + break; + + default: + /* + * If we get here, then vsyscall_nr indicates that int 0xcc + * happened at an address in the vsyscall page that doesn't + * contain int 0xcc. That can't happen. + */ + BUG(); + } - } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); + if (ret == -EFAULT) { + /* + * Bad news -- userspace fed a bad pointer to a vsyscall. + * + * With a real vsyscall, that would have caused SIGSEGV. + * To make writing reliable exploits using the emulated + * vsyscalls harder, generate SIGSEGV here as well. + */ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + goto sigsegv; + } - if (t) - *t = result; - return result; -} + regs->ax = ret; -/* Fast way to get current CPU and node. - This helps to do per node and per CPU caches in user space. - The result is not guaranteed without CPU affinity, but usually - works out because the scheduler tries to keep a thread on the same - CPU. + /* Emulate a ret instruction. */ + regs->ip = caller; + regs->sp += 8; - tcache must point to a two element sized long array. - All arguments can be NULL. */ -long __vsyscall(2) -vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) -{ - unsigned int p; - unsigned long j = 0; - - /* Fast cache - only recompute value once per jiffies and avoid - relatively costly rdtscp/cpuid otherwise. - This works because the scheduler usually keeps the process - on the same CPU and this syscall doesn't guarantee its - results anyways. - We do this here because otherwise user space would do it on - its own in a likely inferior way (no access to jiffies). - If you don't like it pass NULL. */ - if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { - p = tcache->blob[1]; - } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { - /* Load per CPU data from RDTSCP */ - native_read_tscp(&p); - } else { - /* Load per CPU data from GDT */ - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); - } - if (tcache) { - tcache->blob[0] = j; - tcache->blob[1] = p; - } - if (cpu) - *cpu = p & 0xfff; - if (node) - *node = p >> 12; - return 0; + local_irq_disable(); + return; + +sigsegv: + regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */ + force_sig(SIGSEGV, current); } -/* Assume __initcall executes before all user space. Hopefully kmod - doesn't violate that. We'll find out if it does. */ +/* + * Assume __initcall executes before all user space. Hopefully kmod + * doesn't violate that. We'll find out if it does. + */ static void __cpuinit vsyscall_set_cpu(int cpu) { unsigned long d; @@ -221,13 +204,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu) if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) write_rdtscp_aux((node << 12) | cpu); - /* Store cpu number in limit so that it can be loaded quickly - in user space in vgetcpu. - 12 bits for the CPU and 8 bits for the node. */ + /* + * Store cpu number in limit so that it can be loaded quickly + * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node) + */ d = 0x0f40000000000ULL; d |= cpu; d |= (node & 0xf) << 12; d |= (node >> 4) << 48; + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } @@ -241,8 +226,10 @@ static int __cpuinit cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { long cpu = (long)arg; + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); + return NOTIFY_DONE; } @@ -256,21 +243,17 @@ void __init map_vsyscall(void) /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != - (unsigned long)VVAR_ADDRESS); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS); } static int __init vsyscall_init(void) { - BUG_ON(((unsigned long) &vgettimeofday != - VSYSCALL_ADDR(__NR_vgettimeofday))); - BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); - BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); - BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); + BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); + on_each_cpu(cpu_vsyscall_init, NULL, 1); /* notifier priority > KVM */ hotcpu_notifier(cpu_vsyscall_notifier, 30); + return 0; } - __initcall(vsyscall_init); diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S new file mode 100644 index 000000000000..ffa845eae5ca --- /dev/null +++ b/arch/x86/kernel/vsyscall_emu_64.S @@ -0,0 +1,27 @@ +/* + * vsyscall_emu_64.S: Vsyscall emulation page + * + * Copyright (c) 2011 Andy Lutomirski + * + * Subject to the GNU General Public License, version 2 + */ + +#include +#include + +/* The unused parts of the page are filled with 0xcc by the linker script. */ + +.section .vsyscall_0, "a" +ENTRY(vsyscall_0) + int $VSYSCALL_EMU_VECTOR +END(vsyscall_0) + +.section .vsyscall_1, "a" +ENTRY(vsyscall_1) + int $VSYSCALL_EMU_VECTOR +END(vsyscall_1) + +.section .vsyscall_2, "a" +ENTRY(vsyscall_2) + int $VSYSCALL_EMU_VECTOR +END(vsyscall_2) -- cgit v1.2.1 From c9712944b2a12373cb6ff8059afcfb7e826a6c54 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 13 Jul 2011 09:24:09 -0400 Subject: x86-64: Improve vsyscall emulation CS and RIP handling Three fixes here: - Send SIGSEGV if called from compat code or with a funny CS. - Don't BUG on impossible addresses. - Add a missing local_irq_disable. This patch also removes an unused variable. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/6fb2b13ab39b743d1e4f466eef13425854912f7f.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/vsyscall.h | 12 -------- arch/x86/kernel/vsyscall_64.c | 61 +++++++++++++++++++++++++++-------------- 2 files changed, 41 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index bb710cb0cdc1..d55597351f6a 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -31,18 +31,6 @@ extern struct timezone sys_tz; extern void map_vsyscall(void); -/* Emulation */ - -static inline bool is_vsyscall_entry(unsigned long addr) -{ - return (addr & ~0xC00UL) == VSYSCALL_START; -} - -static inline int vsyscall_entry_nr(unsigned long addr) -{ - return (addr & 0xC00UL) >> 10; -} - #endif /* __KERNEL__ */ #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 10cd8ac3395a..a262400c3479 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -97,33 +98,63 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, tsk = current; - printk("%s%s[%d] %s ip:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", level, tsk->comm, task_pid_nr(tsk), - message, regs->ip - 2, regs->sp, regs->ax, regs->si, regs->di); + message, regs->ip - 2, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); +} + +static int addr_to_vsyscall_nr(unsigned long addr) +{ + int nr; + + if ((addr & ~0xC00UL) != VSYSCALL_START) + return -EINVAL; + + nr = (addr & 0xC00UL) >> 10; + if (nr >= 3) + return -EINVAL; + + return nr; } void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) { - const char *vsyscall_name; struct task_struct *tsk; unsigned long caller; int vsyscall_nr; long ret; - /* Kernel code must never get here. */ - BUG_ON(!user_mode(regs)); - local_irq_enable(); + /* + * Real 64-bit user mode code has cs == __USER_CS. Anything else + * is bogus. + */ + if (regs->cs != __USER_CS) { + /* + * If we trapped from kernel mode, we might as well OOPS now + * instead of returning to some random address and OOPSing + * then. + */ + BUG_ON(!user_mode(regs)); + + /* Compat mode and non-compat 32-bit CS should both segfault. */ + warn_bad_vsyscall(KERN_WARNING, regs, + "illegal int 0xcc from 32-bit mode"); + goto sigsegv; + } + /* * x86-ism here: regs->ip points to the instruction after the int 0xcc, * and int 0xcc is two bytes long. */ - if (!is_vsyscall_entry(regs->ip - 2)) { - warn_bad_vsyscall(KERN_WARNING, regs, "illegal int 0xcc (exploit attempt?)"); + vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2); + if (vsyscall_nr < 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "illegal int 0xcc (exploit attempt?)"); goto sigsegv; } - vsyscall_nr = vsyscall_entry_nr(regs->ip - 2); if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)"); @@ -136,31 +167,20 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) switch (vsyscall_nr) { case 0: - vsyscall_name = "gettimeofday"; ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: - vsyscall_name = "time"; ret = sys_time((time_t __user *)regs->di); break; case 2: - vsyscall_name = "getcpu"; ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, 0); break; - - default: - /* - * If we get here, then vsyscall_nr indicates that int 0xcc - * happened at an address in the vsyscall page that doesn't - * contain int 0xcc. That can't happen. - */ - BUG(); } if (ret == -EFAULT) { @@ -188,6 +208,7 @@ void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code) sigsegv: regs->ip -= 2; /* The faulting instruction should be the int 0xcc. */ force_sig(SIGSEGV, current); + local_irq_disable(); } /* -- cgit v1.2.1 From 59e97e4d6fbcd5b74a94cb48bcbfc6f8478a5e93 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 13 Jul 2011 09:24:10 -0400 Subject: x86: Make alternative instruction pointers relative This save a few bytes on x86-64 and means that future patches can apply alternatives to unrelocated code. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/ff64a6b9a1a3860ca4a7b8b6dc7b4754f9491cd7.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative-asm.h | 4 ++-- arch/x86/include/asm/alternative.h | 8 ++++---- arch/x86/include/asm/cpufeature.h | 8 ++++---- arch/x86/kernel/alternative.c | 21 +++++++++++++-------- arch/x86/lib/copy_page_64.S | 9 +++------ arch/x86/lib/memmove_64.S | 11 +++++------ 6 files changed, 31 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 94d420b360d1..4554cc6fb96a 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -17,8 +17,8 @@ .macro altinstruction_entry orig alt feature orig_len alt_len .align 8 - .quad \orig - .quad \alt + .long \orig - . + .long \alt - . .word \feature .byte \orig_len .byte \alt_len diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index bf535f947e8c..23fb6d79f209 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -43,8 +43,8 @@ #endif struct alt_instr { - u8 *instr; /* original instruction */ - u8 *replacement; + s32 instr_offset; /* original instruction */ + s32 repl_offset; /* offset to replacement instruction */ u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction, <= instrlen */ @@ -84,8 +84,8 @@ static inline int alternatives_text_reserved(void *start, void *end) "661:\n\t" oldinstr "\n662:\n" \ ".section .altinstructions,\"a\"\n" \ _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ + " .long 661b - .\n" /* label */ \ + " .long 663f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ " .byte 662b-661b\n" /* sourcelen */ \ " .byte 664f-663f\n" /* replacementlen */ \ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 71cc3800712c..9929b35929ff 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -331,8 +331,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) "2:\n" ".section .altinstructions,\"a\"\n" _ASM_ALIGN "\n" - _ASM_PTR "1b\n" - _ASM_PTR "0\n" /* no replacement */ + " .long 1b - .\n" + " .long 0\n" /* no replacement */ " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ @@ -349,8 +349,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) "2:\n" ".section .altinstructions,\"a\"\n" _ASM_ALIGN "\n" - _ASM_PTR "1b\n" - _ASM_PTR "3f\n" + " .long 1b - .\n" + " .long 3f - .\n" " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a81f2d52f869..ddb207bb5f91 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -263,6 +263,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; + u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); @@ -276,25 +277,29 @@ void __init_or_module apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { - u8 *instr = a->instr; + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= NCAPINTS*32); if (!boot_cpu_has(a->cpuid)) continue; + + memcpy(insnbuf, replacement, a->replacementlen); + + /* 0xe8 is a relative jump; fix the offset. */ + if (*insnbuf == 0xe8 && a->replacementlen == 5) + *(s32 *)(insnbuf + 1) += replacement - instr; + + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + #ifdef CONFIG_X86_64 /* vsyscall code is not mapped yet. resolve it manually. */ if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); - DPRINTK("%s: vsyscall fixup: %p => %p\n", - __func__, a->instr, instr); } #endif - memcpy(insnbuf, a->replacement, a->replacementlen); - if (*insnbuf == 0xe8 && a->replacementlen == 5) - *(s32 *)(insnbuf + 1) += a->replacement - a->instr; - add_nops(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); text_poke_early(instr, insnbuf, a->instrlen); } } diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 6fec2d1cebe1..01c805ba5359 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -2,6 +2,7 @@ #include #include +#include ALIGN copy_page_c: @@ -110,10 +111,6 @@ ENDPROC(copy_page) 2: .previous .section .altinstructions,"a" - .align 8 - .quad copy_page - .quad 1b - .word X86_FEATURE_REP_GOOD - .byte .Lcopy_page_end - copy_page - .byte 2b - 1b + altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ + .Lcopy_page_end-copy_page, 2b-1b .previous diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index d0ec9c2936d7..ee164610ec46 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -9,6 +9,7 @@ #include #include #include +#include #undef memmove @@ -214,11 +215,9 @@ ENTRY(memmove) .previous .section .altinstructions,"a" - .align 8 - .quad .Lmemmove_begin_forward - .quad .Lmemmove_begin_forward_efs - .word X86_FEATURE_ERMS - .byte .Lmemmove_end_forward-.Lmemmove_begin_forward - .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + altinstruction_entry .Lmemmove_begin_forward, \ + .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ + .Lmemmove_end_forward-.Lmemmove_begin_forward, \ + .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs .previous ENDPROC(memmove) -- cgit v1.2.1 From 1b3f2a72bbcfdf92e368a44448c45eb639b05b5e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 13 Jul 2011 09:24:11 -0400 Subject: x86-64: Allow alternative patching in the vDSO This code is short enough and different enough from the module loader that it's not worth trying to share anything. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/e73112e4381fff29e31b882c2d0856822edaea53.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vma.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7abd2be0f9b9..c39938d1332f 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -23,11 +23,44 @@ extern unsigned short vdso_sync_cpuid; static struct page **vdso_pages; static unsigned vdso_size; +static void __init patch_vdso(void *vdso, size_t len) +{ + Elf64_Ehdr *hdr = vdso; + Elf64_Shdr *sechdrs, *alt_sec = 0; + char *secstrings; + void *alt_data; + int i; + + BUG_ON(len < sizeof(Elf64_Ehdr)); + BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0); + + sechdrs = (void *)hdr + hdr->e_shoff; + secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (i = 1; i < hdr->e_shnum; i++) { + Elf64_Shdr *shdr = &sechdrs[i]; + if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) { + alt_sec = shdr; + goto found; + } + } + + /* If we get here, it's probably a bug. */ + pr_warning("patch_vdso: .altinstructions not found\n"); + return; /* nothing to patch */ + +found: + alt_data = (void *)hdr + alt_sec->sh_offset; + apply_alternatives(alt_data, alt_data + alt_sec->sh_size); +} + static int __init init_vdso_vars(void) { int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; int i; + patch_vdso(vdso_start, vdso_end - vdso_start); + vdso_size = npages << PAGE_SHIFT; vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); if (!vdso_pages) -- cgit v1.2.1 From 7f79ad15f33cf4968cafb0e3d2beba427de01d3a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 13 Jul 2011 09:24:12 -0400 Subject: x86-64: Add --no-undefined to vDSO build This gives much nicer diagnostics when something goes wrong. It's supported at least as far back as binutils 2.15. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/de0b50920469ff6359c529526e7639fdd36fa83c.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/vdso/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index bef0bc962400..5d179502a52c 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -26,6 +26,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) export CPPFLAGS_vdso.lds += -P -C VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ + -Wl,--no-undefined \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so -- cgit v1.2.1 From 433bd805e5fd2c731b3a9025b034f066272d336e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 13 Jul 2011 09:24:13 -0400 Subject: clocksource: Replace vread with generic arch data The vread field was bloating struct clocksource everywhere except x86_64, and I want to change the way this works on x86_64, so let's split it out into per-arch data. Cc: x86@kernel.org Cc: Clemens Ladisch Cc: linux-ia64@vger.kernel.org Cc: Tony Luck Cc: Fenghua Yu Cc: John Stultz Cc: Thomas Gleixner Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/3ae5ec76a168eaaae63f08a2a1060b91aa0b7759.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/clocksource.h | 16 ++++++++++++++++ arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/tsc.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- 4 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 arch/x86/include/asm/clocksource.h (limited to 'arch/x86') diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h new file mode 100644 index 000000000000..a5df33f614c9 --- /dev/null +++ b/arch/x86/include/asm/clocksource.h @@ -0,0 +1,16 @@ +/* x86-specific clocksource additions */ + +#ifndef _ASM_X86_CLOCKSOURCE_H +#define _ASM_X86_CLOCKSOURCE_H + +#ifdef CONFIG_X86_64 + +#define __ARCH_HAS_CLOCKSOURCE_DATA + +struct arch_clocksource_data { + cycle_t (*vread)(void); +}; + +#endif /* CONFIG_X86_64 */ + +#endif /* _ASM_X86_CLOCKSOURCE_H */ diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index e9f5605e4748..0e07257bb389 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -753,7 +753,7 @@ static struct clocksource clocksource_hpet = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 - .vread = vread_hpet, + .archdata = { .vread = vread_hpet }, #endif }; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6cc6922262af..e7a74b889ab3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -777,7 +777,7 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 - .vread = vread_tsc, + .archdata = { .vread = vread_tsc }, #endif }; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index a262400c3479..12d488fd95d9 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -74,7 +74,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); /* copy vsyscall data */ - vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.vread = clock->archdata.vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = mult; -- cgit v1.2.1 From 98d0ac38ca7b1b7a552c9a2359174ff84decb600 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2011 06:47:22 -0400 Subject: x86-64: Move vread_tsc and vread_hpet into the vDSO The vsyscall page now consists entirely of trap instructions. Cc: John Stultz Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/637648f303f2ef93af93bae25186e9a1bea093f5.1310639973.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/clocksource.h | 6 ++++- arch/x86/include/asm/tsc.h | 4 --- arch/x86/include/asm/vgtod.h | 2 +- arch/x86/include/asm/vsyscall.h | 4 --- arch/x86/kernel/Makefile | 7 +---- arch/x86/kernel/alternative.c | 8 ------ arch/x86/kernel/hpet.c | 9 +------ arch/x86/kernel/tsc.c | 2 +- arch/x86/kernel/vmlinux.lds.S | 3 --- arch/x86/kernel/vread_tsc_64.c | 36 -------------------------- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/vdso/vclock_gettime.c | 53 +++++++++++++++++++++++++++++++++----- 12 files changed, 57 insertions(+), 79 deletions(-) delete mode 100644 arch/x86/kernel/vread_tsc_64.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index a5df33f614c9..3882c65dc19b 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -7,8 +7,12 @@ #define __ARCH_HAS_CLOCKSOURCE_DATA +#define VCLOCK_NONE 0 /* No vDSO clock available. */ +#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ +#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ + struct arch_clocksource_data { - cycle_t (*vread)(void); + int vclock_mode; }; #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 9db5583b6d38..83e2efd181e2 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -51,10 +51,6 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned long native_calibrate_tsc(void); -#ifdef CONFIG_X86_64 -extern cycles_t vread_tsc(void); -#endif - /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index aa5add855a91..815285bcaceb 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -13,7 +13,7 @@ struct vsyscall_gtod_data { struct timezone sys_tz; struct { /* extract of a clocksource struct */ - cycle_t (*vread)(void); + int vclock_mode; cycle_t cycle_last; cycle_t mask; u32 mult; diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d55597351f6a..60107072c28b 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -16,10 +16,6 @@ enum vsyscall_num { #ifdef __KERNEL__ #include -/* Definitions for CONFIG_GENERIC_TIME definitions */ -#define __vsyscall_fn \ - __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace - #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cc0469a65120..2deef3d2435a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -24,17 +24,12 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) -CFLAGS_vread_tsc_64.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) GCOV_PROFILE_vsyscall_64.o := n GCOV_PROFILE_hpet.o := n GCOV_PROFILE_tsc.o := n -GCOV_PROFILE_vread_tsc_64.o := n GCOV_PROFILE_paravirt.o := n -# vread_tsc_64 is hot and should be fully optimized: -CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls - obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o @@ -43,7 +38,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o +obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ddb207bb5f91..c63822816249 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -250,7 +249,6 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -extern char __vsyscall_0; void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. @@ -294,12 +292,6 @@ void __init_or_module apply_alternatives(struct alt_instr *start, add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); -#ifdef CONFIG_X86_64 - /* vsyscall code is not mapped yet. resolve it manually. */ - if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { - instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); - } -#endif text_poke_early(instr, insnbuf, a->instrlen); } } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 0e07257bb389..d10cc009845f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -738,13 +738,6 @@ static cycle_t read_hpet(struct clocksource *cs) return (cycle_t)hpet_readl(HPET_COUNTER); } -#ifdef CONFIG_X86_64 -static cycle_t __vsyscall_fn vread_hpet(void) -{ - return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); -} -#endif - static struct clocksource clocksource_hpet = { .name = "hpet", .rating = 250, @@ -753,7 +746,7 @@ static struct clocksource clocksource_hpet = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 - .archdata = { .vread = vread_hpet }, + .archdata = { .vclock_mode = VCLOCK_HPET }, #endif }; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e7a74b889ab3..56c633a5db72 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -777,7 +777,7 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 - .archdata = { .vread = vread_tsc }, + .archdata = { .vclock_mode = VCLOCK_TSC }, #endif }; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 80174719910c..4aa9c54a9b76 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -169,9 +169,6 @@ SECTIONS .vsyscall : AT(VLOAD(.vsyscall)) { *(.vsyscall_0) - . = ALIGN(L1_CACHE_BYTES); - *(.vsyscall_fn) - . = 1024; *(.vsyscall_1) diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c deleted file mode 100644 index a81aa9e9894c..000000000000 --- a/arch/x86/kernel/vread_tsc_64.c +++ /dev/null @@ -1,36 +0,0 @@ -/* This code runs in userspace. */ - -#define DISABLE_BRANCH_PROFILING -#include - -notrace cycle_t __vsyscall_fn vread_tsc(void) -{ - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. - */ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); - - last = VVAR(vsyscall_gtod_data).clock.cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; -} diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 12d488fd95d9..dda7dff9cef7 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -74,7 +74,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); /* copy vsyscall data */ - vsyscall_gtod_data.clock.vread = clock->archdata.vread; + vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = mult; diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index cf54813ac527..8792d6e0a2c3 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,43 @@ #define gtod (&VVAR(vsyscall_gtod_data)) +notrace static cycle_t vread_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +static notrace cycle_t vread_hpet(void) +{ + return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -36,9 +74,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) notrace static inline long vgetns(void) { long v; - cycles_t (*vread)(void); - vread = gtod->clock.vread; - v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask; + cycles_t cycles; + if (gtod->clock.vclock_mode == VCLOCK_TSC) + cycles = vread_tsc(); + else + cycles = vread_hpet(); + v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; return (v * gtod->clock.mult) >> gtod->clock.shift; } @@ -118,11 +159,11 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { switch (clock) { case CLOCK_REALTIME: - if (likely(gtod->clock.vread)) + if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) return do_realtime(ts); break; case CLOCK_MONOTONIC: - if (likely(gtod->clock.vread)) + if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) return do_monotonic(ts); break; case CLOCK_REALTIME_COARSE: @@ -139,7 +180,7 @@ int clock_gettime(clockid_t, struct timespec *) notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; - if (likely(gtod->clock.vread)) { + if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) { if (likely(tv != NULL)) { BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != offsetof(struct timespec, tv_nsec) || -- cgit v1.2.1 From 8c400f6ce068366bc3517f1036bb99169cfec9cd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 18 Jul 2011 21:10:54 +0200 Subject: x86, vdso: Drop now wrong comment Now that 1b3f2a72bbcfdf92e368a44448c45eb639b05b5e is in, it is very important that the below lying comment be removed! :-) Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/20110718191054.GA18359@liondog.tnic Acked-by: Andy Lutomirski Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vclock_gettime.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 8792d6e0a2c3..6bc0e723b6e8 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -6,7 +6,6 @@ * * The code should have no internal unresolved relocations. * Check with readelf after changing. - * Also alternative() doesn't work. */ /* Disable profiling for userspace code: */ -- cgit v1.2.1 From ae7bd11b471931752e5609094ca0a49386590524 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 21 Jul 2011 13:34:05 -0700 Subject: clocksource: Change __ARCH_HAS_CLOCKSOURCE_DATA to a CONFIG option The machinery for __ARCH_HAS_CLOCKSOURCE_DATA assumed a file in asm-generic would be the default for architectures without their own file in asm/, but that is not how it works. Replace it with a Kconfig option instead. Link: http://lkml.kernel.org/r/4E288AA6.7090804@zytor.com Signed-off-by: H. Peter Anvin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Tony Luck --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/clocksource.h | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index da349723d411..c1e41bccdcb8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,10 @@ config CLOCKSOURCE_WATCHDOG config GENERIC_CLOCKEVENTS def_bool y +config ARCH_CLOCKSOURCE_DATA + def_bool y + depends on X86_64 + config GENERIC_CLOCKEVENTS_BROADCAST def_bool y depends on X86_64 || (X86_32 && X86_LOCAL_APIC) diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h index 3882c65dc19b..0bdbbb3b9ce7 100644 --- a/arch/x86/include/asm/clocksource.h +++ b/arch/x86/include/asm/clocksource.h @@ -5,8 +5,6 @@ #ifdef CONFIG_X86_64 -#define __ARCH_HAS_CLOCKSOURCE_DATA - #define VCLOCK_NONE 0 /* No vDSO clock available. */ #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ #define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ -- cgit v1.2.1 From aafade242ff24fac3aabf61c7861dfa44a3c2445 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 21 Jul 2011 15:47:10 -0400 Subject: x86-64, vdso: Do not allocate memory for the vDSO We can map the vDSO straight from kernel data, saving a few page allocations. As an added bonus, the deleted code contained a memory leak. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/2c4ed5c2c2e93603790229e0c3403ae506ccc0cb.1311277573.git.luto@mit.edu Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso.S | 15 +++++++++++++-- arch/x86/vdso/vma.c | 25 ++++++------------------- 2 files changed, 19 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index 1d3aa6b87181..1b979c12ba85 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S @@ -1,10 +1,21 @@ +#include +#include #include -__INITDATA +__PAGE_ALIGNED_DATA .globl vdso_start, vdso_end + .align PAGE_SIZE vdso_start: .incbin "arch/x86/vdso/vdso.so" vdso_end: -__FINIT +.previous + + .globl vdso_pages + .bss + .align 8 + .type vdso_pages, @object +vdso_pages: + .zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8 + .size vdso_pages, .-vdso_pages diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index c39938d1332f..316fbca3490e 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -14,13 +14,14 @@ #include #include #include +#include unsigned int __read_mostly vdso_enabled = 1; extern char vdso_start[], vdso_end[]; extern unsigned short vdso_sync_cpuid; -static struct page **vdso_pages; +extern struct page *vdso_pages[]; static unsigned vdso_size; static void __init patch_vdso(void *vdso, size_t len) @@ -54,7 +55,7 @@ found: apply_alternatives(alt_data, alt_data + alt_sec->sh_size); } -static int __init init_vdso_vars(void) +static int __init init_vdso(void) { int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; int i; @@ -62,26 +63,12 @@ static int __init init_vdso_vars(void) patch_vdso(vdso_start, vdso_end - vdso_start); vdso_size = npages << PAGE_SHIFT; - vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); - if (!vdso_pages) - goto oom; - for (i = 0; i < npages; i++) { - struct page *p; - p = alloc_page(GFP_KERNEL); - if (!p) - goto oom; - vdso_pages[i] = p; - copy_page(page_address(p), vdso_start + i*PAGE_SIZE); - } + for (i = 0; i < npages; i++) + vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE); return 0; - - oom: - printk("Cannot allocate vdso\n"); - vdso_enabled = 0; - return -ENOMEM; } -subsys_initcall(init_vdso_vars); +subsys_initcall(init_vdso); struct linux_binprm; -- cgit v1.2.1