From 0a15584d72760a3b83d97af85d37ffaa2c42068d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Apr 2015 16:10:07 -0700 Subject: x86, selftests: Add single_step_syscall test This is a very simple test that makes system calls with TF set. This test currently fails when running the 32-bit build on a 64-bit kernel on an Intel CPU. This bug will be fixed by the next commit. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Oleg Nesterov Cc: Shuah Khan Link: http://lkml.kernel.org/r/20e68021155f6ab5c60590dcad81d37c68ea2c4f.1429139075.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/run_x86_tests.sh | 2 + tools/testing/selftests/x86/single_step_syscall.c | 181 ++++++++++++++++++++++ 3 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/single_step_syscall.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index f0a7918178dd..ddf63569df5a 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -1,6 +1,6 @@ .PHONY: all all_32 all_64 check_build32 clean run_tests -TARGETS_C_BOTHBITS := sigreturn +TARGETS_C_BOTHBITS := sigreturn single_step_syscall BINARIES_32 := $(TARGETS_C_BOTHBITS:%=%_32) BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64) diff --git a/tools/testing/selftests/x86/run_x86_tests.sh b/tools/testing/selftests/x86/run_x86_tests.sh index 3d3ec65f3e7c..3fc19b376812 100644 --- a/tools/testing/selftests/x86/run_x86_tests.sh +++ b/tools/testing/selftests/x86/run_x86_tests.sh @@ -3,9 +3,11 @@ # This is deliberately minimal. IMO kselftests should provide a standard # script here. ./sigreturn_32 || exit 1 +./single_step_syscall_32 || exit 1 if [[ "$uname -p" -eq "x86_64" ]]; then ./sigreturn_64 || exit 1 + ./single_step_syscall_64 || exit 1 fi exit 0 diff --git a/tools/testing/selftests/x86/single_step_syscall.c b/tools/testing/selftests/x86/single_step_syscall.c new file mode 100644 index 000000000000..50c26358e8b7 --- /dev/null +++ b/tools/testing/selftests/x86/single_step_syscall.c @@ -0,0 +1,181 @@ +/* + * single_step_syscall.c - single-steps various x86 syscalls + * Copyright (c) 2014-2015 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * This is a very simple series of tests that makes system calls with + * the TF flag set. This exercises some nasty kernel code in the + * SYSENTER case: SYSENTER does not clear TF, so SYSENTER with TF set + * immediately issues #DB from CPL 0. This requires special handling in + * the kernel. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static volatile sig_atomic_t sig_traps; + +#ifdef __x86_64__ +# define REG_IP REG_RIP +# define WIDTH "q" +#else +# define REG_IP REG_EIP +# define WIDTH "l" +#endif + +static unsigned long get_eflags(void) +{ + unsigned long eflags; + asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); + return eflags; +} + +static void set_eflags(unsigned long eflags) +{ + asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH + : : "rm" (eflags) : "flags"); +} + +#define X86_EFLAGS_TF (1UL << 8) + +static void sigtrap(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + if (get_eflags() & X86_EFLAGS_TF) { + set_eflags(get_eflags() & ~X86_EFLAGS_TF); + printf("[WARN]\tSIGTRAP handler had TF set\n"); + _exit(1); + } + + sig_traps++; + + if (sig_traps == 10000 || sig_traps == 10001) { + printf("[WARN]\tHit %d SIGTRAPs with si_addr 0x%lx, ip 0x%lx\n", + (int)sig_traps, + (unsigned long)info->si_addr, + (unsigned long)ctx->uc_mcontext.gregs[REG_IP]); + } +} + +static void check_result(void) +{ + unsigned long new_eflags = get_eflags(); + set_eflags(new_eflags & ~X86_EFLAGS_TF); + + if (!sig_traps) { + printf("[FAIL]\tNo SIGTRAP\n"); + exit(1); + } + + if (!(new_eflags & X86_EFLAGS_TF)) { + printf("[FAIL]\tTF was cleared\n"); + exit(1); + } + + printf("[OK]\tSurvived with TF set and %d traps\n", (int)sig_traps); + sig_traps = 0; +} + +int main() +{ + int tmp; + + sethandler(SIGTRAP, sigtrap, 0); + + printf("[RUN]\tSet TF and check nop\n"); + set_eflags(get_eflags() | X86_EFLAGS_TF); + asm volatile ("nop"); + check_result(); + +#ifdef __x86_64__ + printf("[RUN]\tSet TF and check syscall-less opportunistic sysret\n"); + set_eflags(get_eflags() | X86_EFLAGS_TF); + extern unsigned char post_nop[]; + asm volatile ("pushf" WIDTH "\n\t" + "pop" WIDTH " %%r11\n\t" + "nop\n\t" + "post_nop:" + : : "c" (post_nop) : "r11"); + check_result(); +#endif + + printf("[RUN]\tSet TF and check int80\n"); + set_eflags(get_eflags() | X86_EFLAGS_TF); + asm volatile ("int $0x80" : "=a" (tmp) : "a" (SYS_getpid)); + check_result(); + + /* + * This test is particularly interesting if fast syscalls use + * SYSENTER: it triggers a nasty design flaw in SYSENTER. + * Specifically, SYSENTER does not clear TF, so either SYSENTER + * or the next instruction traps at CPL0. (Of course, Intel + * mostly forgot to document exactly what happens here.) So we + * get a CPL0 fault with usergs (on 64-bit kernels) and possibly + * no stack. The only sane way the kernel can possibly handle + * it is to clear TF on return from the #DB handler, but this + * happens way too early to set TF in the saved pt_regs, so the + * kernel has to do something clever to avoid losing track of + * the TF bit. + * + * Needless to say, we've had bugs in this area. + */ + syscall(SYS_getpid); /* Force symbol binding without TF set. */ + printf("[RUN]\tSet TF and check a fast syscall\n"); + set_eflags(get_eflags() | X86_EFLAGS_TF); + syscall(SYS_getpid); + check_result(); + + /* Now make sure that another fast syscall doesn't set TF again. */ + printf("[RUN]\tFast syscall with TF cleared\n"); + fflush(stdout); /* Force a syscall */ + if (get_eflags() & X86_EFLAGS_TF) { + printf("[FAIL]\tTF is now set\n"); + exit(1); + } + if (sig_traps) { + printf("[FAIL]\tGot SIGTRAP\n"); + exit(1); + } + printf("[OK]\tNothing unexpected happened\n"); + + return 0; +} -- cgit v1.2.1 From fd0f86b66425bd8c6af8985881e82b28c30fd450 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 16 Apr 2015 00:40:25 -0700 Subject: x86/ptrace: Fix the TIF_FORCED_TF logic in handle_signal() When the TIF_SINGLESTEP tracee dequeues a signal, handle_signal() clears TIF_FORCED_TF and X86_EFLAGS_TF but leaves TIF_SINGLESTEP set. If the tracer does PTRACE_SINGLESTEP again, enable_single_step() sets X86_EFLAGS_TF but not TIF_FORCED_TF. This means that the subsequent PTRACE_CONT doesn't not clear X86_EFLAGS_TF, and the tracee gets the wrong SIGTRAP. Test-case (needs -O2 to avoid prologue insns in signal handler): #include #include #include #include #include #include #include void handler(int n) { asm("nop"); } int child(void) { assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0); signal(SIGALRM, handler); kill(getpid(), SIGALRM); return 0x23; } void *getip(int pid) { return (void*)ptrace(PTRACE_PEEKUSER, pid, offsetof(struct user, regs.rip), 0); } int main(void) { int pid, status; pid = fork(); if (!pid) return child(); assert(wait(&status) == pid); assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGALRM); assert(ptrace(PTRACE_SINGLESTEP, pid, 0, SIGALRM) == 0); assert(wait(&status) == pid); assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP); assert((getip(pid) - (void*)handler) == 0); assert(ptrace(PTRACE_SINGLESTEP, pid, 0, SIGALRM) == 0); assert(wait(&status) == pid); assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP); assert((getip(pid) - (void*)handler) == 1); assert(ptrace(PTRACE_CONT, pid, 0,0) == 0); assert(wait(&status) == pid); assert(WIFEXITED(status) && WEXITSTATUS(status) == 0x23); return 0; } The last assert() fails because PTRACE_CONT wrongly triggers another single-step and X86_EFLAGS_TF can't be cleared by debugger until the tracee does sys_rt_sigreturn(). Change handle_signal() to do user_disable_single_step() if stepping, we do not need to preserve TIF_SINGLESTEP because we are going to do ptrace_notify(), and it is simply wrong to leak this bit. While at it, change the comment to explain why we also need to clear TF unconditionally after setup_rt_frame(). Note: in the longer term we should probably change setup_sigcontext() to use get_flags() and then just remove this user_disable_single_step(). And, the state of TIF_FORCED_TF can be wrong after restore_sigcontext() which can set/clear TF, this needs another fix. This fix fixes the 'single_step_syscall_32' testcase in the x86 testsuite: Before: ~/linux/tools/testing/selftests/x86> ./single_step_syscall_32 [RUN] Set TF and check nop [OK] Survived with TF set and 9 traps [RUN] Set TF and check int80 [OK] Survived with TF set and 9 traps [RUN] Set TF and check a fast syscall [WARN] Hit 10000 SIGTRAPs with si_addr 0xf7789cc0, ip 0xf7789cc0 Trace/breakpoint trap (core dumped) After: ~/linux/linux/tools/testing/selftests/x86> ./single_step_syscall_32 [RUN] Set TF and check nop [OK] Survived with TF set and 9 traps [RUN] Set TF and check int80 [OK] Survived with TF set and 9 traps [RUN] Set TF and check a fast syscall [OK] Survived with TF set and 39 traps [RUN] Fast syscall with TF cleared [OK] Nothing unexpected happened Reported-by: Evan Teran Reported-by: Pedro Alves Tested-by: Andres Freund Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner [ Added x86 self-test info. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 3e581865c8e2..d185bdd95a4b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -630,7 +630,8 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { - bool failed; + bool stepping, failed; + /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ @@ -654,12 +655,13 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) } /* - * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF - * flag so that register information in the sigcontext is correct. + * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now + * so that register information in the sigcontext is correct and + * then notify the tracer before entering the signal handler. */ - if (unlikely(regs->flags & X86_EFLAGS_TF) && - likely(test_and_clear_thread_flag(TIF_FORCED_TF))) - regs->flags &= ~X86_EFLAGS_TF; + stepping = test_thread_flag(TIF_SINGLESTEP); + if (stepping) + user_disable_single_step(current); failed = (setup_rt_frame(ksig, regs) < 0); if (!failed) { @@ -670,10 +672,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * it might disable possible debug exception from the * signal handler. * - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. + * Clear TF for the case when it wasn't set by debugger to + * avoid the recursive send_sigtrap() in SIGTRAP handler. */ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); /* @@ -682,7 +682,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) if (used_math()) fpu_reset_state(current); } - signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); + signal_setup_done(failed, ksig, stepping); } #ifdef CONFIG_X86_32 -- cgit v1.2.1 From 18ecb3bfa5a9f6fffbb3eeb4369f0b9463438ec0 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 16 Apr 2015 20:41:37 +0200 Subject: x86/fpu: Load xsave pointer *after* initialization So I was playing with gdb today and did this simple thing: gdb /bin/ls ... (gdb) run Box exploded with this splat: BUG: unable to handle kernel NULL pointer dereference at 00000000000001d0 IP: [] xstateregs_get+0x7a/0x120 [...] Call Trace: ptrace_regset ptrace_request ? wait_task_inactive ? preempt_count_sub arch_ptrace ? ptrace_get_task_struct SyS_ptrace system_call_fastpath ... because we do cache &target->thread.fpu.state->xsave into the local variable xsave but that pointer is NULL at that time and it gets initialized later, in init_fpu(), see: e7f180dcd8ab ("x86/fpu: Change xstateregs_get()/set() to use ->xsave.i387 rather than ->fxsave") The fix is simple: load xsave *after* init_fpu() has run. Also do the same in xstateregs_set(), as suggested by Oleg Nesterov. Signed-off-by: Borislav Petkov Acked-by: Oleg Nesterov Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rik van Riel Cc: Tavis Ormandy Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1429209697-5902-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 367f39d35e9c..009183276bb7 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -341,7 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - struct xsave_struct *xsave = &target->thread.fpu.state->xsave; + struct xsave_struct *xsave; int ret; if (!cpu_has_xsave) @@ -351,6 +351,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + xsave = &target->thread.fpu.state->xsave; + /* * Copy the 48bytes defined by the software first into the xstate * memory layout in the thread struct, so that we can copy the entire @@ -369,7 +371,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct xsave_struct *xsave = &target->thread.fpu.state->xsave; + struct xsave_struct *xsave; int ret; if (!cpu_has_xsave) @@ -379,6 +381,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + xsave = &target->thread.fpu.state->xsave; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. -- cgit v1.2.1 From a6dfa128ce5c414ab46b1d690f7a1b8decb8526d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 17 Apr 2015 15:04:48 -0400 Subject: config: Enable NEED_DMA_MAP_STATE by default when SWIOTLB is selected A huge amount of NIC drivers use the DMA API, however if compiled under 32-bit an very important part of the DMA API can be ommitted leading to the drivers not working at all (especially if used with 'swiotlb=force iommu=soft'). As Prashant Sreedharan explains it: "the driver [tg3] uses DEFINE_DMA_UNMAP_ADDR(), dma_unmap_addr_set() to keep a copy of the dma "mapping" and dma_unmap_addr() to get the "mapping" value. On most of the platforms this is a no-op, but ... with "iommu=soft and swiotlb=force" this house keeping is required, ... otherwise we pass 0 while calling pci_unmap_/pci_dma_sync_ instead of the DMA address." As such enable this even when using 32-bit kernels. Reported-by: Ian Jackson Signed-off-by: Konrad Rzeszutek Wilk Acked-by: David S. Miller Acked-by: Prashant Sreedharan Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Michael Chan Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: cascardo@linux.vnet.ibm.com Cc: david.vrabel@citrix.com Cc: sanjeevb@broadcom.com Cc: siva.kallam@broadcom.com Cc: vyasevich@gmail.com Cc: xen-devel@lists.xensource.com Link: http://lkml.kernel.org/r/20150417190448.GA9462@l.oracle.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index faff6934c05a..eb79036e3503 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -177,7 +177,7 @@ config SBUS config NEED_DMA_MAP_STATE def_bool y - depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG + depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB config NEED_SG_DMA_LENGTH def_bool y -- cgit v1.2.1