diff options
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/Makefile | 1 | ||||
-rw-r--r-- | arch/x86/kernel/entry_64.S | 73 | ||||
-rw-r--r-- | arch/x86/kernel/espfix_64.c | 208 | ||||
-rw-r--r-- | arch/x86/kernel/ldt.c | 11 | ||||
-rw-r--r-- | arch/x86/kernel/smpboot.c | 7 |
5 files changed, 285 insertions, 15 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f4d96000d33a..1cc3789d99d9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_X86_64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c3628bf2..bffaa986cafc 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -58,6 +58,7 @@ #include <asm/asm.h> #include <asm/context_tracking.h> #include <asm/smap.h> +#include <asm/pgtable_types.h> #include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ @@ -1040,8 +1041,16 @@ restore_args: RESTORE_ARGS 1,8,1 irq_return: + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. + */ + testb $4,(SS-RIP)(%rsp) + jnz irq_return_ldt + +irq_return_iret: INTERRUPT_RETURN - _ASM_EXTABLE(irq_return, bad_iret) + _ASM_EXTABLE(irq_return_iret, bad_iret) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) @@ -1049,6 +1058,30 @@ ENTRY(native_iret) _ASM_EXTABLE(native_iret, bad_iret) #endif +irq_return_ldt: + pushq_cfi %rax + pushq_cfi %rdi + SWAPGS + movq PER_CPU_VAR(espfix_waddr),%rdi + movq %rax,(0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp),%rax /* RIP */ + movq %rax,(1*8)(%rdi) + movq (3*8)(%rsp),%rax /* CS */ + movq %rax,(2*8)(%rdi) + movq (4*8)(%rsp),%rax /* RFLAGS */ + movq %rax,(3*8)(%rdi) + movq (6*8)(%rsp),%rax /* SS */ + movq %rax,(5*8)(%rdi) + movq (5*8)(%rsp),%rax /* RSP */ + movq %rax,(4*8)(%rdi) + andl $0xffff0000,%eax + popq_cfi %rdi + orq PER_CPU_VAR(espfix_stack),%rax + SWAPGS + movq %rax,%rsp + popq_cfi %rax + jmp irq_return_iret + .section .fixup,"ax" bad_iret: /* @@ -1110,9 +1143,41 @@ ENTRY(retint_kernel) call preempt_schedule_irq jmp exit_intr #endif - CFI_ENDPROC END(common_interrupt) + + /* + * If IRET takes a fault on the espfix stack, then we + * end up promoting it to a doublefault. In that case, + * modify the stack to make it look like we just entered + * the #GP handler from user space, similar to bad_iret. + */ + ALIGN +__do_double_fault: + XCPT_FRAME 1 RDI+8 + movq RSP(%rdi),%rax /* Trap on the espfix stack? */ + sarq $PGDIR_SHIFT,%rax + cmpl $ESPFIX_PGD_ENTRY,%eax + jne do_double_fault /* No, just deliver the fault */ + cmpl $__KERNEL_CS,CS(%rdi) + jne do_double_fault + movq RIP(%rdi),%rax + cmpq $irq_return_iret,%rax +#ifdef CONFIG_PARAVIRT + je 1f + cmpq $native_iret,%rax +#endif + jne do_double_fault /* This shouldn't happen... */ +1: + movq PER_CPU_VAR(kernel_stack),%rax + subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ + movq %rax,RSP(%rdi) + movq $0,(%rax) /* Missing (lost) #GP error code */ + movq $general_protection,RIP(%rdi) + retq + CFI_ENDPROC +END(__do_double_fault) + /* * End of kprobes section */ @@ -1314,7 +1379,7 @@ zeroentry overflow do_overflow zeroentry bounds do_bounds zeroentry invalid_op do_invalid_op zeroentry device_not_available do_device_not_available -paranoiderrorentry double_fault do_double_fault +paranoiderrorentry double_fault __do_double_fault zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun errorentry invalid_TSS do_invalid_TSS errorentry segment_not_present do_segment_not_present @@ -1601,7 +1666,7 @@ error_sti: */ error_kernelspace: incl %ebx - leaq irq_return(%rip),%rcx + leaq irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs movl %ecx,%eax /* zero extend */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c new file mode 100644 index 000000000000..8a64da36310f --- /dev/null +++ b/arch/x86/kernel/espfix_64.c @@ -0,0 +1,208 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2014 Intel Corporation; author: H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * ----------------------------------------------------------------------- */ + +/* + * The IRET instruction, when returning to a 16-bit segment, only + * restores the bottom 16 bits of the user space stack pointer. This + * causes some 16-bit software to break, but it also leaks kernel state + * to user space. + * + * This works around this by creating percpu "ministacks", each of which + * is mapped 2^16 times 64K apart. When we detect that the return SS is + * on the LDT, we copy the IRET frame to the ministack and use the + * relevant alias to return to userspace. The ministacks are mapped + * readonly, so if the IRET fault we promote #GP to #DF which is an IST + * vector and thus has its own stack; we then do the fixup in the #DF + * handler. + * + * This file sets up the ministacks and the related page tables. The + * actual ministack invocation is in entry_64.S. + */ + +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/random.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/setup.h> + +/* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round + * it up to a cache line to avoid unnecessary sharing. + */ +#define ESPFIX_STACK_SIZE (8*8UL) +#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) + +/* There is address space for how many espfix pages? */ +#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) + +#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) +#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS +# error "Need more than one PGD for the ESPFIX hack" +#endif + +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) + +/* This contains the *bottom* address of the espfix stack */ +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +/* Initialization mutex - should this be a spinlock? */ +static DEFINE_MUTEX(espfix_init_mutex); + +/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */ +#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE) +static void *espfix_pages[ESPFIX_MAX_PAGES]; + +static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD] + __aligned(PAGE_SIZE); + +static unsigned int page_random, slot_random; + +/* + * This returns the bottom address of the espfix stack for a specific CPU. + * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case + * we have to account for some amount of padding at the end of each page. + */ +static inline unsigned long espfix_base_addr(unsigned int cpu) +{ + unsigned long page, slot; + unsigned long addr; + + page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; + slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; + addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); + addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); + addr += ESPFIX_BASE_ADDR; + return addr; +} + +#define PTE_STRIDE (65536/PAGE_SIZE) +#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) +#define ESPFIX_PMD_CLONES PTRS_PER_PMD +#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) + +#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) + +static void init_espfix_random(void) +{ + unsigned long rand; + + /* + * This is run before the entropy pools are initialized, + * but this is hopefully better than nothing. + */ + if (!arch_get_random_long(&rand)) { + /* The constant is an arbitrary large prime */ + rdtscll(rand); + rand *= 0xc345c6b72fd16123UL; + } + + slot_random = rand % ESPFIX_STACKS_PER_PAGE; + page_random = (rand / ESPFIX_STACKS_PER_PAGE) + & (ESPFIX_PAGE_SPACE - 1); +} + +void __init init_espfix_bsp(void) +{ + pgd_t *pgd_p; + pteval_t ptemask; + + ptemask = __supported_pte_mask; + + /* Install the espfix pud into the kernel page directory */ + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + + /* Randomize the locations */ + init_espfix_random(); + + /* The rest is the same as for any other processor */ + init_espfix_ap(); +} + +void init_espfix_ap(void) +{ + unsigned int cpu, page; + unsigned long addr; + pud_t pud, *pud_p; + pmd_t pmd, *pmd_p; + pte_t pte, *pte_p; + int n; + void *stack_page; + pteval_t ptemask; + + /* We only have to do this once... */ + if (likely(this_cpu_read(espfix_stack))) + return; /* Already initialized */ + + cpu = smp_processor_id(); + addr = espfix_base_addr(cpu); + page = cpu/ESPFIX_STACKS_PER_PAGE; + + /* Did another CPU already set this up? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (likely(stack_page)) + goto done; + + mutex_lock(&espfix_init_mutex); + + /* Did we race on the lock? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (stack_page) + goto unlock_done; + + ptemask = __supported_pte_mask; + + pud_p = &espfix_pud_page[pud_index(addr)]; + pud = *pud_p; + if (!pud_present(pud)) { + pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); + pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PUD_CLONES; n++) + set_pud(&pud_p[n], pud); + } + + pmd_p = pmd_offset(&pud, addr); + pmd = *pmd_p; + if (!pmd_present(pmd)) { + pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); + pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PMD_CLONES; n++) + set_pmd(&pmd_p[n], pmd); + } + + pte_p = pte_offset_kernel(&pmd, addr); + stack_page = (void *)__get_free_page(GFP_KERNEL); + pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PTE_CLONES; n++) + set_pte(&pte_p[n*PTE_STRIDE], pte); + + /* Job is done for this CPU and any CPU which shares this page */ + ACCESS_ONCE(espfix_pages[page]) = stack_page; + +unlock_done: + mutex_unlock(&espfix_init_mutex); +done: + this_cpu_write(espfix_stack, addr); + this_cpu_write(espfix_waddr, (unsigned long)stack_page + + (addr & ~PAGE_MASK)); +} diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index af1d14a9ebda..ebc987398923 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -229,17 +229,6 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } } - /* - * On x86-64 we do not support 16-bit segments due to - * IRET leaking the high bits of the kernel stack address. - */ -#ifdef CONFIG_X86_64 - if (!ldt_info.seg_32bit) { - error = -EINVAL; - goto out_unlock; - } -#endif - fill_ldt(&ldt, &ldt_info); if (oldmode) ldt.avl = 0; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 34826934d4a7..61a5350850fb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -244,6 +244,13 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_64 + init_espfix_ap(); +#endif + + /* * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. |