Diffstat (limited to 'arch')
54 files changed, 394 insertions, 185 deletions
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 6984342da13d..61d96a645ff3 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -98,6 +98,11 @@ static void kvm_flush_dcache_pud(pud_t pud) __kvm_flush_dcache_pud(pud); } +static bool kvm_is_device_pfn(unsigned long pfn) +{ + return !pfn_valid(pfn); +} + /** * stage2_dissolve_pmd() - clear and flush huge PMD entry * @kvm: pointer to kvm structure. @@ -213,7 +218,7 @@ static void unmap_ptes(struct kvm *kvm, pmd_t *pmd, kvm_tlb_flush_vmid_ipa(kvm, addr); /* No need to invalidate the cache for device mappings */ - if ((pte_val(old_pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE) + if (!kvm_is_device_pfn(pte_pfn(old_pte))) kvm_flush_dcache_pte(old_pte); put_page(virt_to_page(pte)); @@ -305,8 +310,7 @@ static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd, pte = pte_offset_kernel(pmd, addr); do { - if (!pte_none(*pte) && - (pte_val(*pte) & PAGE_S2_DEVICE) != PAGE_S2_DEVICE) + if (!pte_none(*pte) && !kvm_is_device_pfn(pte_pfn(*pte))) kvm_flush_dcache_pte(*pte); } while (pte++, addr += PAGE_SIZE, addr != end); } @@ -1037,11 +1041,6 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu) return kvm_vcpu_dabt_iswrite(vcpu); } -static bool kvm_is_device_pfn(unsigned long pfn) -{ - return !pfn_valid(pfn); -} - /** * stage2_wp_ptes - write protect PMD range * @pmd: pointer to pmd entry diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index b8efb8cd1f73..4d25fd0fae10 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -182,19 +182,6 @@ static inline int mem_words_used(struct jit_ctx *ctx) return fls(ctx->seen & SEEN_MEM); } -static inline bool is_load_to_a(u16 inst) -{ - switch (inst) { - case BPF_LD | BPF_W | BPF_LEN: - case BPF_LD | BPF_W | BPF_ABS: - case BPF_LD | BPF_H | BPF_ABS: - case BPF_LD | BPF_B | BPF_ABS: - return true; - default: - return false; - } -} - static void jit_fill_hole(void *area, unsigned int size) { u32 *ptr; @@ -206,7 +193,6 @@ static void jit_fill_hole(void *area, unsigned int size) static void build_prologue(struct jit_ctx *ctx) { u16 reg_set = saved_regs(ctx); - u16 first_inst = ctx->skf->insns[0].code; u16 off; #ifdef CONFIG_FRAME_POINTER @@ -236,7 +222,7 @@ static void build_prologue(struct jit_ctx *ctx) emit(ARM_MOV_I(r_X, 0), ctx); /* do not leak kernel data to userspace */ - if ((first_inst != (BPF_RET | BPF_K)) && !(is_load_to_a(first_inst))) + if (bpf_needs_clear_a(&ctx->skf->insns[0])) emit(ARM_MOV_I(r_A, 0), ctx); /* stack space for the BPF_MEM words */ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 07d1811aa03f..a92266e634cd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -311,6 +311,27 @@ config ARM64_ERRATUM_832075 If unsure, say Y. +config ARM64_ERRATUM_834220 + bool "Cortex-A57: 834220: Stage 2 translation fault might be incorrectly reported in presence of a Stage 1 fault" + depends on KVM + default y + help + This option adds an alternative code sequence to work around ARM + erratum 834220 on Cortex-A57 parts up to r1p2. + + Affected Cortex-A57 parts might report a Stage 2 translation + fault as a the result of a Stage 1 fault for a load crossing + a page boundary when there is a Stage 1 permission or device + memory alignment fault and a Stage 2 translation fault + + The workaround is to verify that the Stage-1 translation + doesn't generate a fault before handling the Stage-2 fault. 
+ Please note that this does not necessarily enable the workaround, + as it depends on the alternative framework, which will only patch + the kernel if an affected CPU is detected. + + If unsure, say Y. + config ARM64_ERRATUM_845719 bool "Cortex-A53: 845719: a load might read incorrect data" depends on COMPAT diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h index b3b5c4ae3800..af5b9d5c5c23 100644 --- a/arch/arm64/include/asm/atomic_ll_sc.h +++ b/arch/arm64/include/asm/atomic_ll_sc.h @@ -211,7 +211,7 @@ __CMPXCHG_CASE( , , mb_8, dmb ish, l, "memory") #undef __CMPXCHG_CASE #define __CMPXCHG_DBL(name, mb, rel, cl) \ -__LL_SC_INLINE int \ +__LL_SC_INLINE long \ __LL_SC_PREFIX(__cmpxchg_double##name(unsigned long old1, \ unsigned long old2, \ unsigned long new1, \ diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h index 55d740e63459..4d548aa54b21 100644 --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -348,7 +348,7 @@ __CMPXCHG_CASE(x, , mb_8, al, "memory") #define __LL_SC_CMPXCHG_DBL(op) __LL_SC_CALL(__cmpxchg_double##op) #define __CMPXCHG_DBL(name, mb, cl...) \ -static inline int __cmpxchg_double##name(unsigned long old1, \ +static inline long __cmpxchg_double##name(unsigned long old1, \ unsigned long old2, \ unsigned long new1, \ unsigned long new2, \ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 171570702bb8..a1a5981526fe 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -27,8 +27,9 @@ #define ARM64_HAS_SYSREG_GIC_CPUIF 3 #define ARM64_HAS_PAN 4 #define ARM64_HAS_LSE_ATOMICS 5 +#define ARM64_WORKAROUND_834220 6 -#define ARM64_NCAPS 6 +#define ARM64_NCAPS 7 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 17e92f05b1fe..3ca894ecf699 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -99,11 +99,13 @@ static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) *vcpu_cpsr(vcpu) |= COMPAT_PSR_T_BIT; } +/* + * vcpu_reg should always be passed a register number coming from a + * read of ESR_EL2. Otherwise, it may give the wrong result on AArch32 + * with banked registers. 
+ */ static inline unsigned long *vcpu_reg(const struct kvm_vcpu *vcpu, u8 reg_num) { - if (vcpu_mode_is_32bit(vcpu)) - return vcpu_reg32(vcpu, reg_num); - return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.regs[reg_num]; } diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 6ffd91438560..dc0df822def3 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -74,6 +74,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = { (1 << MIDR_VARIANT_SHIFT) | 2), }, #endif +#ifdef CONFIG_ARM64_ERRATUM_834220 + { + /* Cortex-A57 r0p0 - r1p2 */ + .desc = "ARM erratum 834220", + .capability = ARM64_WORKAROUND_834220, + MIDR_RANGE(MIDR_CORTEX_A57, 0x00, + (1 << MIDR_VARIANT_SHIFT) | 2), + }, +#endif #ifdef CONFIG_ARM64_ERRATUM_845719 { /* Cortex-A53 r0p[01234] */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 90d09eddd5b2..b84ef8376471 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -524,9 +524,14 @@ CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems #endif /* EL2 debug */ + mrs x0, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx x0, x0, #8, #4 + cmp x0, #1 + b.lt 4f // Skip if no PMU present mrs x0, pmcr_el0 // Disable debug access traps ubfx x0, x0, #11, #5 // to EL2 and allow access to msr mdcr_el2, x0 // all PMU counters from EL1 +4: /* Stage-2 translation */ msr vttbr_el2, xzr diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index f9a74d4fff3b..f325af5b38e3 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -1159,9 +1159,6 @@ static void armv8pmu_reset(void *info) /* Initialize & Reset PMNC: C and P bits. */ armv8pmu_pmcr_write(ARMV8_PMCR_P | ARMV8_PMCR_C); - - /* Disable access from userspace. */ - asm volatile("msr pmuserenr_el0, %0" :: "r" (0)); } static int armv8_pmuv3_map_event(struct perf_event *event) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 1971f491bb90..ff7f13239515 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -58,6 +58,12 @@ */ void ptrace_disable(struct task_struct *child) { + /* + * This would be better off in core code, but PTRACE_DETACH has + * grown its fair share of arch-specific worts and changing it + * is likely to cause regressions on obscure architectures. + */ + user_disable_single_step(child); } #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 232247945b1c..9f17dc72645a 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -558,6 +558,10 @@ static int c_show(struct seq_file *m, void *v) */ seq_printf(m, "processor\t: %d\n", i); + seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", + loops_per_jiffy / (500000UL/HZ), + loops_per_jiffy / (5000UL/HZ) % 100); + /* * Dump out the common processor features in a single line. * Userspace should read the hwcaps with getauxval(AT_HWCAP) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 44ca4143b013..dd6ad81d53aa 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -1,3 +1,4 @@ +#include <linux/ftrace.h> #include <linux/percpu.h> #include <linux/slab.h> #include <asm/cacheflush.h> @@ -71,6 +72,13 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) local_dbg_save(flags); /* + * Function graph tracer state gets incosistent when the kernel + * calls functions that never return (aka suspend finishers) hence + * disable graph tracing during their execution. 
+ */ + pause_graph_tracing(); + + /* * mm context saved on the stack, it will be restored when * the cpu comes out of reset through the identity mapped * page tables, so that the thread address space is properly @@ -111,6 +119,8 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) hw_breakpoint_restore(NULL); } + unpause_graph_tracing(); + /* * Restore pstate flags. OS lock and mdscr have been already * restored, so from this point onwards, debugging is fully diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index e5836138ec42..3e840649b133 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -1007,9 +1007,15 @@ el1_trap: b.ne 1f // Not an abort we care about /* This is an abort. Check for permission fault */ +alternative_if_not ARM64_WORKAROUND_834220 and x2, x1, #ESR_ELx_FSC_TYPE cmp x2, #FSC_PERM b.ne 1f // Not a permission fault +alternative_else + nop // Force a Stage-1 translation to occur + nop // and return to the guest if it failed + nop +alternative_endif /* * Check for Stage-1 page table walk, which is guaranteed diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 85c57158dcd9..648112e90ed5 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -48,7 +48,7 @@ static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) /* Note: These now point to the banked copies */ *vcpu_spsr(vcpu) = new_spsr_value; - *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; + *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; /* Branch to exception vector */ if (sctlr & (1 << 13)) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9211b8527f25..9317974c9b8e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -451,6 +451,9 @@ void __init paging_init(void) empty_zero_page = virt_to_page(zero_page); + /* Ensure the zero page is visible to the page table walker */ + dsb(ishst); + /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. 
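Aside on the EL2 debug setup change in head.S above and the reset_pmuserenr_el0 macro that follows: both now gate PMU register accesses on ID_AA64DFR0_EL1.PMUVer. For readers who do not parse the sbfx/cmp/b.lt idiom at a glance, here is a rough C restatement of that guard. It is an illustration only, not part of the patch, and the helper name is invented:

#include <linux/bitops.h>

/* ID_AA64DFR0_EL1.PMUVer lives in bits [11:8] and is treated as a signed
 * 4-bit field; the PMU registers are only touched when it is >= 1. */
static inline bool id_aa64dfr0_has_pmu(u64 dfr0)
{
	s64 pmuver = sign_extend64(dfr0 >> 8, 3);	/* sbfx x0, x0, #8, #4 */

	return pmuver >= 1;				/* cmp x0, #1 ; b.lt => skip */
}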
diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S index 4c4d93c4bf65..d69dffffaa89 100644 --- a/arch/arm64/mm/proc-macros.S +++ b/arch/arm64/mm/proc-macros.S @@ -62,3 +62,15 @@ bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH #endif .endm + +/* + * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present + */ + .macro reset_pmuserenr_el0, tmpreg + mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer + sbfx \tmpreg, \tmpreg, #8, #4 + cmp \tmpreg, #1 // Skip if no PMU present + b.lt 9000f + msr pmuserenr_el0, xzr // Disable PMU access from EL0 +9000: + .endm diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index e4ee7bd8830a..b722d3e26185 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -115,6 +115,7 @@ ENTRY(cpu_do_resume) */ ubfx x11, x11, #1, #1 msr oslar_el1, x11 + reset_pmuserenr_el0 x0 // Disable PMU access from EL0 mov x0, x12 dsb nsh // Make sure local tlb invalidation completed isb @@ -153,6 +154,7 @@ ENTRY(__cpu_setup) msr cpacr_el1, x0 // Enable FP/ASIMD mov x0, #1 << 12 // Reset mdscr_el1 and disable msr mdscr_el1, x0 // access to the DCC from EL0 + reset_pmuserenr_el0 x0 // Disable PMU access from EL0 /* * Memory region attributes for LPAE: * diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index 98a26ce82d26..aee5637ea436 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -1,7 +1,7 @@ /* * BPF JIT compiler for ARM64 * - * Copyright (C) 2014 Zi Shen Lim <zlim.lnx@gmail.com> + * Copyright (C) 2014-2015 Zi Shen Lim <zlim.lnx@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -35,6 +35,7 @@ aarch64_insn_gen_comp_branch_imm(0, offset, Rt, A64_VARIANT(sf), \ AARCH64_INSN_BRANCH_COMP_##type) #define A64_CBZ(sf, Rt, imm19) A64_COMP_BRANCH(sf, Rt, (imm19) << 2, ZERO) +#define A64_CBNZ(sf, Rt, imm19) A64_COMP_BRANCH(sf, Rt, (imm19) << 2, NONZERO) /* Conditional branch (immediate) */ #define A64_COND_BRANCH(cond, offset) \ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c047598b09e0..6217f80702d2 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1,7 +1,7 @@ /* * BPF JIT compiler for ARM64 * - * Copyright (C) 2014 Zi Shen Lim <zlim.lnx@gmail.com> + * Copyright (C) 2014-2015 Zi Shen Lim <zlim.lnx@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -225,6 +225,17 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) u8 jmp_cond; s32 jmp_offset; +#define check_imm(bits, imm) do { \ + if ((((imm) > 0) && ((imm) >> (bits))) || \ + (((imm) < 0) && (~(imm) >> (bits)))) { \ + pr_info("[%2d] imm=%d(0x%x) out of range\n", \ + i, imm, imm); \ + return -EINVAL; \ + } \ +} while (0) +#define check_imm19(imm) check_imm(19, imm) +#define check_imm26(imm) check_imm(26, imm) + switch (code) { /* dst = src */ case BPF_ALU | BPF_MOV | BPF_X: @@ -258,15 +269,33 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU64 | BPF_DIV | BPF_X: - emit(A64_UDIV(is64, dst, dst, src), ctx); - break; case BPF_ALU | BPF_MOD | BPF_X: case BPF_ALU64 | BPF_MOD | BPF_X: - ctx->tmp_used = 1; - emit(A64_UDIV(is64, tmp, dst, src), ctx); - emit(A64_MUL(is64, tmp, tmp, src), ctx); - emit(A64_SUB(is64, dst, dst, tmp), ctx); + { + const u8 r0 = bpf2a64[BPF_REG_0]; + + /* if (src 
== 0) return 0 */ + jmp_offset = 3; /* skip ahead to else path */ + check_imm19(jmp_offset); + emit(A64_CBNZ(is64, src, jmp_offset), ctx); + emit(A64_MOVZ(1, r0, 0, 0), ctx); + jmp_offset = epilogue_offset(ctx); + check_imm26(jmp_offset); + emit(A64_B(jmp_offset), ctx); + /* else */ + switch (BPF_OP(code)) { + case BPF_DIV: + emit(A64_UDIV(is64, dst, dst, src), ctx); + break; + case BPF_MOD: + ctx->tmp_used = 1; + emit(A64_UDIV(is64, tmp, dst, src), ctx); + emit(A64_MUL(is64, tmp, tmp, src), ctx); + emit(A64_SUB(is64, dst, dst, tmp), ctx); + break; + } break; + } case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU64 | BPF_LSH | BPF_X: emit(A64_LSLV(is64, dst, dst, src), ctx); @@ -393,17 +422,6 @@ emit_bswap_uxt: emit(A64_ASR(is64, dst, dst, imm), ctx); break; -#define check_imm(bits, imm) do { \ - if ((((imm) > 0) && ((imm) >> (bits))) || \ - (((imm) < 0) && (~(imm) >> (bits)))) { \ - pr_info("[%2d] imm=%d(0x%x) out of range\n", \ - i, imm, imm); \ - return -EINVAL; \ - } \ -} while (0) -#define check_imm19(imm) check_imm(19, imm) -#define check_imm26(imm) check_imm(26, imm) - /* JUMP off */ case BPF_JMP | BPF_JA: jmp_offset = bpf2a64_offset(i + off, i, ctx); diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 0c4a133f6216..26e947d61040 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -521,19 +521,6 @@ static inline u16 align_sp(unsigned int num) return num; } -static bool is_load_to_a(u16 inst) -{ - switch (inst) { - case BPF_LD | BPF_W | BPF_LEN: - case BPF_LD | BPF_W | BPF_ABS: - case BPF_LD | BPF_H | BPF_ABS: - case BPF_LD | BPF_B | BPF_ABS: - return true; - default: - return false; - } -} - static void save_bpf_jit_regs(struct jit_ctx *ctx, unsigned offset) { int i = 0, real_off = 0; @@ -614,7 +601,6 @@ static unsigned int get_stack_depth(struct jit_ctx *ctx) static void build_prologue(struct jit_ctx *ctx) { - u16 first_inst = ctx->skf->insns[0].code; int sp_off; /* Calculate the total offset for the stack pointer */ @@ -641,7 +627,7 @@ static void build_prologue(struct jit_ctx *ctx) emit_jit_reg_move(r_X, r_zero, ctx); /* Do not leak kernel data to userspace */ - if ((first_inst != (BPF_RET | BPF_K)) && !(is_load_to_a(first_inst))) + if (bpf_needs_clear_a(&ctx->skf->insns[0])) emit_jit_reg_move(r_A, r_zero, ctx); } diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 4434b54e1d87..78ae5552fdb8 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -1,6 +1,7 @@ config MN10300 def_bool y select HAVE_OPROFILE + select HAVE_UID16 select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION select HAVE_ARCH_TRACEHOOK @@ -37,9 +38,6 @@ config HIGHMEM config NUMA def_bool n -config UID16 - def_bool y - config RWSEM_GENERIC_SPINLOCK def_bool y diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h index ad6263cffb0f..d1a8d93cccfd 100644 --- a/arch/powerpc/include/asm/cmpxchg.h +++ b/arch/powerpc/include/asm/cmpxchg.h @@ -18,12 +18,12 @@ __xchg_u32(volatile void *p, unsigned long val) unsigned long prev; __asm__ __volatile__( - PPC_RELEASE_BARRIER + PPC_ATOMIC_ENTRY_BARRIER "1: lwarx %0,0,%2 \n" PPC405_ERR77(0,%2) " stwcx. %3,0,%2 \n\ bne- 1b" - PPC_ACQUIRE_BARRIER + PPC_ATOMIC_EXIT_BARRIER : "=&r" (prev), "+m" (*(volatile unsigned int *)p) : "r" (p), "r" (val) : "cc", "memory"); @@ -61,12 +61,12 @@ __xchg_u64(volatile void *p, unsigned long val) unsigned long prev; __asm__ __volatile__( - PPC_RELEASE_BARRIER + PPC_ATOMIC_ENTRY_BARRIER "1: ldarx %0,0,%2 \n" PPC405_ERR77(0,%2) " stdcx. 
%3,0,%2 \n\ bne- 1b" - PPC_ACQUIRE_BARRIER + PPC_ATOMIC_EXIT_BARRIER : "=&r" (prev), "+m" (*(volatile unsigned long *)p) : "r" (p), "r" (val) : "cc", "memory"); @@ -151,14 +151,14 @@ __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new) unsigned int prev; __asm__ __volatile__ ( - PPC_RELEASE_BARRIER + PPC_ATOMIC_ENTRY_BARRIER "1: lwarx %0,0,%2 # __cmpxchg_u32\n\ cmpw 0,%0,%3\n\ bne- 2f\n" PPC405_ERR77(0,%2) " stwcx. %4,0,%2\n\ bne- 1b" - PPC_ACQUIRE_BARRIER + PPC_ATOMIC_EXIT_BARRIER "\n\ 2:" : "=&r" (prev), "+m" (*p) @@ -197,13 +197,13 @@ __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new) unsigned long prev; __asm__ __volatile__ ( - PPC_RELEASE_BARRIER + PPC_ATOMIC_ENTRY_BARRIER "1: ldarx %0,0,%2 # __cmpxchg_u64\n\ cmpd 0,%0,%3\n\ bne- 2f\n\ stdcx. %4,0,%2\n\ bne- 1b" - PPC_ACQUIRE_BARRIER + PPC_ATOMIC_EXIT_BARRIER "\n\ 2:" : "=&r" (prev), "+m" (*p) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index a908ada8e0a5..2220f7a60def 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -108,6 +108,7 @@ #define MSR_TS_T __MASK(MSR_TS_T_LG) /* Transaction Transactional */ #define MSR_TS_MASK (MSR_TS_T | MSR_TS_S) /* Transaction State bits */ #define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0) /* Transaction active? */ +#define MSR_TM_RESV(x) (((x) & MSR_TS_MASK) == MSR_TS_MASK) /* Reserved */ #define MSR_TM_TRANSACTIONAL(x) (((x) & MSR_TS_MASK) == MSR_TS_T) #define MSR_TM_SUSPENDED(x) (((x) & MSR_TS_MASK) == MSR_TS_S) diff --git a/arch/powerpc/include/asm/synch.h b/arch/powerpc/include/asm/synch.h index e682a7143edb..c50868681f9e 100644 --- a/arch/powerpc/include/asm/synch.h +++ b/arch/powerpc/include/asm/synch.h @@ -44,7 +44,7 @@ static inline void isync(void) MAKE_LWSYNC_SECTION_ENTRY(97, __lwsync_fixup); #define PPC_ACQUIRE_BARRIER "\n" stringify_in_c(__PPC_ACQUIRE_BARRIER) #define PPC_RELEASE_BARRIER stringify_in_c(LWSYNC) "\n" -#define PPC_ATOMIC_ENTRY_BARRIER "\n" stringify_in_c(LWSYNC) "\n" +#define PPC_ATOMIC_ENTRY_BARRIER "\n" stringify_in_c(sync) "\n" #define PPC_ATOMIC_EXIT_BARRIER "\n" stringify_in_c(sync) "\n" #else #define PPC_ACQUIRE_BARRIER diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index 59dad113897b..c2d21d11c2d2 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -295,6 +295,8 @@ do { \ #define R_PPC64_TLSLD 108 #define R_PPC64_TOCSAVE 109 +#define R_PPC64_ENTRY 118 + #define R_PPC64_REL16 249 #define R_PPC64_REL16_LO 250 #define R_PPC64_REL16_HI 251 diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 68384514506b..59663af9315f 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -635,6 +635,33 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, */ break; + case R_PPC64_ENTRY: + /* + * Optimize ELFv2 large code model entry point if + * the TOC is within 2GB range of current location. 
+ */ + value = my_r2(sechdrs, me) - (unsigned long)location; + if (value + 0x80008000 > 0xffffffff) + break; + /* + * Check for the large code model prolog sequence: + * ld r2, ...(r12) + * add r2, r2, r12 + */ + if ((((uint32_t *)location)[0] & ~0xfffc) + != 0xe84c0000) + break; + if (((uint32_t *)location)[1] != 0x7c426214) + break; + /* + * If found, replace it with: + * addis r2, r12, (.TOC.-func)@ha + * addi r2, r12, (.TOC.-func)@l + */ + ((uint32_t *)location)[0] = 0x3c4c0000 + PPC_HA(value); + ((uint32_t *)location)[1] = 0x38420000 + PPC_LO(value); + break; + case R_PPC64_REL16_HA: /* Subtract location pointer */ value -= (unsigned long)location; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 75b6676c1a0b..646bf4d222c1 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -551,6 +551,24 @@ static void tm_reclaim_thread(struct thread_struct *thr, msr_diff &= MSR_FP | MSR_VEC | MSR_VSX | MSR_FE0 | MSR_FE1; } + /* + * Use the current MSR TM suspended bit to track if we have + * checkpointed state outstanding. + * On signal delivery, we'd normally reclaim the checkpointed + * state to obtain stack pointer (see:get_tm_stackpointer()). + * This will then directly return to userspace without going + * through __switch_to(). However, if the stack frame is bad, + * we need to exit this thread which calls __switch_to() which + * will again attempt to reclaim the already saved tm state. + * Hence we need to check that we've not already reclaimed + * this state. + * We do this using the current MSR, rather tracking it in + * some specific thread_struct bit, as it has the additional + * benifit of checking for a potential TM bad thing exception. + */ + if (!MSR_TM_SUSPENDED(mfmsr())) + return; + tm_reclaim(thr, thr->regs->msr, cause); /* Having done the reclaim, we now have the checkpointed diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 0dbee465af7a..ef7c24e84a62 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -875,6 +875,15 @@ static long restore_tm_user_regs(struct pt_regs *regs, return 1; #endif /* CONFIG_SPE */ + /* Get the top half of the MSR from the user context */ + if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR])) + return 1; + msr_hi <<= 32; + /* If TM bits are set to the reserved value, it's an invalid context */ + if (MSR_TM_RESV(msr_hi)) + return 1; + /* Pull in the MSR TM bits from the user context */ + regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr_hi & MSR_TS_MASK); /* Now, recheckpoint. This loads up all of the checkpointed (older) * registers, including FP and V[S]Rs. After recheckpointing, the * transactional versions should be loaded. 
@@ -884,11 +893,6 @@ static long restore_tm_user_regs(struct pt_regs *regs, current->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ tm_recheckpoint(¤t->thread, msr); - /* Get the top half of the MSR */ - if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR])) - return 1; - /* Pull in MSR TM from user context */ - regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi<<32) & MSR_TS_MASK); /* This loads the speculative FP/VEC state, if used */ if (msr & MSR_FP) { diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 20756dfb9f34..c676ecec0869 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -438,6 +438,10 @@ static long restore_tm_sigcontexts(struct pt_regs *regs, /* get MSR separately, transfer the LE bit if doing signal return */ err |= __get_user(msr, &sc->gp_regs[PT_MSR]); + /* Don't allow reserved mode. */ + if (MSR_TM_RESV(msr)) + return -EINVAL; + /* pull in MSR TM from user context */ regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9c26c5a96ea2..a7352b59e6f9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -224,6 +224,12 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) { + /* + * Check for illegal transactional state bit combination + * and if we find it, force the TS field to a safe state. + */ + if ((msr & MSR_TS_MASK) == MSR_TS_MASK) + msr &= ~MSR_TS_MASK; vcpu->arch.shregs.msr = msr; kvmppc_end_cede(vcpu); } @@ -2019,7 +2025,7 @@ static bool can_split_piggybacked_subcores(struct core_info *cip) return false; n_subcores += (cip->subcore_threads[sub] - 1) >> 1; } - if (n_subcores > 3 || large_sub < 0) + if (large_sub < 0 || !subcore_config_ok(n_subcores + 1, 2)) return false; /* diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 17cea18a09d3..264c473c1b3c 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -78,18 +78,9 @@ static void bpf_jit_build_prologue(struct bpf_prog *fp, u32 *image, PPC_LI(r_X, 0); } - switch (filter[0].code) { - case BPF_RET | BPF_K: - case BPF_LD | BPF_W | BPF_LEN: - case BPF_LD | BPF_W | BPF_ABS: - case BPF_LD | BPF_H | BPF_ABS: - case BPF_LD | BPF_B | BPF_ABS: - /* first instruction sets A register (or is RET 'constant') */ - break; - default: - /* make sure we dont leak kernel information to user */ + /* make sure we dont leak kernel information to user */ + if (bpf_needs_clear_a(&filter[0])) PPC_LI(r_A, 0); - } } static void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index 2c91ee7800b9..e96027d27151 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -43,11 +43,34 @@ static unsigned int opal_irq_count; static unsigned int *opal_irqs; static void opal_handle_irq_work(struct irq_work *work); -static __be64 last_outstanding_events; +static u64 last_outstanding_events; static struct irq_work opal_event_irq_work = { .func = opal_handle_irq_work, }; +void opal_handle_events(uint64_t events) +{ + int virq, hwirq = 0; + u64 mask = opal_event_irqchip.mask; + + if (!in_irq() && (events & mask)) { + last_outstanding_events = events; + irq_work_queue(&opal_event_irq_work); + return; + } + + while (events & mask) { + hwirq 
= fls64(events) - 1; + if (BIT_ULL(hwirq) & mask) { + virq = irq_find_mapping(opal_event_irqchip.domain, + hwirq); + if (virq) + generic_handle_irq(virq); + } + events &= ~BIT_ULL(hwirq); + } +} + static void opal_event_mask(struct irq_data *d) { clear_bit(d->hwirq, &opal_event_irqchip.mask); @@ -55,9 +78,21 @@ static void opal_event_mask(struct irq_data *d) static void opal_event_unmask(struct irq_data *d) { + __be64 events; + set_bit(d->hwirq, &opal_event_irqchip.mask); - opal_poll_events(&last_outstanding_events); + opal_poll_events(&events); + last_outstanding_events = be64_to_cpu(events); + + /* + * We can't just handle the events now with opal_handle_events(). + * If we did we would deadlock when opal_event_unmask() is called from + * handle_level_irq() with the irq descriptor lock held, because + * calling opal_handle_events() would call generic_handle_irq() and + * then handle_level_irq() which would try to take the descriptor lock + * again. Instead queue the events for later. + */ if (last_outstanding_events & opal_event_irqchip.mask) /* Need to retrigger the interrupt */ irq_work_queue(&opal_event_irq_work); @@ -96,29 +131,6 @@ static int opal_event_map(struct irq_domain *d, unsigned int irq, return 0; } -void opal_handle_events(uint64_t events) -{ - int virq, hwirq = 0; - u64 mask = opal_event_irqchip.mask; - - if (!in_irq() && (events & mask)) { - last_outstanding_events = events; - irq_work_queue(&opal_event_irq_work); - return; - } - - while (events & mask) { - hwirq = fls64(events) - 1; - if (BIT_ULL(hwirq) & mask) { - virq = irq_find_mapping(opal_event_irqchip.domain, - hwirq); - if (virq) - generic_handle_irq(virq); - } - events &= ~BIT_ULL(hwirq); - } -} - static irqreturn_t opal_interrupt(int irq, void *data) { __be64 events; @@ -131,7 +143,7 @@ static irqreturn_t opal_interrupt(int irq, void *data) static void opal_handle_irq_work(struct irq_work *work) { - opal_handle_events(be64_to_cpu(last_outstanding_events)); + opal_handle_events(last_outstanding_events); } static int opal_event_match(struct irq_domain *h, struct device_node *node, diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 4296d55e88f3..57cffb80bc36 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -278,7 +278,7 @@ static void opal_handle_message(void) /* Sanity check */ if (type >= OPAL_MSG_TYPE_MAX) { - pr_warning("%s: Unknown message type: %u\n", __func__, type); + pr_warn_once("%s: Unknown message type: %u\n", __func__, type); return; } opal_message_do_notify(type, (void *)&msg); diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index f8b9f71b9a2b..17e71d2d96e5 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -420,22 +420,9 @@ void bpf_jit_compile(struct bpf_prog *fp) } emit_reg_move(O7, r_saved_O7); - switch (filter[0].code) { - case BPF_RET | BPF_K: - case BPF_LD | BPF_W | BPF_LEN: - case BPF_LD | BPF_W | BPF_ABS: - case BPF_LD | BPF_H | BPF_ABS: - case BPF_LD | BPF_B | BPF_ABS: - /* The first instruction sets the A register (or is - * a "RET 'constant'") - */ - break; - default: - /* Make sure we dont leak kernel information to the - * user. - */ + /* Make sure we dont leak kernel information to the user. 
*/ + if (bpf_needs_clear_a(&filter[0])) emit_clear(r_A); /* A = 0 */ - } for (i = 0; i < flen; i++) { unsigned int K = filter[i].k; diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 4fa687a47a62..6b8d6e8cd449 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -27,7 +27,7 @@ #define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ -#define BOOT_HEAP_SIZE 0x8000 +#define BOOT_HEAP_SIZE 0x10000 #endif /* !CONFIG_KERNEL_BZIP2 */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 379cd3658799..bfd9b2a35a0b 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, #endif cpumask_set_cpu(cpu, mm_cpumask(next)); - /* Re-load page tables */ + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); /* Stop flush ipis for the previous mm */ @@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * schedule, protecting us from simultaneous changes. */ cpumask_set_cpu(cpu, mm_cpumask(next)); + /* * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. 
*/ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 10d0596433f8..c759b3cca663 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -19,6 +19,12 @@ static inline int paravirt_enabled(void) return pv_info.paravirt_enabled; } +static inline int paravirt_has_feature(unsigned int feature) +{ + WARN_ON_ONCE(!pv_info.paravirt_enabled); + return (pv_info.features & feature); +} + static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 31247b5bff7c..3d44191185f8 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -70,9 +70,14 @@ struct pv_info { #endif int paravirt_enabled; + unsigned int features; /* valid only if paravirt_enabled is set */ const char *name; }; +#define paravirt_has(x) paravirt_has_feature(PV_SUPPORTED_##x) +/* Supported features */ +#define PV_SUPPORTED_RTC (1<<0) + struct pv_init_ops { /* * Patch may replace one of the defined code sequences with diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 19577dd325fa..b7692daeaf92 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -472,6 +472,7 @@ static inline unsigned long current_top_of_stack(void) #else #define __cpuid native_cpuid #define paravirt_enabled() 0 +#define paravirt_has(x) 0 static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9d014b82a124..6b2c8229f9da 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -999,6 +999,17 @@ void do_machine_check(struct pt_regs *regs, long error_code) int flags = MF_ACTION_REQUIRED; int lmce = 0; + /* If this CPU is offline, just bail out. */ + if (cpu_is_offline(smp_processor_id())) { + u64 mcgstatus; + + mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (mcgstatus & MCG_STATUS_RIPV) { + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + return; + } + } + ist_enter(regs); this_cpu_inc(mce_exception_count); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 02693dd9a079..f660d63f40fe 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), }, }, + { /* Handle problems with rebooting on the iMac10,1. 
*/ + .callback = set_pci_reboot, + .ident = "Apple iMac10,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"), + }, + }, /* ASRock */ { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index cd9685235df9..4af8d063fb36 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -200,6 +200,9 @@ static __init int add_rtc_cmos(void) } #endif + if (paravirt_enabled() && !paravirt_has(RTC)) + return -ENODEV; + platform_device_register(&rtc_device); dev_info(&rtc_device.dev, "registered platform RTC device (no PNP device found)\n"); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index da52e6bb5c7f..7d2b2ed33dee 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -688,12 +688,15 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) signal_setup_done(failed, ksig, stepping); } -#ifdef CONFIG_X86_32 -#define NR_restart_syscall __NR_restart_syscall -#else /* !CONFIG_X86_32 */ -#define NR_restart_syscall \ - test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall -#endif /* CONFIG_X86_32 */ +static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) +{ +#if defined(CONFIG_X86_32) || !defined(CONFIG_X86_64) + return __NR_restart_syscall; +#else /* !CONFIG_X86_32 && CONFIG_X86_64 */ + return test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : + __NR_restart_syscall | (regs->orig_ax & __X32_SYSCALL_BIT); +#endif /* CONFIG_X86_32 || !CONFIG_X86_64 */ +} /* * Note that 'init' is a special process: it doesn't get signals it doesn't @@ -722,7 +725,7 @@ void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = NR_restart_syscall; + regs->ax = get_nr_restart_syscall(regs); regs->ip -= 2; break; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 892ee2e5ecbc..fbabe4fcc7fb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -509,7 +509,7 @@ void __inquire_remote_apic(int apicid) */ #define UDELAY_10MS_DEFAULT 10000 -static unsigned int init_udelay = INT_MAX; +static unsigned int init_udelay = UINT_MAX; static int __init cpu_init_udelay(char *str) { @@ -522,14 +522,15 @@ early_param("cpu_init_udelay", cpu_init_udelay); static void __init smp_quirk_init_udelay(void) { /* if cmdline changed it from default, leave it alone */ - if (init_udelay != INT_MAX) + if (init_udelay != UINT_MAX) return; /* if modern processor, use no delay */ if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || - ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) + ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { init_udelay = 0; - + return; + } /* else, use legacy delay */ init_udelay = UDELAY_10MS_DEFAULT; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d7f89387ba0c..22d181350ec9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1108,6 +1108,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_exception_intercept(svm, UD_VECTOR); set_exception_intercept(svm, MC_VECTOR); set_exception_intercept(svm, AC_VECTOR); + set_exception_intercept(svm, DB_VECTOR); set_intercept(svm, INTERCEPT_INTR); set_intercept(svm, INTERCEPT_NMI); @@ -1642,20 +1643,13 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_db_bp_intercept(struct kvm_vcpu *vcpu) +static void update_bp_intercept(struct kvm_vcpu 
*vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - clr_exception_intercept(svm, DB_VECTOR); clr_exception_intercept(svm, BP_VECTOR); - if (svm->nmi_singlestep) - set_exception_intercept(svm, DB_VECTOR); - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - set_exception_intercept(svm, DB_VECTOR); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) set_exception_intercept(svm, BP_VECTOR); } else @@ -1761,7 +1755,6 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_bp_intercept(&svm->vcpu); } if (svm->vcpu.guest_debug & @@ -3761,7 +3754,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_bp_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -4383,7 +4375,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .update_db_bp_intercept = update_db_bp_intercept, + .update_db_bp_intercept = update_bp_intercept, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4eae7c35ddf5..08b668cb3462 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -250,7 +250,7 @@ TRACE_EVENT(kvm_inj_virq, #define kvm_trace_sym_exc \ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(MC) + EXS(MF), EXS(AC), EXS(MC) /* * Tracepoint for kvm interrupt injection: diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 343d3692dd65..2e0bd4884652 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3644,20 +3644,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; - /* - * SMEP/SMAP is disabled if CPU is in non-paging mode - * in hardware. However KVM always uses paging mode to - * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP/SMAP needs to be - * manually disabled when guest switches to non-paging - * mode. - */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } } + if (!enable_unrestricted_guest && !is_paging(vcpu)) + /* + * SMEP/SMAP is disabled if CPU is in non-paging mode in + * hardware. However KVM always uses paging mode without + * unrestricted guest. + * To emulate this behavior, SMEP/SMAP needs to be manually + * disabled when guest switches to non-paging mode. + */ + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 43609af03283..37bbbf842350 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -942,7 +942,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS + MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, }; static unsigned num_msrs_to_save; @@ -3847,16 +3847,17 @@ static void kvm_init_msr_list(void) /* * Even MSRs that are valid in the host may not be exposed - * to the guests in some cases. 
We could work around this - * in VMX with the generic MSR save/load machinery, but it - * is not really worthwhile since it will really only - * happen with nested virtualization. + * to the guests in some cases. */ switch (msrs_to_save[i]) { case MSR_IA32_BNDCFGS: if (!kvm_x86_ops->mpx_supported()) continue; break; + case MSR_TSC_AUX: + if (!kvm_x86_ops->rdtscp_supported()) + continue; + break; default: break; } diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a0d09f6c6533..a43b2eafc466 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1414,6 +1414,7 @@ __init void lguest_init(void) pv_info.kernel_rpl = 1; /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; + pv_info.features = 0; /* * We set up all the lguest overrides for sensitive operations. These diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 71fc79a58a15..78e47ff74f9d 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -101,19 +101,19 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, switch (type) { case REG_TYPE_RM: regno = X86_MODRM_RM(insn->modrm.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_INDEX: regno = X86_SIB_INDEX(insn->sib.value); - if (X86_REX_X(insn->rex_prefix.value) == 1) + if (X86_REX_X(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_BASE: regno = X86_SIB_BASE(insn->sib.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 8ddb5d0d66fb..8f4cc3dfac32 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -161,7 +161,10 @@ void flush_tlb_current_task(void) preempt_disable(); count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + + /* This is an implicit full barrier that synchronizes with switch_mm. */ local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); - if (current->active_mm != mm) + if (current->active_mm != mm) { + /* Synchronize with switch_mm. */ + smp_mb(); + goto out; + } if (!current->mm) { leave_mm(smp_processor_id()); + + /* Synchronize with switch_mm. */ + smp_mb(); + goto out; } if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) base_pages_to_flush = (end - start) >> PAGE_SHIFT; + /* + * Both branches below are implicit full barriers (MOV to CR or + * INVLPG) that synchronize with switch_mm. + */ if (base_pages_to_flush > tlb_single_page_flush_ceiling) { base_pages_to_flush = TLB_FLUSH_ALL; count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); @@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) preempt_disable(); if (current->active_mm == mm) { - if (current->mm) + if (current->mm) { + /* + * Implicit full barrier (INVLPG) that synchronizes + * with switch_mm. + */ __flush_tlb_one(start); - else + } else { leave_mm(smp_processor_id()); + + /* Synchronize with switch_mm. 
*/ + smp_mb(); + } } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 993b7a71386d..aeb385d86e95 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1191,7 +1191,7 @@ static const struct pv_info xen_info __initconst = { #ifdef CONFIG_X86_64 .extra_user_64bit_cs = FLAT_USER_CS64, #endif - + .features = 0, .name = "Xen", }; @@ -1534,6 +1534,8 @@ asmlinkage __visible void __init xen_start_kernel(void) /* Install Xen paravirt ops */ pv_info = xen_info; + if (xen_initial_domain()) + pv_info.features |= PV_SUPPORTED_RTC; pv_init_ops = xen_init_ops; pv_apic_ops = xen_apic_ops; if (!xen_pvh_domain()) { diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index feddabdab448..4299aa924b9f 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -33,7 +33,8 @@ static void xen_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_init_shared_info(); + if (!suspend_cancelled) + xen_hvm_init_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { |
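Note on the paravirt changes above (paravirt.h, paravirt_types.h, processor.h, rtc.c, lguest and Xen): they introduce a pv_info.features bitmap so generic code can ask whether the hypervisor actually provides a CMOS RTC instead of registering one unconditionally. A minimal sketch of how the two sides fit together, assuming only the interfaces added in this series; the example_* functions are hypothetical and shown purely for illustration:

#include <linux/init.h>
#include <linux/errno.h>
#include <asm/paravirt.h>

/* Platform side: a paravirt backend that does emulate a CMOS RTC advertises
 * it during early init, as Xen now does for the initial domain. */
static void __init example_pv_platform_init(void)
{
	pv_info.paravirt_enabled = 1;
	pv_info.features |= PV_SUPPORTED_RTC;
}

/* Consumer side: mirrors the check added to add_rtc_cmos() above. */
static int __init example_rtc_probe(void)
{
	if (paravirt_enabled() && !paravirt_has(RTC))
		return -ENODEV;		/* no RTC under this hypervisor */

	/* ... register the platform RTC device ... */
	return 0;
}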