diff options
Diffstat (limited to 'arch')
-rw-r--r-- | arch/powerpc/include/asm/kvm_book3s.h | 1 | ||||
-rw-r--r-- | arch/powerpc/include/asm/kvm_book3s_asm.h | 2 | ||||
-rw-r--r-- | arch/powerpc/include/asm/kvm_host.h | 9 | ||||
-rw-r--r-- | arch/powerpc/include/asm/kvm_ppc.h | 2 | ||||
-rw-r--r-- | arch/powerpc/include/asm/ppc-opcode.h | 2 | ||||
-rw-r--r-- | arch/powerpc/include/uapi/asm/kvm.h | 6 | ||||
-rw-r--r-- | arch/powerpc/kernel/asm-offsets.c | 3 | ||||
-rw-r--r-- | arch/powerpc/kernel/mce.c | 1 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv.c | 562 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_builtin.c | 2 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_interrupts.S | 20 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_ras.c | 18 | ||||
-rw-r--r-- | arch/powerpc/kvm/book3s_hv_rmhandlers.S | 236 | ||||
-rw-r--r-- | arch/powerpc/kvm/emulate.c | 4 | ||||
-rw-r--r-- | arch/powerpc/kvm/powerpc.c | 40 |
15 files changed, 697 insertions, 211 deletions
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 2bf35017ffc0..b8d5b8e35244 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -86,7 +86,6 @@ struct kvmppc_vcore { u16 last_cpu; u8 vcore_state; u8 in_guest; - struct kvmppc_vcore *master_vcore; struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS]; struct list_head preempt_list; spinlock_t lock; diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index b148496ffe36..7cea76f11c26 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -81,7 +81,7 @@ struct kvm_split_mode { u8 subcore_size; u8 do_nap; u8 napped[MAX_SMT_THREADS]; - struct kvmppc_vcore *master_vcs[MAX_SUBCORES]; + struct kvmppc_vcore *vc[MAX_SUBCORES]; }; /* diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 50e0bc9723cc..8b3f1238d07f 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -35,6 +35,7 @@ #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/hvcall.h> +#include <asm/mce.h> #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS @@ -267,6 +268,8 @@ struct kvm_resize_hpt; struct kvm_arch { unsigned int lpid; + unsigned int smt_mode; /* # vcpus per virtual core */ + unsigned int emul_smt_mode; /* emualted SMT mode, on P9 */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE unsigned int tlb_sets; struct kvm_hpt_info hpt; @@ -285,6 +288,7 @@ struct kvm_arch { cpumask_t need_tlb_flush; cpumask_t cpu_in_guest; u8 radix; + u8 fwnmi_enabled; pgd_t *pgtable; u64 process_table; struct dentry *debugfs_dir; @@ -566,6 +570,7 @@ struct kvm_vcpu_arch { ulong wort; ulong tid; ulong psscr; + ulong hfscr; ulong shadow_srr1; #endif u32 vrsave; /* also USPRG0 */ @@ -579,7 +584,7 @@ struct kvm_vcpu_arch { ulong mcsrr0; ulong mcsrr1; ulong mcsr; - u32 dec; + ulong dec; #ifdef CONFIG_BOOKE u32 decar; #endif @@ -710,6 +715,7 @@ struct kvm_vcpu_arch { unsigned long pending_exceptions; u8 ceded; u8 prodded; + u8 doorbell_request; u32 last_inst; struct swait_queue_head *wqp; @@ -722,6 +728,7 @@ struct kvm_vcpu_arch { int prev_cpu; bool timer_running; wait_queue_head_t cpu_run; + struct machine_check_event mce_evt; /* Valid if trap == 0x200 */ struct kvm_vcpu_arch_shared *shared; #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e0d88c38602b..ba5fadd6f3c9 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -315,6 +315,8 @@ struct kvmppc_ops { struct irq_bypass_producer *); int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg); int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); + int (*set_smt_mode)(struct kvm *kvm, unsigned long mode, + unsigned long flags); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 3a8d278e7421..1a9b45198c06 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -103,6 +103,8 @@ #define OP_31_XOP_STBUX 247 #define OP_31_XOP_LHZX 279 #define OP_31_XOP_LHZUX 311 +#define OP_31_XOP_MSGSNDP 142 +#define OP_31_XOP_MSGCLRP 174 #define OP_31_XOP_MFSPR 339 #define OP_31_XOP_LWAX 341 #define OP_31_XOP_LHAX 343 diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 07fbeb927834..8cf8f0c96906 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -60,6 +60,12 @@ struct kvm_regs { #define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */ +/* flags for kvm_run.flags */ +#define KVM_RUN_PPC_NMI_DISP_MASK (3 << 0) +#define KVM_RUN_PPC_NMI_DISP_FULLY_RECOV (1 << 0) +#define KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV (2 << 0) +#define KVM_RUN_PPC_NMI_DISP_NOT_RECOV (3 << 0) + /* * Feature bits indicate which sections of the sregs struct are valid, * both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 709e23425317..ae8e89e0d083 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -485,6 +485,7 @@ int main(void) OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls); OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); OFFSET(KVM_RADIX, kvm, arch.radix); + OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled); OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); @@ -513,6 +514,7 @@ int main(void) OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); + OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc); @@ -542,6 +544,7 @@ int main(void) OFFSET(VCPU_WORT, kvm_vcpu, arch.wort); OFFSET(VCPU_TID, kvm_vcpu, arch.tid); OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr); + OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr); OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map); OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest); OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads); diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 5f9eada3519b..a9bfa49f3698 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -405,6 +405,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; } } +EXPORT_SYMBOL_GPL(machine_check_print_event_info); uint64_t get_mce_fault_addr(struct machine_check_event *evt) { diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 42b7a4fd57d9..c6313c5d331c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -46,6 +46,8 @@ #include <linux/of.h> #include <asm/reg.h> +#include <asm/ppc-opcode.h> +#include <asm/disassemble.h> #include <asm/cputable.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -645,6 +647,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, unsigned long stolen; unsigned long core_stolen; u64 now; + unsigned long flags; dt = vcpu->arch.dtl_ptr; vpa = vcpu->arch.vpa.pinned_addr; @@ -652,10 +655,10 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, core_stolen = vcore_stolen_time(vc, now); stolen = core_stolen - vcpu->arch.stolen_logged; vcpu->arch.stolen_logged = core_stolen; - spin_lock_irq(&vcpu->arch.tbacct_lock); + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); stolen += vcpu->arch.busy_stolen; vcpu->arch.busy_stolen = 0; - spin_unlock_irq(&vcpu->arch.tbacct_lock); + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); if (!dt || !vpa) return; memset(dt, 0, sizeof(struct dtl_entry)); @@ -675,6 +678,26 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, vcpu->arch.dtl.dirty = true; } +/* See if there is a doorbell interrupt pending for a vcpu */ +static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) +{ + int thr; + struct kvmppc_vcore *vc; + + if (vcpu->arch.doorbell_request) + return true; + /* + * Ensure that the read of vcore->dpdes comes after the read + * of vcpu->doorbell_request. This barrier matches the + * lwsync in book3s_hv_rmhandlers.S just before the + * fast_guest_return label. + */ + smp_rmb(); + vc = vcpu->arch.vcore; + thr = vcpu->vcpu_id - vc->first_vcpuid; + return !!(vc->dpdes & (1 << thr)); +} + static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu) { if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207) @@ -926,6 +949,101 @@ static int kvmppc_emulate_debug_inst(struct kvm_run *run, } } +static void do_nothing(void *x) +{ +} + +static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu) +{ + int thr, cpu, pcpu, nthreads; + struct kvm_vcpu *v; + unsigned long dpdes; + + nthreads = vcpu->kvm->arch.emul_smt_mode; + dpdes = 0; + cpu = vcpu->vcpu_id & ~(nthreads - 1); + for (thr = 0; thr < nthreads; ++thr, ++cpu) { + v = kvmppc_find_vcpu(vcpu->kvm, cpu); + if (!v) + continue; + /* + * If the vcpu is currently running on a physical cpu thread, + * interrupt it in order to pull it out of the guest briefly, + * which will update its vcore->dpdes value. + */ + pcpu = READ_ONCE(v->cpu); + if (pcpu >= 0) + smp_call_function_single(pcpu, do_nothing, NULL, 1); + if (kvmppc_doorbell_pending(v)) + dpdes |= 1 << thr; + } + return dpdes; +} + +/* + * On POWER9, emulate doorbell-related instructions in order to + * give the guest the illusion of running on a multi-threaded core. + * The instructions emulated are msgsndp, msgclrp, mfspr TIR, + * and mfspr DPDES. + */ +static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu) +{ + u32 inst, rb, thr; + unsigned long arg; + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tvcpu; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return EMULATE_FAIL; + if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE) + return RESUME_GUEST; + if (get_op(inst) != 31) + return EMULATE_FAIL; + rb = get_rb(inst); + thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1); + switch (get_xop(inst)) { + case OP_31_XOP_MSGSNDP: + arg = kvmppc_get_gpr(vcpu, rb); + if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER) + break; + arg &= 0x3f; + if (arg >= kvm->arch.emul_smt_mode) + break; + tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg); + if (!tvcpu) + break; + if (!tvcpu->arch.doorbell_request) { + tvcpu->arch.doorbell_request = 1; + kvmppc_fast_vcpu_kick_hv(tvcpu); + } + break; + case OP_31_XOP_MSGCLRP: + arg = kvmppc_get_gpr(vcpu, rb); + if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER) + break; + vcpu->arch.vcore->dpdes = 0; + vcpu->arch.doorbell_request = 0; + break; + case OP_31_XOP_MFSPR: + switch (get_sprn(inst)) { + case SPRN_TIR: + arg = thr; + break; + case SPRN_DPDES: + arg = kvmppc_read_dpdes(vcpu); + break; + default: + return EMULATE_FAIL; + } + kvmppc_set_gpr(vcpu, get_rt(inst), arg); + break; + default: + return EMULATE_FAIL; + } + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); + return RESUME_GUEST; +} + static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { @@ -971,15 +1089,20 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: - /* - * Deliver a machine check interrupt to the guest. - * We have to do this, even if the host has handled the - * machine check, because machine checks use SRR0/1 and - * the interrupt might have trashed guest state in them. - */ - kvmppc_book3s_queue_irqprio(vcpu, - BOOK3S_INTERRUPT_MACHINE_CHECK); - r = RESUME_GUEST; + /* Exit to guest with KVM_EXIT_NMI as exit reason */ + run->exit_reason = KVM_EXIT_NMI; + run->hw.hardware_exit_reason = vcpu->arch.trap; + /* Clear out the old NMI status from run->flags */ + run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK; + /* Now set the NMI status */ + if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED) + run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV; + else + run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV; + + r = RESUME_HOST; + /* Print the MCE event to host console. */ + machine_check_print_event_info(&vcpu->arch.mce_evt, false); break; case BOOK3S_INTERRUPT_PROGRAM: { @@ -1048,12 +1171,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; /* * This occurs if the guest (kernel or userspace), does something that - * is prohibited by HFSCR. We just generate a program interrupt to - * the guest. + * is prohibited by HFSCR. + * On POWER9, this could be a doorbell instruction that we need + * to emulate. + * Otherwise, we just generate a program interrupt to the guest. */ case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - r = RESUME_GUEST; + r = EMULATE_FAIL; + if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) + r = kvmppc_emulate_doorbell_instr(vcpu); + if (r == EMULATE_FAIL) { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + r = RESUME_GUEST; + } break; case BOOK3S_INTERRUPT_HV_RM_HARD: r = RESUME_PASSTHROUGH; @@ -1143,6 +1273,12 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; if (cpu_has_feature(CPU_FTR_ARCH_207S)) mask |= LPCR_AIL; + /* + * On POWER9, allow userspace to enable large decrementer for the + * guest, whether or not the host has it enabled. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + mask |= LPCR_LD; /* Broken 32-bit version of LPCR must not clear top bits */ if (preserve_top32) @@ -1486,6 +1622,14 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); break; case KVM_REG_PPC_TB_OFFSET: + /* + * POWER9 DD1 has an erratum where writing TBU40 causes + * the timebase to lose ticks. So we don't let the + * timebase offset be changed on P9 DD1. (It is + * initialized to zero.) + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + break; /* round up to multiple of 2^24 */ vcpu->arch.vcore->tb_offset = ALIGN(set_reg_val(id, *val), 1UL << 24); @@ -1603,7 +1747,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) init_swait_queue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; - vcore->first_vcpuid = core * threads_per_vcore(); + vcore->first_vcpuid = core * kvm->arch.smt_mode; vcore->kvm = kvm; INIT_LIST_HEAD(&vcore->preempt_list); @@ -1762,14 +1906,10 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; - int err = -EINVAL; + int err; int core; struct kvmppc_vcore *vcore; - core = id / threads_per_vcore(); - if (core >= KVM_MAX_VCORES) - goto out; - err = -ENOMEM; vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) @@ -1800,6 +1940,20 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, vcpu->arch.busy_preempt = TB_NIL; vcpu->arch.intr_msr = MSR_SF | MSR_ME; + /* + * Set the default HFSCR for the guest from the host value. + * This value is only used on POWER9. + * On POWER9 DD1, TM doesn't work, so we make sure to + * prevent the guest from using it. + * On POWER9, we want to virtualize the doorbell facility, so we + * turn off the HFSCR bit, which causes those instructions to trap. + */ + vcpu->arch.hfscr = mfspr(SPRN_HFSCR); + if (!cpu_has_feature(CPU_FTR_TM)) + vcpu->arch.hfscr &= ~HFSCR_TM; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + vcpu->arch.hfscr &= ~HFSCR_MSGP; + kvmppc_mmu_book3s_hv_init(vcpu); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; @@ -1807,11 +1961,17 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, init_waitqueue_head(&vcpu->arch.cpu_run); mutex_lock(&kvm->lock); - vcore = kvm->arch.vcores[core]; - if (!vcore) { - vcore = kvmppc_vcore_create(kvm, core); - kvm->arch.vcores[core] = vcore; - kvm->arch.online_vcores++; + vcore = NULL; + err = -EINVAL; + core = id / kvm->arch.smt_mode; + if (core < KVM_MAX_VCORES) { + vcore = kvm->arch.vcores[core]; + if (!vcore) { + err = -ENOMEM; + vcore = kvmppc_vcore_create(kvm, core); + kvm->arch.vcores[core] = vcore; + kvm->arch.online_vcores++; + } } mutex_unlock(&kvm->lock); @@ -1839,6 +1999,43 @@ out: return ERR_PTR(err); } +static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode, + unsigned long flags) +{ + int err; + int esmt = 0; + + if (flags) + return -EINVAL; + if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode)) + return -EINVAL; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + /* + * On POWER8 (or POWER7), the threading mode is "strict", + * so we pack smt_mode vcpus per vcore. + */ + if (smt_mode > threads_per_subcore) + return -EINVAL; + } else { + /* + * On POWER9, the threading mode is "loose", + * so each vcpu gets its own vcore. + */ + esmt = smt_mode; + smt_mode = 1; + } + mutex_lock(&kvm->lock); + err = -EBUSY; + if (!kvm->arch.online_vcores) { + kvm->arch.smt_mode = smt_mode; + kvm->arch.emul_smt_mode = esmt; + err = 0; + } + mutex_unlock(&kvm->lock); + + return err; +} + static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) { if (vpa->pinned_addr) @@ -1889,7 +2086,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu) } } -extern void __kvmppc_vcore_entry(void); +extern int __kvmppc_vcore_entry(void); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu) @@ -1954,10 +2151,6 @@ static void kvmppc_release_hwthread(int cpu) tpaca->kvm_hstate.kvm_split_mode = NULL; } -static void do_nothing(void *x) -{ -} - static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) { int i; @@ -1975,11 +2168,35 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) smp_call_function_single(cpu + i, do_nothing, NULL, 1); } +static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) +{ + struct kvm *kvm = vcpu->kvm; + + /* + * With radix, the guest can do TLB invalidations itself, + * and it could choose to use the local form (tlbiel) if + * it is invalidating a translation that has only ever been + * used on one vcpu. However, that doesn't mean it has + * only ever been used on one physical cpu, since vcpus + * can move around between pcpus. To cope with this, when + * a vcpu moves from one pcpu to another, we need to tell + * any vcpus running on the same core as this vcpu previously + * ran to flush the TLB. The TLB is shared between threads, + * so we use a single bit in .need_tlb_flush for all 4 threads. + */ + if (vcpu->arch.prev_cpu != pcpu) { + if (vcpu->arch.prev_cpu >= 0 && + cpu_first_thread_sibling(vcpu->arch.prev_cpu) != + cpu_first_thread_sibling(pcpu)) + radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); + vcpu->arch.prev_cpu = pcpu; + } +} + static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { int cpu; struct paca_struct *tpaca; - struct kvmppc_vcore *mvc = vc->master_vcore; struct kvm *kvm = vc->kvm; cpu = vc->pcpu; @@ -1989,36 +2206,16 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) vcpu->arch.timer_running = 0; } cpu += vcpu->arch.ptid; - vcpu->cpu = mvc->pcpu; + vcpu->cpu = vc->pcpu; vcpu->arch.thread_cpu = cpu; - - /* - * With radix, the guest can do TLB invalidations itself, - * and it could choose to use the local form (tlbiel) if - * it is invalidating a translation that has only ever been - * used on one vcpu. However, that doesn't mean it has - * only ever been used on one physical cpu, since vcpus - * can move around between pcpus. To cope with this, when - * a vcpu moves from one pcpu to another, we need to tell - * any vcpus running on the same core as this vcpu previously - * ran to flush the TLB. The TLB is shared between threads, - * so we use a single bit in .need_tlb_flush for all 4 threads. - */ - if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) { - if (vcpu->arch.prev_cpu >= 0 && - cpu_first_thread_sibling(vcpu->arch.prev_cpu) != - cpu_first_thread_sibling(cpu)) - radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); - vcpu->arch.prev_cpu = cpu; - } cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); } tpaca = &paca[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; - tpaca->kvm_hstate.ptid = cpu - mvc->pcpu; + tpaca->kvm_hstate.ptid = cpu - vc->pcpu; /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */ smp_wmb(); - tpaca->kvm_hstate.kvm_vcore = mvc; + tpaca->kvm_hstate.kvm_vcore = vc; if (cpu != smp_processor_id()) kvmppc_ipi_thread(cpu); } @@ -2147,8 +2344,7 @@ struct core_info { int max_subcore_threads; int total_threads; int subcore_threads[MAX_SUBCORES]; - struct kvm *subcore_vm[MAX_SUBCORES]; - struct list_head vcs[MAX_SUBCORES]; + struct kvmppc_vcore *vc[MAX_SUBCORES]; }; /* @@ -2159,17 +2355,12 @@ static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 }; static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) { - int sub; - memset(cip, 0, sizeof(*cip)); cip->n_subcores = 1; cip->max_subcore_threads = vc->num_threads; cip->total_threads = vc->num_threads; cip->subcore_threads[0] = vc->num_threads; - cip->subcore_vm[0] = vc->kvm; - for (sub = 0; sub < MAX_SUBCORES; ++sub) - INIT_LIST_HEAD(&cip->vcs[sub]); - list_add_tail(&vc->preempt_list, &cip->vcs[0]); + cip->vc[0] = vc; } static bool subcore_config_ok(int n_subcores, int n_threads) @@ -2189,9 +2380,8 @@ static bool subcore_config_ok(int n_subcores, int n_threads) return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS; } -static void init_master_vcore(struct kvmppc_vcore *vc) +static void init_vcore_to_run(struct kvmppc_vcore *vc) { - vc->master_vcore = vc; vc->entry_exit_map = 0; vc->in_guest = 0; vc->napping_threads = 0; @@ -2216,9 +2406,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) ++cip->n_subcores; cip->total_threads += vc->num_threads; cip->subcore_threads[sub] = vc->num_threads; - cip->subcore_vm[sub] = vc->kvm; - init_master_vcore(vc); - list_move_tail(&vc->preempt_list, &cip->vcs[sub]); + cip->vc[sub] = vc; + init_vcore_to_run(vc); + list_del_init(&vc->preempt_list); return true; } @@ -2286,6 +2476,18 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) spin_unlock(&lp->lock); } +static bool recheck_signals(struct core_info *cip) +{ + int sub, i; + struct kvm_vcpu *vcpu; + + for (sub = 0; sub < cip->n_subcores; ++sub) + for_each_runnable_thread(i, vcpu, cip->vc[sub]) + if (signal_pending(vcpu->arch.run_task)) + return true; + return false; +} + static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) { int still_running = 0, i; @@ -2323,7 +2525,6 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) wake_up(&vcpu->arch.cpu_run); } } - list_del_init(&vc->preempt_list); if (!is_master) { if (still_running > 0) { kvmppc_vcore_preempt(vc); @@ -2385,6 +2586,21 @@ static inline int kvmppc_set_host_core(unsigned int cpu) return 0; } +static void set_irq_happened(int trap) +{ + switch (trap) { + case BOOK3S_INTERRUPT_EXTERNAL: + local_paca->irq_happened |= PACA_IRQ_EE; + break; + case BOOK3S_INTERRUPT_H_DOORBELL: + local_paca->irq_happened |= PACA_IRQ_DBELL; + break; + case BOOK3S_INTERRUPT_HMI: + local_paca->irq_happened |= PACA_IRQ_HMI; + break; + } +} + /* * Run a set of guest threads on a physical core. * Called with vc->lock held. @@ -2395,7 +2611,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) int i; int srcu_idx; struct core_info core_info; - struct kvmppc_vcore *pvc, *vcnext; + struct kvmppc_vcore *pvc; struct kvm_split_mode split_info, *sip; int split, subcore_size, active; int sub; @@ -2404,6 +2620,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) int pcpu, thr; int target_threads; int controlled_threads; + int trap; /* * Remove from the list any threads that have a signal pending @@ -2418,7 +2635,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* * Initialize *vc. */ - init_master_vcore(vc); + init_vcore_to_run(vc); vc->preempt_tb = TB_NIL; /* @@ -2455,6 +2672,43 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) if (vc->num_threads < target_threads) collect_piggybacks(&core_info, target_threads); + /* + * On radix, arrange for TLB flushing if necessary. + * This has to be done before disabling interrupts since + * it uses smp_call_function(). + */ + pcpu = smp_processor_id(); + if (kvm_is_radix(vc->kvm)) { + for (sub = 0; sub < core_info.n_subcores; ++sub) + for_each_runnable_thread(i, vcpu, core_info.vc[sub]) + kvmppc_prepare_radix_vcpu(vcpu, pcpu); + } + + /* + * Hard-disable interrupts, and check resched flag and signals. + * If we need to reschedule or deliver a signal, clean up + * and return without going into the guest(s). + */ + local_irq_disable(); + hard_irq_disable(); + if (lazy_irq_pending() || need_resched() || + recheck_signals(&core_info)) { + local_irq_enable(); + vc->vcore_state = VCORE_INACTIVE; + /* Unlock all except the primary vcore */ + for (sub = 1; sub < core_info.n_subcores; ++sub) { + pvc = core_info.vc[sub]; + /* Put back on to the preempted vcores list */ + kvmppc_vcore_preempt(pvc); + spin_unlock(&pvc->lock); + } + for (i = 0; i < controlled_threads; ++i) + kvmppc_release_hwthread(pcpu + i); + return; + } + + kvmppc_clear_host_core(pcpu); + /* Decide on micro-threading (split-core) mode */ subcore_size = threads_per_subcore; cmd_bit = stat_bit = 0; @@ -2478,13 +2732,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) split_info.ldbar = mfspr(SPRN_LDBAR); split_info.subcore_size = subcore_size; for (sub = 0; sub < core_info.n_subcores; ++sub) - split_info.master_vcs[sub] = - list_first_entry(&core_info.vcs[sub], - struct kvmppc_vcore, preempt_list); + split_info.vc[sub] = core_info.vc[sub]; /* order writes to split_info before kvm_split_mode pointer */ smp_wmb(); } - pcpu = smp_processor_id(); for (thr = 0; thr < controlled_threads; ++thr) paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; @@ -2504,32 +2755,29 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } } - kvmppc_clear_host_core(pcpu); - /* Start all the threads */ active = 0; for (sub = 0; sub < core_info.n_subcores; ++sub) { thr = subcore_thread_map[sub]; thr0_done = false; active |= 1 << thr; - list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) { - pvc->pcpu = pcpu + thr; - for_each_runnable_thread(i, vcpu, pvc) { - kvmppc_start_thread(vcpu, pvc); - kvmppc_create_dtl_entry(vcpu, pvc); - trace_kvm_guest_enter(vcpu); - if (!vcpu->arch.ptid) - thr0_done = true; - active |= 1 << (thr + vcpu->arch.ptid); - } - /* - * We need to start the first thread of each subcore - * even if it doesn't have a vcpu. - */ - if (pvc->master_vcore == pvc && !thr0_done) - kvmppc_start_thread(NULL, pvc); - thr += pvc->num_threads; + pvc = core_info.vc[sub]; + pvc->pcpu = pcpu + thr; + for_each_runnable_thread(i, vcpu, pvc) { + kvmppc_start_thread(vcpu, pvc); + kvmppc_create_dtl_entry(vcpu, pvc); + trace_kvm_guest_enter(vcpu); + if (!vcpu->arch.ptid) + thr0_done = true; + active |= 1 << (thr + vcpu->arch.ptid); } + /* + * We need to start the first thread of each subcore + * even if it doesn't have a vcpu. + */ + if (!thr0_done) + kvmppc_start_thread(NULL, pvc); + thr += pvc->num_threads; } /* @@ -2556,17 +2804,27 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) trace_kvmppc_run_core(vc, 0); for (sub = 0; sub < core_info.n_subcores; ++sub) - list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) - spin_unlock(&pvc->lock); + spin_unlock(&core_info.vc[sub]->lock); + + /* + * Interrupts will be enabled once we get into the guest, + * so tell lockdep that we're about to enable interrupts. + */ + trace_hardirqs_on(); guest_enter(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); - __kvmppc_vcore_entry(); + trap = __kvmppc_vcore_entry(); srcu_read_unlock(&vc->kvm->srcu, srcu_idx); + guest_exit(); + + trace_hardirqs_off(); + set_irq_happened(trap); + spin_lock(&vc->lock); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ vc->vcore_state = VCORE_EXITING; @@ -2594,6 +2852,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) split_info.do_nap = 0; } + kvmppc_set_host_core(pcpu); + + local_irq_enable(); + /* Let secondaries go back to the offline loop */ for (i = 0; i < controlled_threads; ++i) { kvmppc_release_hwthread(pcpu + i); @@ -2602,18 +2864,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); } - kvmppc_set_host_core(pcpu); - spin_unlock(&vc->lock); /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); - guest_exit(); - for (sub = 0; sub < core_info.n_subcores; ++sub) - list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], - preempt_list) - post_guest_process(pvc, pvc == vc); + for (sub = 0; sub < core_info.n_subcores; ++sub) { + pvc = core_info.vc[sub]; + post_guest_process(pvc, pvc == vc); + } spin_lock(&vc->lock); preempt_enable(); @@ -2658,6 +2917,30 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) vc->halt_poll_ns /= halt_poll_ns_shrink; } +#ifdef CONFIG_KVM_XICS +static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) +{ + if (!xive_enabled()) + return false; + return vcpu->arch.xive_saved_state.pipr < + vcpu->arch.xive_saved_state.cppr; +} +#else +static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) +{ + return false; +} +#endif /* CONFIG_KVM_XICS */ + +static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.pending_exceptions || vcpu->arch.prodded || + kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu)) + return true; + + return false; +} + /* * Check to see if any of the runnable vcpus on the vcore have pending * exceptions or are no longer ceded @@ -2668,8 +2951,7 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) int i; for_each_runnable_thread(i, vcpu, vc) { - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded || - vcpu->arch.prodded) + if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu)) return 1; } @@ -2811,15 +3093,14 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) */ if (!signal_pending(current)) { if (vc->vcore_state == VCORE_PIGGYBACK) { - struct kvmppc_vcore *mvc = vc->master_vcore; - if (spin_trylock(&mvc->lock)) { - if (mvc->vcore_state == VCORE_RUNNING && - !VCORE_IS_EXITING(mvc)) { + if (spin_trylock(&vc->lock)) { + if (vc->vcore_state == VCORE_RUNNING && + !VCORE_IS_EXITING(vc)) { kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } - spin_unlock(&mvc->lock); + spin_unlock(&vc->lock); } } else if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) { @@ -2855,7 +3136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) break; n_ceded = 0; for_each_runnable_thread(i, v, vc) { - if (!v->arch.pending_exceptions && !v->arch.prodded) + if (!kvmppc_vcpu_woken(v)) n_ceded += v->arch.ceded; else v->arch.ceded = 0; @@ -2907,12 +3188,36 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; int srcu_idx; + unsigned long ebb_regs[3] = {}; /* shut up GCC */ + unsigned long user_tar = 0; + unsigned int user_vrsave; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return -EINVAL; } + /* + * Don't allow entry with a suspended transaction, because + * the guest entry/exit code will lose it. + * If the guest has TM enabled, save away their TM-related SPRs + * (they will get restored by the TM unavailable interrupt). + */ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs && + (current->thread.regs->msr & MSR_TM)) { + if (MSR_TM_ACTIVE(current->thread.regs->msr)) { + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + run->fail_entry.hardware_entry_failure_reason = 0; + return -EINVAL; + } + current->thread.tm_tfhar = mfspr(SPRN_TFHAR); + current->thread.tm_tfiar = mfspr(SPRN_TFIAR); + current->thread.tm_texasr = mfspr(SPRN_TEXASR); + current->thread.regs->msr &= ~MSR_TM; + } +#endif + kvmppc_core_prepare_to_enter(vcpu); /* No need to go into the guest when all we'll do is come back out */ @@ -2934,6 +3239,15 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) flush_all_to_thread(current); + /* Save userspace EBB and other register values */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + ebb_regs[0] = mfspr(SPRN_EBBHR); + ebb_regs[1] = mfspr(SPRN_EBBRR); + ebb_regs[2] = mfspr(SPRN_BESCR); + user_tar = mfspr(SPRN_TAR); + } + user_vrsave = mfspr(SPRN_VRSAVE); + vcpu->arch.wqp = &vcpu->arch.vcore->wq; vcpu->arch.pgdir = current->mm->pgd; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; @@ -2960,6 +3274,16 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) } } while (is_kvmppc_resume_guest(r)); + /* Restore userspace EBB and other register values */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + mtspr(SPRN_EBBHR, ebb_regs[0]); + mtspr(SPRN_EBBRR, ebb_regs[1]); + mtspr(SPRN_BESCR, ebb_regs[2]); + mtspr(SPRN_TAR, user_tar); + mtspr(SPRN_FSCR, current->thread.fscr); + } + mtspr(SPRN_VRSAVE, user_vrsave); + out: vcpu->arch.state = KVMPPC_VCPU_NOTREADY; atomic_dec(&vcpu->kvm->arch.vcpus_running); @@ -3468,6 +3792,19 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvm_hv_vm_activated(); /* + * Initialize smt_mode depending on processor. + * POWER8 and earlier have to use "strict" threading, where + * all vCPUs in a vcore have to run on the same (sub)core, + * whereas on POWER9 the threads can each run a different + * guest. + */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm->arch.smt_mode = threads_per_subcore; + else + kvm->arch.smt_mode = 1; + kvm->arch.emul_smt_mode = 1; + + /* * Create a debugfs directory for the VM */ snprintf(buf, sizeof(buf), "vm%d", current->pid); @@ -3896,6 +4233,7 @@ static struct kvmppc_ops kvm_ops_hv = { #endif .configure_mmu = kvmhv_configure_mmu, .get_rmmu_info = kvmhv_get_rmmu_info, + .set_smt_mode = kvmhv_set_smt_mode, }; static int kvm_init_subcore_bitmap(void) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index ee4c2558c305..90644db9d38e 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -307,7 +307,7 @@ void kvmhv_commence_exit(int trap) return; for (i = 0; i < MAX_SUBCORES; ++i) { - vc = sip->master_vcs[i]; + vc = sip->vc[i]; if (!vc) break; do { diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 0fdc4a28970b..dc54373c8780 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -61,13 +61,6 @@ BEGIN_FTR_SECTION std r3, HSTATE_DABR(r13) END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) - /* Hard-disable interrupts */ - mfmsr r10 - std r10, HSTATE_HOST_MSR(r13) - rldicl r10,r10,48,1 - rotldi r10,r10,16 - mtmsrd r10,1 - /* Save host PMU registers */ BEGIN_FTR_SECTION /* Work around P8 PMAE bug */ @@ -121,10 +114,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * Put whatever is in the decrementer into the * hypervisor decrementer. */ +BEGIN_FTR_SECTION + ld r5, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_KVM(r5) + ld r9, KVM_HOST_LPCR(r6) + andis. r9, r9, LPCR_LD@h +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mfspr r8,SPRN_DEC mftb r7 - mtspr SPRN_HDEC,r8 +BEGIN_FTR_SECTION + /* On POWER9, don't sign-extend if host LPCR[LD] bit is set */ + bne 32f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) extsw r8,r8 +32: mtspr SPRN_HDEC,r8 add r8,r8,r7 std r8,HSTATE_DECEXP(r13) @@ -143,6 +146,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * * R1 = host R1 * R2 = host R2 + * R3 = trap number on this thread * R12 = exit handler id * R13 = PACA */ diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 7ef0993214f3..c356f9a40b24 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -130,12 +130,28 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) out: /* + * For guest that supports FWNMI capability, hook the MCE event into + * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI + * exit reason. On our way to exit we will pull this event from vcpu + * structure and print it from thread 0 of the core/subcore. + * + * For guest that does not support FWNMI capability (old QEMU): * We are now going enter guest either through machine check * interrupt (for unhandled errors) or will continue from * current HSRR0 (for handled errors) in guest. Hence * queue up the event so that we can log it from host console later. */ - machine_check_queue_event(); + if (vcpu->kvm->arch.fwnmi_enabled) { + /* + * Hook up the mce event on to vcpu structure. + * First clear the old event. + */ + memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt)); + if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) { + vcpu->arch.mce_evt = mce_evt; + } + } else + machine_check_queue_event(); return handled; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index bdb3f76ceb6b..6ea4b53f4b16 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -32,12 +32,30 @@ #include <asm/opal.h> #include <asm/xive-regs.h> +/* Sign-extend HDEC if not on POWER9 */ +#define EXTEND_HDEC(reg) \ +BEGIN_FTR_SECTION; \ + extsw reg, reg; \ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) /* Values in HSTATE_NAPPING(r13) */ #define NAPPING_CEDE 1 #define NAPPING_NOVCPU 2 +/* Stack frame offsets for kvmppc_hv_entry */ +#define SFS 160 +#define STACK_SLOT_TRAP (SFS-4) +#define STACK_SLOT_TID (SFS-16) +#define STACK_SLOT_PSSCR (SFS-24) +#define STACK_SLOT_PID (SFS-32) +#define STACK_SLOT_IAMR (SFS-40) +#define STACK_SLOT_CIABR (SFS-48) +#define STACK_SLOT_DAWR (SFS-56) +#define STACK_SLOT_DAWRX (SFS-64) +#define STACK_SLOT_HFSCR (SFS-72) + /* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. @@ -51,6 +69,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline) std r0, PPC_LR_STKOFF(r1) stdu r1, -112(r1) mfmsr r10 + std r10, HSTATE_HOST_MSR(r13) LOAD_REG_ADDR(r5, kvmppc_call_hv_entry) li r0,MSR_RI andc r0,r10,r0 @@ -135,20 +154,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) stb r0, HSTATE_HWTHREAD_REQ(r13) /* - * For external and machine check interrupts, we need - * to call the Linux handler to process the interrupt. - * We do that by jumping to absolute address 0x500 for - * external interrupts, or the machine_check_fwnmi label - * for machine checks (since firmware might have patched - * the vector area at 0x200). The [h]rfid at the end of the - * handler will return to the book3s_hv_interrupts.S code. - * For other interrupts we do the rfid to get back - * to the book3s_hv_interrupts.S code here. + * For external interrupts we need to call the Linux + * handler to process the interrupt. We do that by jumping + * to absolute address 0x500 for external interrupts. + * The [h]rfid at the end of the handler will return to + * the book3s_hv_interrupts.S code. For other interrupts + * we do the rfid to get back to the book3s_hv_interrupts.S + * code here. */ ld r8, 112+PPC_LR_STKOFF(r1) addi r1, r1, 112 ld r7, HSTATE_HOST_MSR(r13) + /* Return the trap number on this thread as the return value */ + mr r3, r12 + /* * If we came back from the guest via a relocation-on interrupt, * we will be in virtual mode at this point, which makes it a @@ -158,62 +178,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) andi. r0, r0, MSR_IR /* in real mode? */ bne .Lvirt_return - cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - beq 11f - cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL - beq 15f /* Invoke the H_DOORBELL handler */ - cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI - beq cr2, 14f /* HMI check */ - - /* RFI into the highmem handler, or branch to interrupt handler */ + /* RFI into the highmem handler */ mfmsr r6 li r0, MSR_RI andc r6, r6, r0 mtmsrd r6, 1 /* Clear RI in MSR */ mtsrr0 r8 mtsrr1 r7 - beq cr1, 13f /* machine check */ RFI - /* On POWER7, we have external interrupts set to use HSRR0/1 */ -11: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - ba 0x500 - -13: b machine_check_fwnmi - -14: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - b hmi_exception_after_realmode - -15: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - ba 0xe80 - - /* Virtual-mode return - can't get here for HMI or machine check */ + /* Virtual-mode return */ .Lvirt_return: - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - beq 16f - cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL - beq 17f - andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */ - beq 18f - mtmsrd r7, 1 /* if so then re-enable them */ -18: mtlr r8 + mtlr r8 blr -16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */ - mtspr SPRN_HSRR1, r7 - b exc_virt_0x4500_hardware_interrupt - -17: mtspr SPRN_HSRR0, r8 - mtspr SPRN_HSRR1, r7 - b exc_virt_0x4e80_h_doorbell - kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ + /* HDEC may be larger than DEC for arch >= v3.00, but since the */ + /* HDEC value came from DEC in the first place, it will fit */ mfspr r3, SPRN_HDEC mtspr SPRN_DEC, r3 /* @@ -295,8 +278,9 @@ kvm_novcpu_wakeup: /* See if our timeslice has expired (HDEC is negative) */ mfspr r0, SPRN_HDEC + EXTEND_HDEC(r0) li r12, BOOK3S_INTERRUPT_HV_DECREMENTER - cmpwi r0, 0 + cmpdi r0, 0 blt kvm_novcpu_exit /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ @@ -319,10 +303,10 @@ kvm_novcpu_exit: bl kvmhv_accumulate_time #endif 13: mr r3, r12 - stw r12, 112-4(r1) + stw r12, STACK_SLOT_TRAP(r1) bl kvmhv_commence_exit nop - lwz r12, 112-4(r1) + lwz r12, STACK_SLOT_TRAP(r1) b kvmhv_switch_to_host /* @@ -390,8 +374,8 @@ kvm_secondary_got_guest: lbz r4, HSTATE_PTID(r13) cmpwi r4, 0 bne 63f - lis r6, 0x7fff - ori r6, r6, 0xffff + LOAD_REG_ADDR(r6, decrementer_max) + ld r6, 0(r6) mtspr SPRN_HDEC, r6 /* and set per-LPAR registers, if doing dynamic micro-threading */ ld r6, HSTATE_SPLIT_MODE(r13) @@ -545,11 +529,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * * *****************************************************************************/ -/* Stack frame offsets */ -#define STACK_SLOT_TID (112-16) -#define STACK_SLOT_PSSCR (112-24) -#define STACK_SLOT_PID (112-32) - .global kvmppc_hv_entry kvmppc_hv_entry: @@ -565,7 +544,7 @@ kvmppc_hv_entry: */ mflr r0 std r0, PPC_LR_STKOFF(r1) - stdu r1, -112(r1) + stdu r1, -SFS(r1) /* Save R1 in the PACA */ std r1, HSTATE_HOST_R1(r13) @@ -749,10 +728,22 @@ BEGIN_FTR_SECTION mfspr r5, SPRN_TIDR mfspr r6, SPRN_PSSCR mfspr r7, SPRN_PID + mfspr r8, SPRN_IAMR std r5, STACK_SLOT_TID(r1) std r6, STACK_SLOT_PSSCR(r1) std r7, STACK_SLOT_PID(r1) + std r8, STACK_SLOT_IAMR(r1) + mfspr r5, SPRN_HFSCR + std r5, STACK_SLOT_HFSCR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) +BEGIN_FTR_SECTION + mfspr r5, SPRN_CIABR + mfspr r6, SPRN_DAWR + mfspr r7, SPRN_DAWRX + std r5, STACK_SLOT_CIABR(r1) + std r6, STACK_SLOT_DAWR(r1) + std r7, STACK_SLOT_DAWRX(r1) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION /* Set partition DABR */ @@ -895,8 +886,10 @@ FTR_SECTION_ELSE ld r5, VCPU_TID(r4) ld r6, VCPU_PSSCR(r4) oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */ + ld r7, VCPU_HFSCR(r4) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 + mtspr SPRN_HFSCR, r7 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 8: @@ -911,7 +904,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) mftb r7 subf r3,r7,r8 mtspr SPRN_DEC,r3 - stw r3,VCPU_DEC(r4) + std r3,VCPU_DEC(r4) ld r5, VCPU_SPRG0(r4) ld r6, VCPU_SPRG1(r4) @@ -968,7 +961,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) /* Check if HDEC expires soon */ mfspr r3, SPRN_HDEC - cmpwi r3, 512 /* 1 microsecond */ + EXTEND_HDEC(r3) + cmpdi r3, 512 /* 1 microsecond */ blt hdec_soon #ifdef CONFIG_KVM_XICS @@ -1022,7 +1016,13 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ li r0, BOOK3S_INTERRUPT_EXTERNAL bne cr1, 12f mfspr r0, SPRN_DEC - cmpwi r0, 0 +BEGIN_FTR_SECTION + /* On POWER9 check whether the guest has large decrementer enabled */ + andis. r8, r8, LPCR_LD@h + bne 15f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + extsw r0, r0 +15: cmpdi r0, 0 li r0, BOOK3S_INTERRUPT_DECREMENTER bge 5f @@ -1032,6 +1032,23 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ mr r9, r4 bl kvmppc_msr_interrupt 5: +BEGIN_FTR_SECTION + b fast_guest_return +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + /* On POWER9, check for pending doorbell requests */ + lbz r0, VCPU_DBELL_REQ(r4) + cmpwi r0, 0 + beq fast_guest_return + ld r5, HSTATE_KVM_VCORE(r13) + /* Set DPDES register so the CPU will take a doorbell interrupt */ + li r0, 1 + mtspr SPRN_DPDES, r0 + std r0, VCORE_DPDES(r5) + /* Make sure other cpus see vcore->dpdes set before dbell req clear */ + lwsync + /* Clear the pending doorbell request */ + li r0, 0 + stb r0, VCPU_DBELL_REQ(r4) /* * Required state: @@ -1206,6 +1223,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) stw r12,VCPU_TRAP(r9) + /* + * Now that we have saved away SRR0/1 and HSRR0/1, + * interrupts are recoverable in principle, so set MSR_RI. + * This becomes important for relocation-on interrupts from + * the guest, which we can get in radix mode on POWER9. + */ + li r0, MSR_RI + mtmsrd r0, 1 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING addi r3, r9, VCPU_TB_RMINTR mr r4, r9 @@ -1262,6 +1288,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) beq 4f b guest_exit_cont 3: + /* If it's a hypervisor facility unavailable interrupt, save HFSCR */ + cmpwi r12, BOOK3S_INTERRUPT_H_FAC_UNAVAIL + bne 14f + mfspr r3, SPRN_HFSCR + std r3, VCPU_HFSCR(r9) + b guest_exit_cont +14: /* External interrupt ? */ cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL bne+ guest_exit_cont @@ -1449,12 +1482,18 @@ mc_cont: mtspr SPRN_SPURR,r4 /* Save DEC */ + ld r3, HSTATE_KVM_VCORE(r13) mfspr r5,SPRN_DEC mftb r6 + /* On P9, if the guest has large decr enabled, don't sign extend */ +BEGIN_FTR_SECTION + ld r4, VCORE_LPCR(r3) + andis. r4, r4, LPCR_LD@h + bne 16f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) extsw r5,r5 - add r5,r5,r6 +16: add r5,r5,r6 /* r5 is a guest timebase value here, convert to host TB */ - ld r3,HSTATE_KVM_VCORE(r13) ld r4,VCORE_TB_OFFSET(r3) subf r5,r4,r5 std r5,VCPU_DEC_EXPIRES(r9) @@ -1499,17 +1538,19 @@ FTR_SECTION_ELSE rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */ rotldi r6, r6, 60 std r6, VCPU_PSSCR(r9) + /* Restore host HFSCR value */ + ld r7, STACK_SLOT_HFSCR(r1) + mtspr SPRN_HFSCR, r7 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) /* * Restore various registers to 0, where non-zero values * set by the guest could disrupt the host. */ li r0, 0 - mtspr SPRN_IAMR, r0 - mtspr SPRN_CIABR, r0 - mtspr SPRN_DAWRX, r0 + mtspr SPRN_PSPB, r0 mtspr SPRN_WORT, r0 BEGIN_FTR_SECTION + mtspr SPRN_IAMR, r0 mtspr SPRN_TCSCR, r0 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ li r0, 1 @@ -1525,6 +1566,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) std r6,VCPU_UAMOR(r9) li r6,0 mtspr SPRN_AMR,r6 + mtspr SPRN_UAMOR, r6 /* Switch DSCR back to host value */ mfspr r8, SPRN_DSCR @@ -1670,12 +1712,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Restore host values of some registers */ BEGIN_FTR_SECTION + ld r5, STACK_SLOT_CIABR(r1) + ld r6, STACK_SLOT_DAWR(r1) + ld r7, STACK_SLOT_DAWRX(r1) + mtspr SPRN_CIABR, r5 + mtspr SPRN_DAWR, r6 + mtspr SPRN_DAWRX, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) ld r7, STACK_SLOT_PID(r1) + ld r8, STACK_SLOT_IAMR(r1) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 mtspr SPRN_PID, r7 + mtspr SPRN_IAMR, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT @@ -1819,8 +1871,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) li r0, KVM_GUEST_MODE_NONE stb r0, HSTATE_IN_GUEST(r13) - ld r0, 112+PPC_LR_STKOFF(r1) - addi r1, r1, 112 + ld r0, SFS+PPC_LR_STKOFF(r1) + addi r1, r1, SFS mtlr r0 blr @@ -2366,12 +2418,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) mfspr r3, SPRN_DEC mfspr r4, SPRN_HDEC mftb r5 - cmpw r3, r4 +BEGIN_FTR_SECTION + /* On P9 check whether the guest has large decrementer mode enabled */ + ld r6, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_LPCR(r6) + andis. r6, r6, LPCR_LD@h + bne 68f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + extsw r3, r3 +68: EXTEND_HDEC(r4) + cmpd r3, r4 ble 67f mtspr SPRN_DEC, r4 67: /* save expiry time of guest decrementer */ - extsw r3, r3 add r3, r3, r5 ld r4, HSTATE_KVM_VCPU(r13) ld r5, HSTATE_KVM_VCORE(r13) @@ -2552,22 +2612,32 @@ machine_check_realmode: ld r9, HSTATE_KVM_VCPU(r13) li r12, BOOK3S_INTERRUPT_MACHINE_CHECK /* - * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through - * machine check interrupt (set HSRR0 to 0x200). And for handled - * errors (no-fatal), just go back to guest execution with current - * HSRR0 instead of exiting guest. This new approach will inject - * machine check to guest for fatal error causing guest to crash. - * - * The old code used to return to host for unhandled errors which - * was causing guest to hang with soft lockups inside guest and - * makes it difficult to recover guest instance. + * For the guest that is FWNMI capable, deliver all the MCE errors + * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit + * reason. This new approach injects machine check errors in guest + * address space to guest with additional information in the form + * of RTAS event, thus enabling guest kernel to suitably handle + * such errors. * + * For the guest that is not FWNMI capable (old QEMU) fallback + * to old behaviour for backward compatibility: + * Deliver unhandled/fatal (e.g. UE) MCE errors to guest either + * through machine check interrupt (set HSRR0 to 0x200). + * For handled errors (no-fatal), just go back to guest execution + * with current HSRR0. * if we receive machine check with MSR(RI=0) then deliver it to * guest as machine check causing guest to crash. */ ld r11, VCPU_MSR(r9) rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ bne mc_cont /* if so, exit to host */ + /* Check if guest is capable of handling NMI exit */ + ld r10, VCPU_KVM(r9) + lbz r10, KVM_FWNMI(r10) + cmpdi r10, 1 /* FWNMI capable? */ + beq mc_cont /* if so, exit with KVM_EXIT_NMI. */ + + /* if not, fall through for backward compatibility. */ andi. r10, r11, MSR_RI /* check for unrecoverable exception */ beq 1f /* Deliver a machine check to guest */ ld r10, VCPU_PC(r9) diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index c873ffe55362..4d8b4d6cebff 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -39,7 +39,7 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) unsigned long dec_nsec; unsigned long long dec_time; - pr_debug("mtDEC: %x\n", vcpu->arch.dec); + pr_debug("mtDEC: %lx\n", vcpu->arch.dec); hrtimer_try_to_cancel(&vcpu->arch.dec_timer); #ifdef CONFIG_PPC_BOOK3S @@ -109,7 +109,7 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_TBWU: break; case SPRN_DEC: - vcpu->arch.dec = spr_val; + vcpu->arch.dec = (u32) spr_val; kvmppc_emulate_dec(vcpu); break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c5aa3a42883a..1a75c0b5f4ca 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -553,13 +553,28 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_SMT: r = 0; - if (hv_enabled) { + if (kvm) { + if (kvm->arch.emul_smt_mode > 1) + r = kvm->arch.emul_smt_mode; + else + r = kvm->arch.smt_mode; + } else if (hv_enabled) { if (cpu_has_feature(CPU_FTR_ARCH_300)) r = 1; else r = threads_per_subcore; } break; + case KVM_CAP_PPC_SMT_POSSIBLE: + r = 1; + if (hv_enabled) { + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + r = ((threads_per_subcore << 1) - 1); + else + /* P9 can emulate dbells, so allow any mode */ + r = 8 | 4 | 2 | 1; + } + break; case KVM_CAP_PPC_RMA: r = 0; break; @@ -618,6 +633,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300); break; #endif +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + case KVM_CAP_PPC_FWNMI: + r = hv_enabled; + break; +#endif case KVM_CAP_PPC_HTM: r = cpu_has_feature(CPU_FTR_TM_COMP) && is_kvmppc_hv_enabled(kvm); @@ -1537,6 +1557,15 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, break; } #endif /* CONFIG_KVM_XICS */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + case KVM_CAP_PPC_FWNMI: + r = -EINVAL; + if (!is_kvmppc_hv_enabled(vcpu->kvm)) + break; + r = 0; + vcpu->kvm->arch.fwnmi_enabled = true; + break; +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ default: r = -EINVAL; break; @@ -1711,6 +1740,15 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = 0; break; } + case KVM_CAP_PPC_SMT: { + unsigned long mode = cap->args[0]; + unsigned long flags = cap->args[1]; + + r = -EINVAL; + if (kvm->arch.kvm_ops->set_smt_mode) + r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags); + break; + } #endif default: r = -EINVAL; |