diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 241 |
1 files changed, 172 insertions, 69 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd7a70be41b3..ea306adbbc13 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -702,8 +702,9 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | - X86_CR4_PAE | X86_CR4_SMEP; + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP; + if (cr4 & CR4_RESERVED_BITS) return 1; @@ -744,9 +745,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_SMAP) - update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -801,6 +799,17 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +static void kvm_update_dr0123(struct kvm_vcpu *vcpu) +{ + int i; + + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + for (i = 0; i < KVM_NR_DB_REGS; i++) + vcpu->arch.eff_db[i] = vcpu->arch.db[i]; + vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; + } +} + static void kvm_update_dr6(struct kvm_vcpu *vcpu) { if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) @@ -1070,19 +1079,19 @@ static void update_pvclock_gtod(struct timekeeper *tk) struct pvclock_gtod_data *vdata = &pvclock_gtod_data; u64 boot_ns; - boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot)); + boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->tkr.cycle_last; - vdata->clock.mask = tk->tkr.mask; - vdata->clock.mult = tk->tkr.mult; - vdata->clock.shift = tk->tkr.shift; + vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->tkr_mono.cycle_last; + vdata->clock.mask = tk->tkr_mono.mask; + vdata->clock.mult = tk->tkr_mono.mult; + vdata->clock.shift = tk->tkr_mono.shift; vdata->boot_ns = boot_ns; - vdata->nsec_base = tk->tkr.xtime_nsec; + vdata->nsec_base = tk->tkr_mono.xtime_nsec; write_seqcount_end(&vdata->seq); } @@ -1658,12 +1667,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) &guest_hv_clock, sizeof(guest_hv_clock)))) return 0; - /* - * The interface expects us to write an even number signaling that the - * update is finished. Since the guest won't see the intermediate - * state, we just increase by 2 at the end. + /* This VCPU is paused, but it's legal for a guest to read another + * VCPU's kvmclock, so we really have to follow the specification where + * it says that version is odd if data is being modified, and even after + * it is consistent. + * + * Version field updates must be kept separate. This is because + * kvm_write_guest_cached might use a "rep movs" instruction, and + * writes within a string instruction are weakly ordered. So there + * are three writes overall. + * + * As a small optimization, only write the version field in the first + * and third write. The vcpu->pv_time cache is still valid, because the + * version field is the first in the struct. */ - vcpu->hv_clock.version = guest_hv_clock.version + 2; + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + + vcpu->hv_clock.version = guest_hv_clock.version + 1; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); + + smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); @@ -1684,6 +1709,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kvm_write_guest_cached(v->kvm, &vcpu->pv_time, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); + + smp_wmb(); + + vcpu->hv_clock.version++; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); return 0; } @@ -2744,7 +2776,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_USER_NMI: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: - case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: case KVM_CAP_IOEVENTFD_NO_LENGTH: case KVM_CAP_PIT2: @@ -3150,6 +3181,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return -EINVAL; memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); + kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; kvm_update_dr6(vcpu); vcpu->arch.dr7 = dbgregs->dr7; @@ -4115,8 +4147,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, do { n = min(len, 8); if (!(vcpu->arch.apic && - !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) - && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v)) + && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v)) break; handled += n; addr += n; @@ -4135,8 +4167,9 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) do { n = min(len, 8); if (!(vcpu->arch.apic && - !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) - && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev, + addr, n, v)) + && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) break; trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); handled += n; @@ -4476,7 +4509,8 @@ mmio: return X86EMUL_CONTINUE; } -int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, +static int emulator_read_write(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, struct x86_exception *exception, const struct read_write_emulator_ops *ops) @@ -4539,7 +4573,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, exception, &read_emultor); } -int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, +static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, const void *val, unsigned int bytes, @@ -4630,10 +4664,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); return r; @@ -4706,7 +4740,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } -int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) +int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) { if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; @@ -4723,19 +4757,29 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) wbinvd(); return X86EMUL_CONTINUE; } + +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_emulate_wbinvd_noskip(vcpu); +} EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); + + static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) { - kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); + kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); } -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long *dest) { return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long value) { return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); @@ -5776,7 +5820,6 @@ int kvm_arch_init(void *opaque) kvm_set_mmio_spte_mask(); kvm_x86_ops = ops; - kvm_init_msr_list(); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -5817,7 +5860,7 @@ void kvm_arch_exit(void) free_percpu(shared_msrs); } -int kvm_emulate_halt(struct kvm_vcpu *vcpu) +int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; if (irqchip_in_kernel(vcpu->kvm)) { @@ -5828,6 +5871,13 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) return 0; } } +EXPORT_SYMBOL_GPL(kvm_vcpu_halt); + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_vcpu_halt(vcpu); +} EXPORT_SYMBOL_GPL(kvm_emulate_halt); int kvm_hv_hypercall(struct kvm_vcpu *vcpu) @@ -5904,7 +5954,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) lapic_irq.dest_id = apicid; lapic_irq.delivery_mode = APIC_DM_REMRD; - kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL); + kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) @@ -5912,6 +5962,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int op_64_bit, r = 1; + kvm_x86_ops->skip_emulated_instruction(vcpu); + if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); @@ -6143,6 +6195,8 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) return; page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) + return; kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); /* @@ -6165,7 +6219,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, } /* - * Returns 1 to let __vcpu_run() continue the guest execution loop without + * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the * userspace. */ @@ -6302,6 +6356,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); set_debugreg(vcpu->arch.dr6, 6); + vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } trace_kvm_entry(vcpu->vcpu_id); @@ -6383,42 +6438,47 @@ out: return r; } +static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) +{ + if (!kvm_arch_vcpu_runnable(vcpu)) { + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); + kvm_vcpu_block(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) + return 1; + } + + kvm_apic_accept_events(vcpu); + switch(vcpu->arch.mp_state) { + case KVM_MP_STATE_HALTED: + vcpu->arch.pv.pv_unhalted = false; + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + case KVM_MP_STATE_RUNNABLE: + vcpu->arch.apf.halted = false; + break; + case KVM_MP_STATE_INIT_RECEIVED: + break; + default: + return -EINTR; + break; + } + return 1; +} -static int __vcpu_run(struct kvm_vcpu *vcpu) +static int vcpu_run(struct kvm_vcpu *vcpu) { int r; struct kvm *kvm = vcpu->kvm; vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - r = 1; - while (r > 0) { + for (;;) { if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted) r = vcpu_enter_guest(vcpu); - else { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_block(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { - kvm_apic_accept_events(vcpu); - switch(vcpu->arch.mp_state) { - case KVM_MP_STATE_HALTED: - vcpu->arch.pv.pv_unhalted = false; - vcpu->arch.mp_state = - KVM_MP_STATE_RUNNABLE; - case KVM_MP_STATE_RUNNABLE: - vcpu->arch.apf.halted = false; - break; - case KVM_MP_STATE_INIT_RECEIVED: - break; - default: - r = -EINTR; - break; - } - } - } - + else + r = vcpu_block(kvm, vcpu); if (r <= 0) break; @@ -6430,6 +6490,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.request_irq_exits; + break; } kvm_check_async_pf_completion(vcpu); @@ -6438,6 +6499,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; + break; } if (need_resched()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); @@ -6569,7 +6631,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); - r = __vcpu_run(vcpu); + r = vcpu_run(vcpu); out: post_kvm_run_save(vcpu); @@ -6998,7 +7060,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) fpu_save_init(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + if (!vcpu->arch.eager_fpu) + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + trace_kvm_fpu(0); } @@ -7014,11 +7078,21 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { + struct kvm_vcpu *vcpu; + if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) printk_once(KERN_WARNING "kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - return kvm_x86_ops->vcpu_create(kvm, id); + + vcpu = kvm_x86_ops->vcpu_create(kvm, id); + + /* + * Activate fpu unconditionally in case the guest needs eager FPU. It will be + * deactivated soon if it doesn't. + */ + kvm_x86_ops->fpu_activate(vcpu); + return vcpu; } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) @@ -7076,11 +7150,14 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_clear_exception_queue(vcpu); memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); + kvm_update_dr0123(vcpu); vcpu->arch.dr6 = DR6_INIT; kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); + vcpu->arch.cr2 = 0; + kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; vcpu->arch.st.msr_val = 0; @@ -7210,7 +7287,14 @@ void kvm_arch_hardware_disable(void) int kvm_arch_hardware_setup(void) { - return kvm_x86_ops->hardware_setup(); + int r; + + r = kvm_x86_ops->hardware_setup(); + if (r != 0) + return r; + + kvm_init_msr_list(); + return 0; } void kvm_arch_hardware_unsetup(void) @@ -7241,7 +7325,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; - if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) + if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; @@ -7289,6 +7373,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = 0; vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; + vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); @@ -7429,7 +7515,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { - kvm_kvfree(free->arch.rmap[i]); + kvfree(free->arch.rmap[i]); free->arch.rmap[i] = NULL; } if (i == 0) @@ -7437,7 +7523,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, if (!dont || free->arch.lpage_info[i - 1] != dont->arch.lpage_info[i - 1]) { - kvm_kvfree(free->arch.lpage_info[i - 1]); + kvfree(free->arch.lpage_info[i - 1]); free->arch.lpage_info[i - 1] = NULL; } } @@ -7491,12 +7577,12 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, out_free: for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvm_kvfree(slot->arch.rmap[i]); + kvfree(slot->arch.rmap[i]); slot->arch.rmap[i] = NULL; if (i == 0) continue; - kvm_kvfree(slot->arch.lpage_info[i - 1]); + kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } return -ENOMEM; @@ -7619,6 +7705,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, new = id_to_memslot(kvm->memslots, mem->slot); /* + * Dirty logging tracks sptes in 4k granularity, meaning that large + * sptes have to be split. If live migration is successful, the guest + * in the source machine will be destroyed and large sptes will be + * created in the destination. However, if the guest continues to run + * in the source machine (for example if live migration fails), small + * sptes will remain around and cause bad performance. + * + * Scan sptes if dirty logging has been stopped, dropping those + * which can be collapsed into a single large-page spte. Later + * page faults will create the large-page sptes. + */ + if ((change != KVM_MR_DELETE) && + (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && + !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_zap_collapsible_sptes(kvm, new); + + /* * Set up write protection and/or dirty logging for the new slot. * * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have |