Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 638
1 file changed, 437 insertions(+), 201 deletions(-)
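
Most of the churn below wires up KVM asynchronous page faults: each vCPU keeps a small table of guest frame numbers (vcpu->arch.apf.gfns[]) for faults being serviced in the background. kvm_add_async_pf_gfn() and kvm_async_pf_gfn_slot() implement open addressing with linear probing over a power-of-two table, and kvm_del_async_pf_gfn() deletes without tombstones by shifting back any later entry whose probe path ran through the vacated slot. The following is a stand-alone, user-space sketch of that scheme, not the kernel code itself: the ~0 "empty" marker and the probe/delete logic follow the diff, while the table-size constant, the multiplicative hash (a stand-in for hash_32()/order_base_2()), the function names and main() are illustrative assumptions.

#include <stdint.h>
#include <stdio.h>

/* Table size: ASYNC_PF_PER_VCPU is assumed to be 64 here; any power of
 * two works for this model. ~0 marks an empty slot, as in the patch. */
#define APF_TABLE_SIZE	64
#define APF_EMPTY	(~0ULL)

static uint64_t gfns[APF_TABLE_SIZE];

/* Stand-in for hash_32(gfn, order_base_2(ASYNC_PF_PER_VCPU)). */
static uint32_t apf_hash(uint64_t gfn)
{
	return (uint32_t)(gfn * 2654435761u) & (APF_TABLE_SIZE - 1);
}

static uint32_t apf_next_probe(uint32_t key)
{
	return (key + 1) & (APF_TABLE_SIZE - 1);
}

/* Linear-probing insert; the caller guarantees the table is never full. */
static void apf_add_gfn(uint64_t gfn)
{
	uint32_t key = apf_hash(gfn);

	while (gfns[key] != APF_EMPTY)
		key = apf_next_probe(key);
	gfns[key] = gfn;
}

/* Returns the slot holding gfn, or an empty slot if it is not present. */
static uint32_t apf_gfn_slot(uint64_t gfn)
{
	uint32_t key = apf_hash(gfn);
	int i;

	for (i = 0; i < APF_TABLE_SIZE &&
		    gfns[key] != gfn && gfns[key] != APF_EMPTY; i++)
		key = apf_next_probe(key);
	return key;
}

/* Tombstone-free deletion: after emptying slot i, walk forward and pull
 * back the first entry whose home slot k does NOT lie cyclically in
 * (i, j] - i.e. an entry whose probe path passed through i and would
 * otherwise become unreachable. Stop at the first empty slot. */
static void apf_del_gfn(uint64_t gfn)
{
	uint32_t i, j, k;

	i = j = apf_gfn_slot(gfn);
	while (1) {
		gfns[i] = APF_EMPTY;
		do {
			j = apf_next_probe(j);
			if (gfns[j] == APF_EMPTY)
				return;
			k = apf_hash(gfns[j]);
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		gfns[i] = gfns[j];
		i = j;
	}
}

int main(void)
{
	uint64_t g;
	int n;

	for (n = 0; n < APF_TABLE_SIZE; n++)
		gfns[n] = APF_EMPTY;

	for (g = 0x1000; g < 0x1000 + 8; g++)	/* a few outstanding gfns */
		apf_add_gfn(g);
	apf_del_gfn(0x1003);

	printf("0x1004 still present: %d\n", gfns[apf_gfn_slot(0x1004)] == 0x1004);
	printf("0x1003 removed:       %d\n", gfns[apf_gfn_slot(0x1003)] != 0x1003);
	return 0;
}

The cyclic-interval test in apf_del_gfn() is what lets the table avoid tombstones: an entry is only pulled back if leaving the hole in place would break a later lookup of that entry.
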
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cdac9e592aa5..f1e4025f1ae2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/uaccess.h>
+#include <linux/hash.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -80,9 +81,10 @@
  * - enable LME and LMA per default on 64 bit KVM
  */
 #ifdef CONFIG_X86_64
-static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
+static
+u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
 #else
-static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
+static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
 #endif
 
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
@@ -155,9 +157,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 u64 __read_mostly host_xcr0;
 
-static inline u32 bit(int bitno)
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
-	return 1 << (bitno & 31);
+	int i;
+	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+		vcpu->arch.apf.gfns[i] = ~0;
 }
 
 static void kvm_on_user_return(struct user_return_notifier *urn)
@@ -331,29 +335,34 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 }
 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
 
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
-	unsigned error_code = vcpu->arch.fault.error_code;
+	if (err)
+		kvm_inject_gp(vcpu, 0);
+	else
+		kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
 	++vcpu->stat.pf_guest;
-	vcpu->arch.cr2 = vcpu->arch.fault.address;
-	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+	vcpu->arch.cr2 = fault->address;
+	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 }
 
-void kvm_propagate_fault(struct kvm_vcpu *vcpu)
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
-	if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
-		vcpu->arch.nested_mmu.inject_page_fault(vcpu);
+	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
+		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
 	else
-		vcpu->arch.mmu.inject_page_fault(vcpu);
-
-	vcpu->arch.fault.nested = false;
+		vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 }
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
+	kvm_make_request(KVM_REQ_NMI, vcpu);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	vcpu->arch.nmi_pending = 1;
 }
 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 
@@ -465,8 +474,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		return true;
 
-	gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
-	offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
+	gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
+	offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
 	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
 				       PFERR_USER_MASK | PFERR_WRITE_MASK);
 	if (r < 0)
@@ -511,12 +520,17 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 		} else
 #endif
 		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
-						 vcpu->arch.cr3))
+						 kvm_read_cr3(vcpu)))
 			return 1;
 	}
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 
+	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
+		kvm_clear_async_pf_completion_queue(vcpu);
+		kvm_async_pf_hash_reset(vcpu);
+	}
+
 	if ((cr0 ^ old_cr0) & update_bits)
 		kvm_mmu_reset_context(vcpu);
 	return 0;
@@ -600,7 +614,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned
long cr4) return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) + && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, + kvm_read_cr3(vcpu))) return 1; if (cr4 & X86_CR4_VMXE) @@ -620,7 +635,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { - if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { + if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); return 0; @@ -655,12 +670,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) return 1; vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); vcpu->arch.mmu.new_cr3(vcpu); return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); -int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; @@ -670,12 +686,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) vcpu->arch.cr8 = cr8; return 0; } - -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) -{ - if (__kvm_set_cr8(vcpu, cr8)) - kvm_inject_gp(vcpu, 0); -} EXPORT_SYMBOL_GPL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) @@ -780,12 +790,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 7 +#define KVM_SAVE_MSRS_BEGIN 8 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_APIC_ASSIST_PAGE, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -835,7 +845,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; - kvm_mmu_reset_context(vcpu); /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) @@ -981,7 +990,7 @@ static inline u64 nsec_to_cycles(u64 nsec) if (kvm_tsc_changes_freq()) printk_once(KERN_WARNING "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * __get_cpu_var(cpu_tsc_khz); + ret = nsec * __this_cpu_read(cpu_tsc_khz); do_div(ret, USEC_PER_SEC); return ret; } @@ -1011,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) unsigned long flags; s64 sdiff; - spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = data - native_read_tsc(); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -1044,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_offset = offset; kvm_x86_ops->write_tsc_offset(vcpu, offset); - spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; @@ -1066,7 +1075,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); kernel_ns = get_kernel_ns(); - this_tsc_khz = __get_cpu_var(cpu_tsc_khz); + this_tsc_khz = __this_cpu_read(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); @@ -1423,6 +1432,38 @@ static int set_msr_hyperv(struct 
kvm_vcpu *vcpu, u32 msr, u64 data) return 0; } +static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) +{ + gpa_t gpa = data & ~0x3f; + + /* Bits 2:5 are resrved, Should be zero */ + if (data & 0x3c) + return 1; + + vcpu->arch.apf.msr_val = data; + + if (!(data & KVM_ASYNC_PF_ENABLED)) { + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + return 0; + } + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa)) + return 1; + + vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + kvm_async_pf_wakeup_all(vcpu); + return 0; +} + +static void kvmclock_reset(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1480,10 +1521,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); vcpu->arch.time = data; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -1504,6 +1542,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } break; } + case MSR_KVM_ASYNC_PF_EN: + if (kvm_pv_enable_async_pf(vcpu, data)) + return 1; + break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: @@ -1558,6 +1600,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } else return set_msr_hyperv(vcpu, msr, data); break; + case MSR_IA32_BBL_CR_CTL3: + /* Drop writes to this legacy MSR -- see rdmsr + * counterpart for further detail. + */ + pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1780,6 +1828,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_SYSTEM_TIME_NEW: data = vcpu->arch.time; break; + case MSR_KVM_ASYNC_PF_EN: + data = vcpu->arch.apf.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -1809,6 +1860,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) } else return get_msr_hyperv(vcpu, msr, pdata); break; + case MSR_IA32_BBL_CR_CTL3: + /* This legacy MSR exists but isn't fully documented in current + * silicon. It is however accessed by winxp in very narrow + * scenarios where it sets bit #19, itself documented as + * a "reserved" bit. 
Best effort attempt to source coherent + * read data here should the balance of the register be + * interpreted by the guest: + * + * L2 cache control register 3: 64GB range, 256KB size, + * enabled, latency 0x1, configured + */ + data = 0xbe702111; + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -1909,6 +1973,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: case KVM_CAP_SYNC_MMU: + case KVM_CAP_USER_NMI: case KVM_CAP_REINJECT_CONTROL: case KVM_CAP_IRQ_INJECT_STATUS: case KVM_CAP_ASSIGN_DEV_IRQ: @@ -1927,6 +1992,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: + case KVM_CAP_ASYNC_PF: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2061,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (check_tsc_unstable()) { kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); vcpu->arch.tsc_catchup = 1; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -2190,6 +2256,11 @@ out: return r; } +static void cpuid_mask(u32 *word, int wordnum) +{ + *word &= boot_cpu_data.x86_capability[wordnum]; +} + static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index) { @@ -2264,7 +2335,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; case 1: entry->edx &= kvm_supported_word0_x86_features; + cpuid_mask(&entry->edx, 0); entry->ecx &= kvm_supported_word4_x86_features; + cpuid_mask(&entry->ecx, 4); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ entry->ecx |= F(X2APIC); @@ -2355,7 +2428,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; case 0x80000001: entry->edx &= kvm_supported_word1_x86_features; + cpuid_mask(&entry->edx, 1); entry->ecx &= kvm_supported_word6_x86_features; + cpuid_mask(&entry->ecx, 6); break; } @@ -2527,9 +2602,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { - printk(KERN_DEBUG "kvm: set_mce: " - "injects mce exception while " - "previous one is in progress!\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } @@ -2600,8 +2672,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; - if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) kvm_x86_ops->set_interrupt_shadow(vcpu, events->interrupt.shadow); @@ -3174,20 +3244,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memslots *slots, *old_slots; unsigned long *dirty_bitmap; - r = -ENOMEM; - dirty_bitmap = vmalloc(n); - if (!dirty_bitmap) - goto out; + dirty_bitmap = memslot->dirty_bitmap_head; + if (memslot->dirty_bitmap == dirty_bitmap) + dirty_bitmap += n / sizeof(long); memset(dirty_bitmap, 0, n); r = -ENOMEM; slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!slots) { - vfree(dirty_bitmap); + if (!slots) goto out; - } memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + slots->generation++; 
old_slots = kvm->memslots; rcu_assign_pointer(kvm->memslots, slots); @@ -3200,11 +3268,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { - vfree(dirty_bitmap); + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) goto out; - } - vfree(dirty_bitmap); } else { r = -EFAULT; if (clear_user(log->dirty_bitmap, n)) @@ -3271,8 +3336,10 @@ long kvm_arch_vm_ioctl(struct file *filp, if (vpic) { r = kvm_ioapic_init(kvm); if (r) { + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + mutex_unlock(&kvm->slots_lock); kfree(vpic); goto create_irqchip_unlock; } @@ -3283,10 +3350,12 @@ long kvm_arch_vm_ioctl(struct file *filp, smp_wmb(); r = kvm_setup_default_irq_routing(kvm); if (r) { + mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); kvm_destroy_pic(kvm); mutex_unlock(&kvm->irq_lock); + mutex_unlock(&kvm->slots_lock); } create_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -3562,63 +3631,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) { gpa_t t_gpa; - u32 error; + struct x86_exception exception; BUG_ON(!mmu_is_nested(vcpu)); /* NPT walks are always user-walks */ access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); - if (t_gpa == UNMAPPED_GVA) - vcpu->arch.fault.nested = true; + t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception); return t_gpa; } -gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } - gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } -gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } /* uses this to access any guest's mapped memory without checking CPL */ -gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, + struct x86_exception *exception) { - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, u32 access, - u32 *error) + struct x86_exception *exception) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, - error); + exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) { - r = X86EMUL_PROPAGATE_FAULT; - goto out; - } + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); if (ret < 0) { r = X86EMUL_IO_NEEDED; @@ -3635,31 +3704,35 @@ out: /* used for instruction fetching */ static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 *error) + struct kvm_vcpu *vcpu, + struct x86_exception *exception) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, - access | PFERR_FETCH_MASK, error); + access | PFERR_FETCH_MASK, + exception); } static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 *error) + struct kvm_vcpu *vcpu, + struct x86_exception *exception) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, - error); + exception); } static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 *error) + struct kvm_vcpu *vcpu, + struct x86_exception *exception) { - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } static int kvm_write_guest_virt_system(gva_t addr, void *val, unsigned int bytes, struct kvm_vcpu *vcpu, - u32 *error) + struct x86_exception *exception) { void *data = val; int r = X86EMUL_CONTINUE; @@ -3667,15 +3740,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, while (bytes) { gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, PFERR_WRITE_MASK, - error); + exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) { - r = X86EMUL_PROPAGATE_FAULT; - goto out; - } + if (gpa == UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); if (ret < 0) { r = X86EMUL_IO_NEEDED; @@ -3693,7 +3764,7 @@ out: static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, - unsigned int *error_code, + struct x86_exception *exception, struct kvm_vcpu *vcpu) { gpa_t gpa; @@ -3706,7 +3777,7 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception); if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; @@ -3715,8 +3786,8 @@ static int emulator_read_emulated(unsigned long addr, if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) - == X86EMUL_CONTINUE) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) + == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; mmio: @@ -3740,7 +3811,7 @@ mmio: } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) + const void *val, int bytes) { int ret; @@ -3754,12 +3825,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, - unsigned int *error_code, + struct x86_exception *exception, struct kvm_vcpu *vcpu) { gpa_t gpa; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; @@ -3792,7 +3863,7 @@ mmio: int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, - unsigned int *error_code, + struct x86_exception *exception, struct kvm_vcpu *vcpu) { /* Crossing a page boundary? 
*/ @@ -3800,7 +3871,7 @@ int emulator_write_emulated(unsigned long addr, int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, error_code, + rc = emulator_write_emulated_onepage(addr, val, now, exception, vcpu); if (rc != X86EMUL_CONTINUE) return rc; @@ -3808,7 +3879,7 @@ int emulator_write_emulated(unsigned long addr, val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, error_code, + return emulator_write_emulated_onepage(addr, val, bytes, exception, vcpu); } @@ -3826,7 +3897,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, - unsigned int *error_code, + struct x86_exception *exception, struct kvm_vcpu *vcpu) { gpa_t gpa; @@ -3884,7 +3955,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(addr, new, bytes, error_code, vcpu); + return emulator_write_emulated(addr, new, bytes, exception, vcpu); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -3909,7 +3980,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, if (vcpu->arch.pio.count) goto data_avail; - trace_kvm_pio(0, port, size, 1); + trace_kvm_pio(0, port, size, count); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 1; @@ -3937,7 +4008,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port, const void *val, unsigned int count, struct kvm_vcpu *vcpu) { - trace_kvm_pio(1, port, size, 1); + trace_kvm_pio(1, port, size, count); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 0; @@ -3978,13 +4049,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) return X86EMUL_CONTINUE; if (kvm_x86_ops->has_wbinvd_exit()) { - preempt_disable(); + int cpu = get_cpu(); + + cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); - preempt_enable(); + put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); - } - wbinvd(); + } else + wbinvd(); return X86EMUL_CONTINUE; } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); @@ -4024,7 +4097,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) value = vcpu->arch.cr2; break; case 3: - value = vcpu->arch.cr3; + value = kvm_read_cr3(vcpu); break; case 4: value = kvm_read_cr4(vcpu); @@ -4058,7 +4131,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: - res = __kvm_set_cr8(vcpu, val & 0xfUL); + res = kvm_set_cr8(vcpu, val); break; default: vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); @@ -4089,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg, return get_segment_base(vcpu, seg); } -static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, + int seg, struct kvm_vcpu *vcpu) { struct kvm_segment var; @@ -4103,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, var.limit >>= 12; set_desc_limit(desc, var.limit); set_desc_base(desc, (unsigned long)var.base); +#ifdef CONFIG_X86_64 + if (base3) + *base3 = var.base >> 32; +#endif desc->type = var.type; desc->s = var.s; desc->dpl = var.dpl; @@ -4115,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, return true; } -static void 
emulator_set_cached_descriptor(struct desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, + int seg, struct kvm_vcpu *vcpu) { struct kvm_segment var; @@ -4124,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, kvm_get_segment(vcpu, &var, seg); var.base = get_desc_base(desc); +#ifdef CONFIG_X86_64 + var.base |= ((u64)base3) << 32; +#endif var.limit = get_desc_limit(desc); if (desc->g) var.limit = (var.limit << 12) | 0xfff; @@ -4211,12 +4291,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - if (ctxt->exception == PF_VECTOR) - kvm_propagate_fault(vcpu); - else if (ctxt->error_code_valid) - kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); + if (ctxt->exception.vector == PF_VECTOR) + kvm_propagate_fault(vcpu, &ctxt->exception); + else if (ctxt->exception.error_code_valid) + kvm_queue_exception_e(vcpu, ctxt->exception.vector, + ctxt->exception.error_code); else - kvm_queue_exception(vcpu, ctxt->exception); + kvm_queue_exception(vcpu, ctxt->exception.vector); } static void init_emulate_ctxt(struct kvm_vcpu *vcpu) @@ -4272,13 +4353,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); static int handle_emulation_failure(struct kvm_vcpu *vcpu) { + int r = EMULATE_DONE; + ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + if (!is_guest_mode(vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + r = EMULATE_FAIL; + } kvm_queue_exception(vcpu, UD_VECTOR); - return EMULATE_FAIL; + + return r; } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) @@ -4307,10 +4394,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) return false; } -int emulate_instruction(struct kvm_vcpu *vcpu, - unsigned long cr2, - u16 error_code, - int emulation_type) +int x86_emulate_instruction(struct kvm_vcpu *vcpu, + unsigned long cr2, + int emulation_type, + void *insn, + int insn_len) { int r; struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; @@ -4328,44 +4416,19 @@ int emulate_instruction(struct kvm_vcpu *vcpu, if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); vcpu->arch.emulate_ctxt.interruptibility = 0; - vcpu->arch.emulate_ctxt.exception = -1; + vcpu->arch.emulate_ctxt.have_exception = false; vcpu->arch.emulate_ctxt.perm_ok = false; - r = x86_decode_insn(&vcpu->arch.emulate_ctxt); - if (r == X86EMUL_PROPAGATE_FAULT) - goto done; - - trace_kvm_emulate_insn_start(vcpu); - - /* Only allow emulation of specific instructions on #UD - * (namely VMMCALL, sysenter, sysexit, syscall)*/ - if (emulation_type & EMULTYPE_TRAP_UD) { - if (!c->twobyte) - return EMULATE_FAIL; - switch (c->b) { - case 0x01: /* VMMCALL */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - return EMULATE_FAIL; - break; - case 0x34: /* sysenter */ - case 0x35: /* sysexit */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return EMULATE_FAIL; - break; - case 0x05: /* syscall */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return EMULATE_FAIL; - break; - default: - return EMULATE_FAIL; - } + vcpu->arch.emulate_ctxt.only_vendor_specific_insn + = 
emulation_type & EMULTYPE_TRAP_UD; - if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) - return EMULATE_FAIL; - } + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); + trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; if (r) { + if (emulation_type & EMULTYPE_TRAP_UD) + return EMULATE_FAIL; if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) @@ -4393,8 +4456,7 @@ restart: return handle_emulation_failure(vcpu); } -done: - if (vcpu->arch.emulate_ctxt.exception >= 0) { + if (vcpu->arch.emulate_ctxt.have_exception) { inject_emulated_exception(vcpu); r = EMULATE_DONE; } else if (vcpu->arch.pio.count) { @@ -4418,7 +4480,7 @@ done: return r; } -EXPORT_SYMBOL_GPL(emulate_instruction); +EXPORT_SYMBOL_GPL(x86_emulate_instruction); int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { @@ -4432,7 +4494,7 @@ EXPORT_SYMBOL_GPL(kvm_fast_pio_out); static void tsc_bad(void *info) { - __get_cpu_var(cpu_tsc_khz) = 0; + __this_cpu_write(cpu_tsc_khz, 0); } static void tsc_khz_changed(void *data) @@ -4446,7 +4508,7 @@ static void tsc_khz_changed(void *data) khz = cpufreq_quick_get(raw_smp_processor_id()); if (!khz) khz = tsc_khz; - __get_cpu_var(cpu_tsc_khz) = khz; + __this_cpu_write(cpu_tsc_khz, khz); } static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -4503,7 +4565,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) @@ -4513,7 +4575,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va send_ipi = 1; } } - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -4569,9 +4631,11 @@ static void kvm_timer_init(void) #ifdef CONFIG_CPU_FREQ struct cpufreq_policy policy; memset(&policy, 0, sizeof(policy)); - cpufreq_get_policy(&policy, get_cpu()); + cpu = get_cpu(); + cpufreq_get_policy(&policy, cpu); if (policy.cpuinfo.max_freq) max_tsc_khz = policy.cpuinfo.max_freq; + put_cpu(); #endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); @@ -4656,7 +4720,6 @@ int kvm_arch_init(void *opaque) kvm_x86_ops = ops; kvm_mmu_set_nonpresent_ptes(0ull, 0ull); - kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -5119,6 +5182,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->fpu_active = 0; kvm_x86_ops->fpu_deactivate(vcpu); } + if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { + /* Page is swapped out. Do synthetic halt */ + vcpu->arch.apf.halted = true; + r = 1; + goto out; + } + if (kvm_check_request(KVM_REQ_NMI, vcpu)) + vcpu->arch.nmi_pending = true; } r = kvm_mmu_reload(vcpu); @@ -5147,14 +5218,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_guest_fpu(vcpu); kvm_load_guest_xcr0(vcpu); - atomic_set(&vcpu->guest_mode, 1); - smp_wmb(); + vcpu->mode = IN_GUEST_MODE; + + /* We should set ->mode before check ->requests, + * see the comment in make_all_cpus_request. 
+ */ + smp_mb(); local_irq_disable(); - if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests || need_resched() || signal_pending(current)) { - atomic_set(&vcpu->guest_mode, 0); + vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); preempt_enable(); @@ -5190,7 +5265,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); - atomic_set(&vcpu->guest_mode, 0); + vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); @@ -5247,7 +5322,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) r = 1; while (r > 0) { - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted) r = vcpu_enter_guest(vcpu); else { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); @@ -5260,6 +5336,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; case KVM_MP_STATE_RUNNABLE: + vcpu->arch.apf.halted = false; break; case KVM_MP_STATE_SIPI_RECEIVED: default: @@ -5281,6 +5358,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.request_irq_exits; } + + kvm_check_async_pf_completion(vcpu); + if (signal_pending(current)) { r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; @@ -5305,6 +5385,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; + if (!tsk_used_math(current) && init_fpu(current)) + return -ENOMEM; + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -5316,8 +5399,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) - kvm_set_cr8(vcpu, kvm_run->cr8); + if (!irqchip_in_kernel(vcpu->kvm)) { + if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { + r = -EINVAL; + goto out; + } + } if (vcpu->arch.pio.count || vcpu->mmio_needed) { if (vcpu->mmio_needed) { @@ -5326,7 +5413,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->mmio_needed = 0; } vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); + r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r != EMULATE_DONE) { r = 0; @@ -5439,7 +5526,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; - sregs->cr3 = vcpu->arch.cr3; + sregs->cr3 = kvm_read_cr3(vcpu); sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; @@ -5496,7 +5583,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int pending_vec, max_bits; + int pending_vec, max_bits, idx; struct desc_ptr dt; dt.size = sregs->idt.limit; @@ -5507,8 +5594,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->set_gdt(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; + mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); kvm_set_cr8(vcpu, sregs->cr8); @@ -5522,10 +5610,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); + if (sregs->cr4 & X86_CR4_OSXSAVE) + update_cpuid(vcpu); + + idx = 
srcu_read_lock(&vcpu->kvm->srcu); if (!is_long_mode(vcpu) && is_pae(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } + srcu_read_unlock(&vcpu->kvm->srcu, idx); if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); @@ -5536,8 +5629,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (pending_vec < max_bits) { kvm_queue_interrupt(vcpu, pending_vec, false); pr_debug("Set back pending irq %d\n", pending_vec); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); @@ -5733,10 +5824,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fx_free(vcpu); @@ -5774,6 +5862,8 @@ free_vcpu: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + vcpu->arch.apf.msr_val = 0; + vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); @@ -5793,6 +5883,13 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.dr7 = DR7_FIXED_1; kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu->arch.apf.msr_val = 0; + + kvmclock_reset(vcpu); + + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + vcpu->arch.apf.halted = false; return kvm_x86_ops->vcpu_reset(vcpu); } @@ -5882,6 +5979,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) goto fail_free_mce_banks; + kvm_async_pf_hash_reset(vcpu); + return 0; fail_free_mce_banks: kfree(vcpu->arch.mce_banks); @@ -5907,22 +6006,17 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } -struct kvm *kvm_arch_create_vm(void) +int kvm_arch_init_vm(struct kvm *kvm) { - struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); - - if (!kvm) - return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - spin_lock_init(&kvm->arch.tsc_write_lock); + raw_spin_lock_init(&kvm->arch.tsc_write_lock); - return kvm; + return 0; } static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) @@ -5940,8 +6034,10 @@ static void kvm_free_vcpus(struct kvm *kvm) /* * Unpin any mmu pages first. 
*/ - kvm_for_each_vcpu(i, vcpu, kvm) + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_clear_async_pf_completion_queue(vcpu); kvm_unload_vcpu_mmu(vcpu); + } kvm_for_each_vcpu(i, vcpu, kvm) kvm_arch_vcpu_free(vcpu); @@ -5965,13 +6061,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - kvm_free_physmem(kvm); if (kvm->arch.apic_access_page) put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); - cleanup_srcu_struct(&kvm->srcu); - kfree(kvm); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -6019,7 +6112,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; + int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; if (!user_alloc && !old.user_alloc && old.rmap && !npages) { int ret; @@ -6034,12 +6127,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, "failed to munmap memory\n"); } + if (!kvm->arch.n_requested_mmu_pages) + nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + spin_lock(&kvm->mmu_lock); - if (!kvm->arch.n_requested_mmu_pages) { - unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - } - kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); } @@ -6052,7 +6145,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && + !vcpu->arch.apf.halted) + || !list_empty_careful(&vcpu->async_pf.done) || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED || vcpu->arch.nmi_pending || (kvm_arch_interrupt_allowed(vcpu) && @@ -6071,7 +6166,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (atomic_xchg(&vcpu->guest_mode, 0)) + if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) smp_send_reschedule(cpu); put_cpu(); } @@ -6111,6 +6206,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) } EXPORT_SYMBOL_GPL(kvm_set_rflags); +void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) +{ + int r; + + if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || + is_error_page(work->page)) + return; + + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + return; + + if (!vcpu->arch.mmu.direct_map && + work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) + return; + + vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true); +} + +static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) +{ + return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); +} + +static inline u32 kvm_async_pf_next_probe(u32 key) +{ + return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); +} + +static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u32 key = kvm_async_pf_hash_fn(gfn); + + while (vcpu->arch.apf.gfns[key] != ~0) + key = kvm_async_pf_next_probe(key); + + vcpu->arch.apf.gfns[key] = gfn; +} + +static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + int i; + u32 key = kvm_async_pf_hash_fn(gfn); + + for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && + (vcpu->arch.apf.gfns[key] != gfn && + vcpu->arch.apf.gfns[key] != ~0); i++) + key = kvm_async_pf_next_probe(key); + + return key; +} + +bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, 
gfn)] == gfn; +} + +static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u32 i, j, k; + + i = j = kvm_async_pf_gfn_slot(vcpu, gfn); + while (true) { + vcpu->arch.apf.gfns[i] = ~0; + do { + j = kvm_async_pf_next_probe(j); + if (vcpu->arch.apf.gfns[j] == ~0) + return; + k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]); + /* + * k lies cyclically in ]i,j] + * | i.k.j | + * |....j i.k.| or |.k..j i...| + */ + } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j)); + vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j]; + i = j; + } +} + +static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) +{ + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, + sizeof(val)); +} + +void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + struct x86_exception fault; + + trace_kvm_async_pf_not_present(work->arch.token, work->gva); + kvm_add_async_pf_gfn(vcpu, work->arch.gfn); + + if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || + (vcpu->arch.apf.send_user_only && + kvm_x86_ops->get_cpl(vcpu) == 0)) + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = 0; + fault.nested_page_fault = false; + fault.address = work->arch.token; + kvm_inject_page_fault(vcpu, &fault); + } +} + +void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, + struct kvm_async_pf *work) +{ + struct x86_exception fault; + + trace_kvm_async_pf_ready(work->arch.token, work->gva); + if (is_error_page(work->page)) + work->arch.token = ~0; /* broadcast wakeup */ + else + kvm_del_async_pf_gfn(vcpu, work->arch.gfn); + + if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && + !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = 0; + fault.nested_page_fault = false; + fault.address = work->arch.token; + kvm_inject_page_fault(vcpu, &fault); + } + vcpu->arch.apf.halted = false; +} + +bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) + return true; + else + return !kvm_event_needs_reinjection(vcpu) && + kvm_x86_ops->interrupt_allowed(vcpu); +} + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); |
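
As a companion to kvm_pv_enable_async_pf() above: the guest enables the machinery by writing MSR_KVM_ASYNC_PF_EN with the 64-byte-aligned guest physical address of a shared reason word (data & ~0x3f), an enable flag, and a send-always flag whose absence makes delivery user-mode only (the send_user_only assignment); bits 2-5 are reserved, and a write with any of them set is refused (the data & 0x3c check). A minimal sketch of composing and sanity-checking such a value follows; the flag bit positions (bit 0 enable, bit 1 send-always) follow the era's kvm_para.h and are an assumption here, since that header is not part of this diff.

#include <stdint.h>
#include <stdio.h>

/* Flag bits as assumed from kvm_para.h of this era (not shown in this
 * diff): bit 0 enables async PF, bit 1 asks for delivery even while the
 * guest runs in kernel mode. */
#define KVM_ASYNC_PF_ENABLED		(1ULL << 0)
#define KVM_ASYNC_PF_SEND_ALWAYS	(1ULL << 1)

/* Derived from kvm_pv_enable_async_pf() above. */
#define ASYNC_PF_RESERVED_BITS		0x3cULL		/* bits 2-5 must be 0 */
#define ASYNC_PF_GPA_MASK		(~0x3fULL)	/* 64-byte aligned GPA */

/*
 * Compose the value a guest would write to MSR_KVM_ASYNC_PF_EN for a
 * given reason-word GPA. Returns -1 if the GPA is not 64-byte aligned;
 * the low six bits of the MSR carry flags and reserved bits, not
 * address bits.
 */
static int async_pf_msr_value(uint64_t reason_gpa, int send_always,
			      uint64_t *val)
{
	if (reason_gpa & ~ASYNC_PF_GPA_MASK)
		return -1;

	*val = reason_gpa | KVM_ASYNC_PF_ENABLED;
	if (send_always)
		*val |= KVM_ASYNC_PF_SEND_ALWAYS;
	return 0;
}

/* Host-side view, mirroring the reserved-bits check in the patch. */
static int host_rejects(uint64_t data)
{
	return (data & ASYNC_PF_RESERVED_BITS) != 0;
}

int main(void)
{
	uint64_t val;

	if (async_pf_msr_value(0x12340ULL, 1, &val) == 0)
		printf("wrmsr value %#llx, host rejects: %d\n",
		       (unsigned long long)val, host_rejects(val));
	if (async_pf_msr_value(0x12344ULL, 1, &val) != 0)
		printf("0x12344 is not 64-byte aligned, not enabling\n");
	return 0;
}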