diff options
Diffstat (limited to 'arch/x86/kvm/vmx.c')
-rw-r--r-- | arch/x86/kvm/vmx.c | 300 |
1 files changed, 200 insertions, 100 deletions
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1d26f3c4985b..e665aa7167cf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -121,7 +121,6 @@ module_param_named(pml, enable_pml, bool, S_IRUGO); #define MSR_BITMAP_MODE_X2APIC 1 #define MSR_BITMAP_MODE_X2APIC_APICV 2 -#define MSR_BITMAP_MODE_LM 4 #define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL @@ -397,6 +396,7 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; + bool hv_timer_armed; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; @@ -856,6 +856,7 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u64 vmcs01_guest_bndcfgs; u16 vpid02; u16 last_vpid; @@ -1019,6 +1020,8 @@ struct vcpu_vmx { int ple_window; bool ple_window_dirty; + bool req_immediate_exit; + /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; @@ -1569,8 +1572,12 @@ static int vmx_hv_remote_flush_tlb(struct kvm *kvm) goto out; } + /* + * FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address of the + * base of EPT PML4 table, strip off EPT configuration information. + */ ret = hyperv_flush_guest_mapping( - to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer & PAGE_MASK); out: spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); @@ -2864,6 +2871,8 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; + vmx->req_immediate_exit = false; + if (vmx->loaded_cpu_state) return; @@ -2894,8 +2903,7 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } - if (is_long_mode(&vmx->vcpu)) - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #else savesegment(fs, fs_sel); savesegment(gs, gs_sel); @@ -2946,8 +2954,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) vmx->loaded_cpu_state = NULL; #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif if (host_state->ldt_sel || (host_state->gs_sel & 7)) { kvm_load_ldt(host_state->ldt_sel); @@ -2975,24 +2982,19 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { - if (is_long_mode(&vmx->vcpu)) { - preempt_disable(); - if (vmx->loaded_cpu_state) - rdmsrl(MSR_KERNEL_GS_BASE, - vmx->msr_guest_kernel_gs_base); - preempt_enable(); - } + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + preempt_enable(); return vmx->msr_guest_kernel_gs_base; } static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) { - if (is_long_mode(&vmx->vcpu)) { - preempt_disable(); - if (vmx->loaded_cpu_state) - wrmsrl(MSR_KERNEL_GS_BASE, data); - preempt_enable(); - } + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); vmx->msr_guest_kernel_gs_base = data; } #endif @@ -3528,9 +3530,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; - if (kvm_mpx_supported()) - msrs->exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; - /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; @@ -3547,8 +3546,6 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) VM_ENTRY_LOAD_IA32_PAT; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); - if (kvm_mpx_supported()) - msrs->entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; @@ -3596,12 +3593,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) msrs->secondary_ctls_high); msrs->secondary_ctls_low = 0; msrs->secondary_ctls_high &= - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING; + /* * We can emulate "VMCS shadowing," even if the hardware * doesn't support it. @@ -3658,6 +3655,10 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) msrs->secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (flexpriority_enabled) + msrs->secondary_ctls_high |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + /* miscellaneous data */ rdmsr(MSR_IA32_VMX_MISC, msrs->misc_low, @@ -5068,19 +5069,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (!msr) return; - /* - * MSR_KERNEL_GS_BASE is not intercepted when the guest is in - * 64-bit mode as a 64-bit kernel may frequently access the - * MSR. This means we need to manually save/restore the MSR - * when switching between guest and host state, but only if - * the guest is in 64-bit mode. Sync our cached value if the - * guest is transitioning to 32-bit mode and the CPU contains - * guest state, i.e. the cache is stale. - */ -#ifdef CONFIG_X86_64 - if (!(efer & EFER_LMA)) - (void)vmx_read_guest_kernel_gs_base(vmx); -#endif vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); @@ -5393,9 +5381,10 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * To use VMXON (and later other VMX instructions), a guest * must first be able to turn on cr4.VMXE (see handle_vmon()). * So basically the check on whether to allow nested VMX - * is here. + * is here. We operate under the default treatment of SMM, + * so VMX cannot be enabled under SMM. */ - if (!nested_vmx_allowed(vcpu)) + if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) return 1; } @@ -6072,9 +6061,6 @@ static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) mode |= MSR_BITMAP_MODE_X2APIC_APICV; } - if (is_long_mode(vcpu)) - mode |= MSR_BITMAP_MODE_LM; - return mode; } @@ -6115,9 +6101,6 @@ static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) if (!changed) return; - vmx_set_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW, - !(mode & MSR_BITMAP_MODE_LM)); - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) vmx_update_msr_bitmap_x2apic(msr_bitmap, mode); @@ -6183,6 +6166,32 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) nested_mark_vmcs12_pages_dirty(vcpu); } +static u8 vmx_get_rvi(void) +{ + return vmcs_read16(GUEST_INTR_STATUS) & 0xff; +} + +static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic_page; + u32 vppr; + int rvi; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || + !nested_cpu_has_vid(get_vmcs12(vcpu)) || + WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) + return false; + + rvi = vmx_get_rvi(); + + vapic_page = kmap(vmx->nested.virtual_apic_page); + vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); + kunmap(vmx->nested.virtual_apic_page); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, bool nested) { @@ -6983,7 +6992,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { - if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) { if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; return kvm_vcpu_halt(vcpu); @@ -7054,7 +7063,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) { WARN_ON_ONCE(!enable_vmware_backdoor); - er = emulate_instruction(vcpu, + er = kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL); if (er == EMULATE_USER_EXIT) return 0; @@ -7157,7 +7166,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string) - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -7231,7 +7240,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) static int handle_desc(struct kvm_vcpu *vcpu) { WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_cr(struct kvm_vcpu *vcpu) @@ -7480,7 +7489,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu) static int handle_invd(struct kvm_vcpu *vcpu) { - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_invlpg(struct kvm_vcpu *vcpu) @@ -7547,7 +7556,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } } - return emulate_instruction(vcpu, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE; } static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) @@ -7704,8 +7713,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) return kvm_skip_emulated_instruction(vcpu); else - return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP, - NULL, 0) == EMULATE_DONE; + return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) == + EMULATE_DONE; } return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); @@ -7748,7 +7757,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; - err = emulate_instruction(vcpu, 0); + err = kvm_emulate_instruction(vcpu, 0); if (err == EMULATE_USER_EXIT) { ++vcpu->stat.mmio_exits; @@ -7966,6 +7975,9 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + if (!cpu_has_vmx_preemption_timer()) + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { u64 vmx_msr; @@ -9208,7 +9220,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - kvm_lapic_expired_hv_timer(vcpu); + if (!to_vmx(vcpu)->req_immediate_exit) + kvm_lapic_expired_hv_timer(vcpu); return 1; } @@ -10214,15 +10227,16 @@ static void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; + if (!flexpriority_enabled && + !cpu_has_vmx_virtualize_x2apic_mode()) + return; + /* Postpone execution until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.change_vmcs01_virtual_apic_mode = true; return; } - if (!cpu_need_tpr_shadow(vcpu)) - return; - sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); @@ -10344,6 +10358,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) return max_irr; } +static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) +{ + u8 rvi = vmx_get_rvi(); + u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { if (!kvm_vcpu_apicv_active(vcpu)) @@ -10595,24 +10617,43 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); + if (!vmx->loaded_vmcs->hv_timer_armed) + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = true; +} + +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; - if (vmx->hv_deadline_tsc == -1) + if (vmx->req_immediate_exit) { + vmx_arm_hv_timer(vmx, 0); return; + } - tscl = rdtsc(); - if (vmx->hv_deadline_tsc > tscl) - /* sure to be 32 bit only because checked on set_hv_timer */ - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> - cpu_preemption_timer_multi); - else - delta_tsc = 0; + if (vmx->hv_deadline_tsc != -1) { + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* set_hv_timer ensures the delta fits in 32-bits */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); + vmx_arm_hv_timer(vmx, delta_tsc); + return; + } + + if (vmx->loaded_vmcs->hv_timer_armed) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = false; } static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) @@ -10672,7 +10713,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); - vmx_arm_hv_timer(vcpu); + vmx_update_hv_timer(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -11214,6 +11255,23 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) #undef cr4_fixed1_update } +static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (kvm_mpx_supported()) { + bool mpx_enabled = guest_cpuid_has(vcpu, X86_FEATURE_MPX); + + if (mpx_enabled) { + vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + } else { + vmx->nested.msrs.entry_ctls_high &= ~VM_ENTRY_LOAD_BNDCFGS; + vmx->nested.msrs.exit_ctls_high &= ~VM_EXIT_CLEAR_BNDCFGS; + } + } +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -11230,8 +11288,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - if (nested_vmx_allowed(vcpu)) + if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); + nested_vmx_entry_exit_ctls_update(vcpu); + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -11427,16 +11487,18 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vcpu->arch.virtual_tsc_khz == 0) - return; - - /* Make sure short timeouts reliably trigger an immediate vmexit. - * hrtimer_start does not guarantee this. */ - if (preemption_timeout <= 1) { + /* + * A timer value of zero is architecturally guaranteed to cause + * a VMExit prior to executing any instructions in the guest. + */ + if (preemption_timeout == 0) { vmx_preemption_timer_fn(&vmx->nested.preemption_timer); return; } + if (vcpu->arch.virtual_tsc_khz == 0) + return; + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; preemption_timeout *= 1000000; do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); @@ -11646,11 +11708,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, * bits 15:8 should be zero in posted_intr_nv, * the descriptor address has been already checked * in nested_get_vmcs12_pages. + * + * bits 5:0 of posted_intr_desc_addr should be zero. */ if (nested_cpu_has_posted_intr(vmcs12) && (!nested_cpu_has_vid(vmcs12) || !nested_exit_intr_ack_set(vcpu) || - vmcs12->posted_intr_nv & 0xff00)) + (vmcs12->posted_intr_nv & 0xff00) || + (vmcs12->posted_intr_desc_addr & 0x3f) || + (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ @@ -11993,8 +12059,13 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); - if (vmx_mpx_supported()) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + if (kvm_mpx_supported()) { + if (vmx->nested.nested_run_pending && + (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + else + vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + } if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) @@ -12076,11 +12147,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control = vmcs12->pin_based_vm_exec_control; - /* Preemption timer setting is only taken from vmcs01. */ - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Preemption timer setting is computed directly in vmx_vcpu_run. */ exec_control |= vmcs_config.pin_based_exec_ctrl; - if (vmx->hv_deadline_tsc == -1) - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmx->loaded_vmcs->hv_timer_armed = false; /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { @@ -12318,6 +12388,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; @@ -12537,12 +12610,21 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); bool from_vmentry = !!exit_qual; u32 dummy_exit_qual; + bool evaluate_pending_interrupts; int r = 0; + evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING); + if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) + evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); + enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + if (kvm_mpx_supported() && + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) + vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); vmx_segment_cache_clear(vmx); @@ -12575,6 +12657,23 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) } /* + * If L1 had a pending IRQ/NMI until it executed + * VMLAUNCH/VMRESUME which wasn't delivered because it was + * disallowed (e.g. interrupts disabled), L0 needs to + * evaluate if this pending event should cause an exit from L2 + * to L1 or delivered directly to L2 (e.g. In case L1 don't + * intercept EXTERNAL_INTERRUPT). + * + * Usually this would be handled by the processor noticing an + * IRQ/NMI window request, or checking RVI during evaluation of + * pending virtual interrupts. However, this setting was done + * on VMCS01 and now VMCS02 is active instead. Thus, we force L0 + * to perform pending event evaluation by requesting a KVM_REQ_EVENT. + */ + if (unlikely(evaluate_pending_interrupts)) + kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet * returned as far as L1 is concerned. It will only return (and set @@ -12841,6 +12940,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } +static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + to_vmx(vcpu)->req_immediate_exit = true; +} + static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) { ktime_t remaining = @@ -13231,12 +13335,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (vmx->hv_deadline_tsc == -1) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - else - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -13440,18 +13539,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) return -ERANGE; vmx->hv_deadline_tsc = tscl + delta_tsc; - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->hv_deadline_tsc = -1; - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + to_vmx(vcpu)->hv_deadline_tsc = -1; } #endif @@ -13932,6 +14025,14 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; + /* + * SMM temporarily disables VMX, so we cannot be in guest mode, + * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags + * must be zero. + */ + if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) + return -EINVAL; + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; @@ -13988,9 +14089,6 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) return -EINVAL; - if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) - vmx->nested.nested_run_pending = 1; - vmx->nested.dirty_vmcs12 = true; ret = enter_vmx_non_root_mode(vcpu, NULL); if (ret) @@ -14078,6 +14176,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .apicv_post_state_restore = vmx_apicv_post_state_restore, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, @@ -14111,6 +14210,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .umip_emulated = vmx_umip_emulated, .check_nested_events = vmx_check_nested_events, + .request_immediate_exit = vmx_request_immediate_exit, .sched_in = vmx_sched_in, |