From cf81a7e580ac0d598d3e7e9c06864168b2b4073d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jul 2018 09:54:30 -0700 Subject: KVM: vmx: remove save/restore of host BNDCGFS MSR Linux does not support Memory Protection Extensions (MPX) in the kernel itself, thus the BNDCFGS (Bound Config Supervisor) MSR will always be zero in the KVM host, i.e. RDMSR in vmx_save_host_state() is superfluous. KVM unconditionally sets VM_EXIT_CLEAR_BNDCFGS, i.e. BNDCFGS will always be zero after VMEXIT, thus manually loading BNDCFGS is also superfluous. And in the event the MPX kernel support is added (unlikely given that MPX for userspace is in its death throes[1]), BNDCFGS will likely be common across all CPUs[2], and at the least shouldn't change on a regular basis, i.e. saving the MSR on every VMENTRY is completely unnecessary. WARN_ONCE in hardware_setup() if the host's BNDCFGS is non-zero to document that KVM does not preserve BNDCFGS and to serve as a hint as to how BNDCFGS likely should be handled if MPX is used in the kernel, e.g. BNDCFGS should be saved once during KVM setup. [1] https://lkml.org/lkml/2018/4/27/1046 [2] http://www.openwall.com/lists/kernel-hardening/2017/07/24/28 Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e30da9a2430c..6318167b1464 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -802,7 +802,6 @@ struct vcpu_vmx { #endif int gs_ldt_reload_needed; int fs_reload_needed; - u64 msr_host_bndcfgs; } host_state; struct { int vm86_active; @@ -2630,8 +2629,6 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); #endif - if (boot_cpu_has(X86_FEATURE_MPX)) - rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, @@ -2669,8 +2666,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif - if (vmx->host_state.msr_host_bndcfgs) - wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); load_fixmap_gdt(raw_smp_processor_id()); } @@ -7502,6 +7497,7 @@ static void vmx_enable_tdp(void) static __init int hardware_setup(void) { + unsigned long host_bndcfgs; int r = -ENOMEM, i; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7526,6 +7522,11 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); + if (boot_cpu_has(X86_FEATURE_MPX)) { + rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs); + WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); + } + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; -- cgit v1.2.1 From 44883f01fe6ae436a8604c47d8435276fef369b0 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 26 Jul 2018 13:01:52 +0200 Subject: KVM: x86: ensure all MSRs can always be KVM_GET/SET_MSR'd Some of the MSRs returned by GET_MSR_INDEX_LIST currently cannot be sent back to KVM_GET_MSR and/or KVM_SET_MSR; either they can never be sent back, or you they are only accepted under special conditions. This makes the API a pain to use. To avoid this pain, this patch makes it so that the result of the get-list ioctl can always be used for host-initiated get and set. Since we don't have a separate way to check for read-only MSRs, this means some Hyper-V MSRs are ignored when written. Arguably they should not even be in the result of GET_MSR_INDEX_LIST, but I am leaving there in case userspace is using the outcome of GET_MSR_INDEX_LIST to derive the support for the corresponding Hyper-V feature. Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 27 ++++++++++++++++++++------- arch/x86/kvm/hyperv.h | 2 +- arch/x86/kvm/x86.c | 15 +++++++++------ 3 files changed, 30 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index af8caf965baa..01d209ab5481 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -235,7 +235,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, struct kvm_vcpu *vcpu = synic_to_vcpu(synic); int ret; - if (!synic->active) + if (!synic->active && !host) return 1; trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host); @@ -295,11 +295,12 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } -static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) +static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, + bool host) { int ret; - if (!synic->active) + if (!synic->active && !host) return 1; ret = 0; @@ -1014,6 +1015,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, case HV_X64_MSR_TSC_EMULATION_STATUS: hv->hv_tsc_emulation_status = data; break; + case HV_X64_MSR_TIME_REF_COUNT: + /* read-only, but still ignore it if host-initiated */ + if (!host) + return 1; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1101,6 +1107,12 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return stimer_set_count(vcpu_to_stimer(vcpu, timer_index), data, host); } + case HV_X64_MSR_TSC_FREQUENCY: + case HV_X64_MSR_APIC_FREQUENCY: + /* read-only, but still ignore it if host-initiated */ + if (!host) + return 1; + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1156,7 +1168,8 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -1183,7 +1196,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_SIMP: case HV_X64_MSR_EOM: case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15: - return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata); + return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host); case HV_X64_MSR_STIMER0_CONFIG: case HV_X64_MSR_STIMER1_CONFIG: case HV_X64_MSR_STIMER2_CONFIG: @@ -1229,7 +1242,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return kvm_hv_set_msr(vcpu, msr, data, host); } -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1239,7 +1252,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else - return kvm_hv_get_msr(vcpu, msr, pdata); + return kvm_hv_get_msr(vcpu, msr, pdata, host); } static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no) diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 837465d69c6d..d6aa969e20f1 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -48,7 +48,7 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) } int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); -int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); bool kvm_hv_hypercall_enabled(struct kvm *kvm); int kvm_hv_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b812b3c5088..6f0fabdb2109 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2160,10 +2160,11 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.mcg_status = data; break; case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) + if (!(mcg_cap & MCG_CTL_P) && + (data || !msr_info->host_initiated)) return 1; if (data != 0 && data != ~(u64)0) - return -1; + return 1; vcpu->arch.mcg_ctl = data; break; default: @@ -2551,7 +2552,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) } EXPORT_SYMBOL_GPL(kvm_get_msr); -static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; @@ -2566,7 +2567,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.mcg_cap; break; case MSR_IA32_MCG_CTL: - if (!(mcg_cap & MCG_CTL_P)) + if (!(mcg_cap & MCG_CTL_P) && !host) return 1; data = vcpu->arch.mcg_ctl; break; @@ -2699,7 +2700,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr_info->index, &msr_info->data); + return get_msr_mce(vcpu, msr_info->index, &msr_info->data, + msr_info->host_initiated); case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -2720,7 +2722,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: return kvm_hv_get_msr_common(vcpu, - msr_info->index, &msr_info->data); + msr_info->index, &msr_info->data, + msr_info->host_initiated); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current -- cgit v1.2.1 From 7f7f1ba33cf2c21d001821313088c231db42ff40 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Jul 2018 18:49:01 +0200 Subject: KVM: x86: do not load vmcs12 pages while still in SMM If the vCPU enters system management mode while running a nested guest, RSM starts processing the vmentry while still in SMM. In that case, however, the pages pointed to by the vmcs12 might be incorrectly loaded from SMRAM. To avoid this, delay the handling of the pages until just before the next vmentry. This is done with a new request and a new entry in kvm_x86_ops, which we will be able to reuse for nested VMX state migration. Extracted from a patch by Jim Mattson and KarimAllah Ahmed. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/vmx.c | 54 +++++++++++++++++++++++++++-------------- arch/x86/kvm/x86.c | 2 ++ 3 files changed, 41 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c13cd28d9d1b..da957725992d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -75,6 +75,7 @@ #define KVM_REQ_HV_EXIT KVM_ARCH_REQ(21) #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) +#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -1085,6 +1086,8 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); + void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6318167b1464..fee44e4c5c79 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10660,9 +10660,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); struct page *page; u64 hpa; @@ -11774,12 +11774,17 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, return 0; } -static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) +/* + * If exit_qual is NULL, this is being called from RSM. + * Otherwise it's called from vmlaunch/vmresume. + */ +static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 exit_qual; - int r; + bool from_vmentry = !!exit_qual; + u32 dummy_exit_qual; + int r = 0; enter_guest_mode(vcpu); @@ -11793,17 +11798,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu) vcpu->arch.tsc_offset += vmcs12->tsc_offset; r = EXIT_REASON_INVALID_STATE; - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual)) goto fail; - nested_get_vmcs12_pages(vcpu, vmcs12); + if (from_vmentry) { + nested_get_vmcs12_pages(vcpu); - r = EXIT_REASON_MSR_LOAD_FAIL; - exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (exit_qual) - goto fail; + r = EXIT_REASON_MSR_LOAD_FAIL; + *exit_qual = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (*exit_qual) + goto fail; + } else { + /* + * The MMU is not initialized to point at the right entities yet and + * "get pages" would need to read data from the guest (i.e. we will + * need to perform gpa to hpa translation). Request a call + * to nested_get_vmcs12_pages before the next VM-entry. The MSRs + * have already been set at vmentry time and should not be reset. + */ + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + } /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point @@ -11818,8 +11834,7 @@ fail: vcpu->arch.tsc_offset -= vmcs12->tsc_offset; leave_guest_mode(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); - nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual); - return 1; + return r; } /* @@ -11896,10 +11911,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) */ vmx->nested.nested_run_pending = 1; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, &exit_qual); if (ret) { + nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual); vmx->nested.nested_run_pending = 0; - return ret; + return 1; } /* @@ -12985,7 +13001,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) if (vmx->nested.smm.guest_mode) { vcpu->arch.hflags &= ~HF_SMM_MASK; - ret = enter_vmx_non_root_mode(vcpu); + ret = enter_vmx_non_root_mode(vcpu, NULL); vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; @@ -13134,6 +13150,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, + .get_vmcs12_pages = nested_get_vmcs12_pages, + .smi_allowed = vmx_smi_allowed, .pre_enter_smm = vmx_pre_enter_smm, .pre_leave_smm = vmx_pre_leave_smm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f0fabdb2109..fbd59ad047b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7260,6 +7260,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) + kvm_x86_ops->get_vmcs12_pages(vcpu); if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) -- cgit v1.2.1 From 8fcc4b5923af5de58b80b53a069453b135693304 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 10 Jul 2018 11:27:20 +0200 Subject: kvm: nVMX: Introduce KVM_CAP_NESTED_STATE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For nested virtualization L0 KVM is managing a bit of state for L2 guests, this state can not be captured through the currently available IOCTLs. In fact the state captured through all of these IOCTLs is usually a mix of L1 and L2 state. It is also dependent on whether the L2 guest was running at the moment when the process was interrupted to save its state. With this capability, there are two new vcpu ioctls: KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE. These can be used for saving and restoring a VM that is in VMX operation. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Jim Mattson [karahmed@ - rename structs and functions and make them ready for AMD and address previous comments. - handle nested.smm state. - rebase & a bit of refactoring. - Merge 7/8 and 8/8 into one patch. ] Signed-off-by: KarimAllah Ahmed Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++ arch/x86/include/uapi/asm/kvm.h | 37 +++++++++ arch/x86/kvm/vmx.c | 175 +++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 54 +++++++++++++ 4 files changed, 270 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da957725992d..bd287b348751 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1086,6 +1086,12 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); + int (*get_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + unsigned user_data_size); + int (*set_nested_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state); void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); int (*smi_allowed)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index c535c2fdea13..86299efa804a 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -378,4 +378,41 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_STATE_NESTED_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_RUN_PENDING 0x00000002 + +#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 +#define KVM_STATE_NESTED_SMM_VMXON 0x00000002 + +struct kvm_vmx_nested_state { + __u64 vmxon_pa; + __u64 vmcs_pa; + + struct { + __u16 flags; + } smm; +}; + +/* for KVM_CAP_NESTED_STATE */ +struct kvm_nested_state { + /* KVM_STATE_* flags */ + __u16 flags; + + /* 0 for VMX, 1 for SVM. */ + __u16 format; + + /* 128 for SVM, 128 + VMCS size for VMX. */ + __u32 size; + + union { + /* VMXON, VMCS */ + struct kvm_vmx_nested_state vmx; + + /* Pad the header to 128 bytes. */ + __u8 pad[120]; + }; + + __u8 data[0]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fee44e4c5c79..4be6486173b7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7589,6 +7589,11 @@ static __init int hardware_setup(void) else kvm_disable_tdp(); + if (!nested) { + kvm_x86_ops->get_nested_state = NULL; + kvm_x86_ops->set_nested_state = NULL; + } + /* * Only enable PML when hardware supports PML feature, and both EPT * and EPT A/D bit features are enabled -- PML depends on them to work. @@ -11775,8 +11780,8 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } /* - * If exit_qual is NULL, this is being called from RSM. - * Otherwise it's called from vmlaunch/vmresume. + * If exit_qual is NULL, this is being called from state restore (either RSM + * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. */ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual) { @@ -13016,6 +13021,170 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int vmx_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_vmx *vmx; + struct vmcs12 *vmcs12; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = 0, + .size = sizeof(kvm_state), + .vmx.vmxon_pa = -1ull, + .vmx.vmcs_pa = -1ull, + }; + + if (!vcpu) + return kvm_state.size + 2 * VMCS12_SIZE; + + vmx = to_vmx(vcpu); + vmcs12 = get_vmcs12(vcpu); + if (nested_vmx_allowed(vcpu) && + (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { + kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; + kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; + + if (vmx->nested.current_vmptr != -1ull) + kvm_state.size += VMCS12_SIZE; + + if (vmx->nested.smm.vmxon) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; + + if (vmx->nested.smm.guest_mode) + kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; + + if (is_guest_mode(vcpu)) { + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (vmx->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + } + + if (user_data_size < kvm_state.size) + goto out; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (vmx->nested.current_vmptr == -1ull) + goto out; + + /* + * When running L2, the authoritative vmcs12 state is in the + * vmcs02. When running L1, the authoritative vmcs12 state is + * in the shadow vmcs linked to vmcs01, unless + * sync_shadow_vmcs is set, in which case, the authoritative + * vmcs12 state is in the vmcs12 already. + */ + if (is_guest_mode(vcpu)) + sync_vmcs12(vcpu, vmcs12); + else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs) + copy_shadow_to_vmcs12(vmx); + + if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) + return -EFAULT; + +out: + return kvm_state.size; +} + +static int vmx_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12; + u32 exit_qual; + int ret; + + if (kvm_state->format != 0) + return -EINVAL; + + if (!nested_vmx_allowed(vcpu)) + return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; + + if (kvm_state->vmx.vmxon_pa == -1ull) { + if (kvm_state->vmx.smm.flags) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa != -1ull) + return -EINVAL; + + vmx_leave_nested(vcpu); + return 0; + } + + if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) + return -EINVAL; + + if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + return -EINVAL; + + if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (kvm_state->vmx.smm.flags & + ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) + return -EINVAL; + + vmx_leave_nested(vcpu); + if (kvm_state->vmx.vmxon_pa == -1ull) + return 0; + + vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; + ret = enter_vmx_operation(vcpu); + if (ret) + return ret; + + set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { + vmx->nested.smm.vmxon = true; + vmx->nested.vmxon = false; + + if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) + vmx->nested.smm.guest_mode = true; + } + + vmcs12 = get_vmcs12(vcpu); + if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) + return -EFAULT; + + if (vmcs12->revision_id != VMCS12_REVISION) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return 0; + + vmx->nested.nested_run_pending = + !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + + if (check_vmentry_prereqs(vcpu, vmcs12) || + check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) + return -EINVAL; + + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) + vmx->nested.nested_run_pending = 1; + + vmx->nested.dirty_vmcs12 = true; + ret = enter_vmx_non_root_mode(vcpu, NULL); + if (ret) + return -EINVAL; + + return 0; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -13150,6 +13319,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .setup_mce = vmx_setup_mce, + .get_nested_state = vmx_get_nested_state, + .set_nested_state = vmx_set_nested_state, .get_vmcs12_pages = nested_get_vmcs12_pages, .smi_allowed = vmx_smi_allowed, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fbd59ad047b0..1b14c4a654c3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2947,6 +2947,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; break; + case KVM_CAP_NESTED_STATE: + r = kvm_x86_ops->get_nested_state ? + kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0; + break; default: break; } @@ -3963,6 +3967,56 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + case KVM_GET_NESTED_STATE: { + struct kvm_nested_state __user *user_kvm_nested_state = argp; + u32 user_data_size; + + r = -EINVAL; + if (!kvm_x86_ops->get_nested_state) + break; + + BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); + if (get_user(user_data_size, &user_kvm_nested_state->size)) + return -EFAULT; + + r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, + user_data_size); + if (r < 0) + return r; + + if (r > user_data_size) { + if (put_user(r, &user_kvm_nested_state->size)) + return -EFAULT; + return -E2BIG; + } + r = 0; + break; + } + case KVM_SET_NESTED_STATE: { + struct kvm_nested_state __user *user_kvm_nested_state = argp; + struct kvm_nested_state kvm_state; + + r = -EINVAL; + if (!kvm_x86_ops->set_nested_state) + break; + + if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) + return -EFAULT; + + if (kvm_state.size < sizeof(kvm_state)) + return -EINVAL; + + if (kvm_state.flags & + ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + /* nested_run_pending implies guest_mode. */ + if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) + return -EINVAL; + + r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + break; + } default: r = -EINVAL; } -- cgit v1.2.1 From 392b2f25aa415c0222b348f95875409be49a1201 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:01 +0300 Subject: KVM: VMX: Create struct for VMCS header No functionality change. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4be6486173b7..37944b79da45 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -198,8 +198,13 @@ struct kvm_vmx { #define NR_AUTOLOAD_MSRS 8 +struct vmcs_hdr { + u32 revision_id:31; + u32 shadow_vmcs:1; +}; + struct vmcs { - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; char data[0]; }; @@ -253,7 +258,7 @@ struct __packed vmcs12 { /* According to the Intel spec, a VMCS region must start with the * following two fields. Then follow implementation-specific data. */ - u32 revision_id; + struct vmcs_hdr hdr; u32 abort; u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */ @@ -421,7 +426,7 @@ struct __packed vmcs12 { "Offset of " #field " in struct vmcs12 has changed.") static inline void vmx_check_vmcs12_offsets(void) { - CHECK_OFFSET(revision_id, 0); + CHECK_OFFSET(hdr, 0); CHECK_OFFSET(abort, 4); CHECK_OFFSET(launch_state, 8); CHECK_OFFSET(io_bitmap_a, 40); @@ -4399,9 +4404,9 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) /* KVM supports Enlightened VMCS v1 only */ if (static_branch_unlikely(&enable_evmcs)) - vmcs->revision_id = KVM_EVMCS_VERSION; + vmcs->hdr.revision_id = KVM_EVMCS_VERSION; else - vmcs->revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmcs_config.revision_id; return vmcs; } @@ -4581,7 +4586,7 @@ static __init int alloc_kvm_area(void) * physical CPU. */ if (static_branch_unlikely(&enable_evmcs)) - vmcs->revision_id = vmcs_config.revision_id; + vmcs->hdr.revision_id = vmcs_config.revision_id; per_cpu(vmxarea, cpu) = vmcs; } @@ -7889,7 +7894,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!shadow_vmcs) goto out_shadow_vmcs; /* mark vmcs as shadow */ - shadow_vmcs->revision_id |= (1u << 31); + shadow_vmcs->hdr.shadow_vmcs = 1; /* init shadow vmcs */ vmcs_clear(shadow_vmcs); vmx->vmcs01.shadow_vmcs = shadow_vmcs; @@ -8459,7 +8464,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); - if (new_vmcs12->revision_id != VMCS12_REVISION) { + if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || + new_vmcs12->hdr.shadow_vmcs) { kunmap(page); kvm_release_page_clean(page); nested_vmx_failValid(vcpu, @@ -13161,7 +13167,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) return -EFAULT; - if (vmcs12->revision_id != VMCS12_REVISION) + if (vmcs12->hdr.revision_id != VMCS12_REVISION) return -EINVAL; if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) -- cgit v1.2.1 From e253674227328575084cf7454d22749b9be11d52 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:02 +0300 Subject: KVM: VMX: Change vmcs12_{read,write}_any() to receive vmcs12 as parameter No functionality change. This is done as a preparation for VMCS shadowing emulation. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 37944b79da45..82b01b34a3a2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8159,7 +8159,7 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of * 64-bit fields are to be returned). */ -static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u64 *ret) { short offset = vmcs_field_to_offset(field); @@ -8168,7 +8168,7 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, if (offset < 0) return offset; - p = ((char *)(get_vmcs12(vcpu))) + offset; + p = (char *)vmcs12 + offset; switch (vmcs_field_width(field)) { case VMCS_FIELD_WIDTH_NATURAL_WIDTH: @@ -8190,10 +8190,10 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, } -static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, +static inline int vmcs12_write_any(struct vmcs12 *vmcs12, unsigned long field, u64 field_value){ short offset = vmcs_field_to_offset(field); - char *p = ((char *) get_vmcs12(vcpu)) + offset; + char *p = (char *)vmcs12 + offset; if (offset < 0) return offset; @@ -8246,7 +8246,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; field_value = __vmcs_readl(field); - vmcs12_write_any(&vmx->vcpu, field, field_value); + vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value); } /* * Skip the VM-exit information fields if they are read-only. @@ -8281,7 +8281,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) for (q = 0; q < ARRAY_SIZE(fields); q++) { for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; - vmcs12_read_any(&vmx->vcpu, field, &field_value); + vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value); __vmcs_writel(field, field_value); } } @@ -8321,7 +8321,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(vcpu, field, &field_value) < 0) { + if (vmcs12_read_any(get_vmcs12(vcpu), field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -8397,7 +8397,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - if (vmcs12_write_any(vcpu, field, field_value) < 0) { + if (vmcs12_write_any(get_vmcs12(vcpu), field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -10971,11 +10971,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, unsigned long count_field, unsigned long addr_field) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); int maxphyaddr; u64 count, addr; - if (vmcs12_read_any(vcpu, count_field, &count) || - vmcs12_read_any(vcpu, addr_field, &addr)) { + if (vmcs12_read_any(vmcs12, count_field, &count) || + vmcs12_read_any(vmcs12, addr_field, &addr)) { WARN_ON(1); return -EINVAL; } -- cgit v1.2.1 From fa97d7dba7538adf174c3984e726c0537ab553a4 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Wed, 18 Jul 2018 14:07:59 +0200 Subject: KVM: nVMX: Allow VMPTRLD for shadow VMCS if vCPU supports VMCS shadowing Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 82b01b34a3a2..44e2b82f6519 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1720,6 +1720,12 @@ static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) CPU_BASED_MONITOR_TRAP_FLAG; } +static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.secondary_ctls_high & + SECONDARY_EXEC_SHADOW_VMCS; +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -8465,7 +8471,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) } new_vmcs12 = kmap(page); if (new_vmcs12->hdr.revision_id != VMCS12_REVISION || - new_vmcs12->hdr.shadow_vmcs) { + (new_vmcs12->hdr.shadow_vmcs && + !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kunmap(page); kvm_release_page_clean(page); nested_vmx_failValid(vcpu, -- cgit v1.2.1 From a6192d40d52f6b86997a71449e2ebc3d7f5ca103 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:04 +0300 Subject: KVM: nVMX: Fail VMLAUNCH and VMRESUME on shadow VMCS Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 44e2b82f6519..f6ec898ac539 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11876,6 +11876,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmcs12 = get_vmcs12(vcpu); + /* + * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact + * that there *is* a valid VMCS pointer, RFLAGS.CF is set + * rather than RFLAGS.ZF, and no error number is stored to the + * VM-instruction error field. + */ + if (vmcs12->hdr.shadow_vmcs) { + nested_vmx_failInvalid(vcpu); + goto out; + } + if (enable_shadow_vmcs) copy_shadow_to_vmcs12(vmx); -- cgit v1.2.1 From f792d2743ed4db9e96eff43bdc1e15dd441f19bf Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:05 +0300 Subject: KVM: nVMX: Introduce nested_cpu_has_shadow_vmcs() Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f6ec898ac539..64b11c57b5f4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1806,6 +1806,11 @@ static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12) VMX_VMFUNC_EPTP_SWITCHING); } +static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS); +} + static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) @@ -11745,7 +11750,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) && + if (!nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; -- cgit v1.2.1 From a8a7c02bf7b70cda6face6321a45de56519c24bf Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:06 +0300 Subject: KVM: nVMX: Verify VMCS shadowing controls Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 64b11c57b5f4..e762222476a9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11038,6 +11038,19 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, return 0; } +static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) || + !page_address_valid(vcpu, vmcs12->vmwrite_bitmap)) + return -EINVAL; + + return 0; +} + static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { @@ -11639,6 +11652,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_vmx_check_pml_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.msrs.procbased_ctls_low, vmx->nested.msrs.procbased_ctls_high) || -- cgit v1.2.1 From f145d90d97bab0e11b78da1739e5db742575037c Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:07 +0300 Subject: KVM: nVMX: Verify VMCS shadowing VMCS link pointer Intel SDM considers these checks to be part of "Checks on Guest Non-Register State". Note that it is legal for vmcs->vmcs_link_pointer to be -1ull when VMCS shadowing is enabled. In this case, any VMREAD/VMWRITE to shadowed-field sets the ALU flags for VMfailInvalid (i.e. CF=1). Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e762222476a9..f8ad42bcfc2d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11755,6 +11755,33 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) return 0; } +static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + int r; + struct page *page; + struct vmcs12 *shadow; + + if (vmcs12->vmcs_link_pointer == -1ull) + return 0; + + if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)) + return -EINVAL; + + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + if (is_error_page(page)) + return -EINVAL; + + r = 0; + shadow = kmap(page); + if (shadow->hdr.revision_id != VMCS12_REVISION || + shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)) + r = -EINVAL; + kunmap(page); + kvm_release_page_clean(page); + return r; +} + static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, u32 *exit_qual) { @@ -11766,8 +11793,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) return 1; - if (!nested_cpu_has_shadow_vmcs(vmcs12) && - vmcs12->vmcs_link_pointer != -1ull) { + if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; return 1; } -- cgit v1.2.1 From 61ada7488ffdef0c0f83da14a12629870abb9e97 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:08 +0300 Subject: KVM: nVMX: Cache shadow vmcs12 on VMEntry and flush to memory on VMExit This is done is done as a preparation to VMCS shadowing emulation. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f8ad42bcfc2d..2a005f519032 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -644,6 +644,12 @@ struct nested_vmx { * memory during VMCLEAR and VMPTRLD. */ struct vmcs12 *cached_vmcs12; + /* + * Cache of the guest's shadow VMCS, existing outside of guest + * memory. Loaded from guest memory during VM entry. Flushed + * to guest memory during VM exit. + */ + struct vmcs12 *cached_shadow_vmcs12; /* * Indicates if the shadow vmcs must be updated with the * data hold by vmcs12 @@ -1076,6 +1082,11 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.cached_vmcs12; } +static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.cached_shadow_vmcs12; +} + static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu); static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); @@ -7900,6 +7911,10 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; + vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_shadow_vmcs12) + goto out_cached_shadow_vmcs12; + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); if (!shadow_vmcs) @@ -7919,6 +7934,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) return 0; out_shadow_vmcs: + kfree(vmx->nested.cached_shadow_vmcs12); + +out_cached_shadow_vmcs12: kfree(vmx->nested.cached_vmcs12); out_cached_vmcs12: @@ -8085,6 +8103,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); + kfree(vmx->nested.cached_shadow_vmcs12); /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); @@ -10926,6 +10945,38 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } +static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vmcs12 *shadow; + struct page *page; + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + shadow = get_shadow_vmcs12(vcpu); + page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer); + + memcpy(shadow, kmap(page), VMCS12_SIZE); + + kunmap(page); + kvm_release_page_clean(page); +} + +static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!nested_cpu_has_shadow_vmcs(vmcs12) || + vmcs12->vmcs_link_pointer == -1ull) + return; + + kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer, + get_shadow_vmcs12(vcpu), VMCS12_SIZE); +} + static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -11995,6 +12046,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; } + /* + * Must happen outside of enter_vmx_non_root_mode() as it will + * also be used as part of restoring nVMX state for + * snapshot restore (migration). + * + * In this flow, it is assumed that vmcs12 cache was + * trasferred as part of captured nVMX state and should + * therefore not be read from guest memory (which may not + * exist on destination host yet). + */ + nested_cache_shadow_vmcs12(vcpu, vmcs12); + /* * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken * by event injection, halt vcpu. @@ -12504,6 +12567,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, exit_qualification); + /* + * Must happen outside of sync_vmcs12() as it will + * also be used to capture vmcs12 cache as part of + * capturing nVMX state for snapshot (migration). + * + * Otherwise, this flush will dirty guest memory at a + * point it is already assumed by user-space to be + * immutable. + */ + nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); -- cgit v1.2.1 From fa58a9fa74979f845fc6c94a58501e67f3abb6de Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 18 Jul 2018 19:45:51 +0200 Subject: KVM: nVMX: include shadow vmcs12 in nested state The shadow vmcs12 cannot be flushed on KVM_GET_NESTED_STATE, because at that point guest memory is assumed by userspace to be immutable. Capture the cache in vmx_get_nested_state, adding another page at the end if there is an active shadow vmcs12. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2a005f519032..c7d08a54ab8d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -13191,9 +13191,15 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; - if (vmx->nested.current_vmptr != -1ull) + if (vmx->nested.current_vmptr != -1ull) { kvm_state.size += VMCS12_SIZE; + if (is_guest_mode(vcpu) && + nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) + kvm_state.size += VMCS12_SIZE; + } + if (vmx->nested.smm.vmxon) kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; @@ -13232,6 +13238,13 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12))) return -EFAULT; + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, + get_shadow_vmcs12(vcpu), sizeof(*vmcs12))) + return -EFAULT; + } + out: return kvm_state.size; } @@ -13316,6 +13329,22 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmx->nested.nested_run_pending = !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + if (nested_cpu_has_shadow_vmcs(vmcs12) && + vmcs12->vmcs_link_pointer != -1ull) { + struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); + if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + return -EINVAL; + + if (copy_from_user(shadow_vmcs12, + user_kvm_nested_state->data + VMCS12_SIZE, + sizeof(*vmcs12))) + return -EFAULT; + + if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || + !shadow_vmcs12->hdr.shadow_vmcs) + return -EINVAL; + } + if (check_vmentry_prereqs(vcpu, vmcs12) || check_vmentry_postreqs(vcpu, vmcs12, &exit_qual)) return -EINVAL; -- cgit v1.2.1 From 6d894f498f5d121b57a231315b0b459e185abed1 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:09 +0300 Subject: KVM: nVMX: vmread/vmwrite: Use shadow vmcs12 if running L2 This is done as a preparation to VMCS shadowing emulation. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 61 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c7d08a54ab8d..f91a1599738e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8341,6 +8341,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); gva_t gva = 0; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8348,10 +8349,24 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (!nested_vmx_check_vmcs12(vcpu)) return kvm_skip_emulated_instruction(vcpu); + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMREAD + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + } + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ - if (vmcs12_read_any(get_vmcs12(vcpu), field, &field_value) < 0) { + if (vmcs12_read_any(vmcs12, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } @@ -8393,6 +8408,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ u64 field_value = 0; struct x86_exception e; + struct vmcs12 *vmcs12; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -8427,23 +8443,44 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - if (vmcs12_write_any(get_vmcs12(vcpu), field, field_value) < 0) { + if (!is_guest_mode(vcpu)) + vmcs12 = get_vmcs12(vcpu); + else { + /* + * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE + * to shadowed-field sets the ALU flags for VMfailInvalid. + */ + if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) { + nested_vmx_failInvalid(vcpu); + return kvm_skip_emulated_instruction(vcpu); + } + vmcs12 = get_shadow_vmcs12(vcpu); + + } + + if (vmcs12_write_any(vmcs12, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); return kvm_skip_emulated_instruction(vcpu); } - switch (field) { + /* + * Do not track vmcs12 dirty-state if in guest-mode + * as we actually dirty shadow vmcs12 instead of vmcs12. + */ + if (!is_guest_mode(vcpu)) { + switch (field) { #define SHADOW_FIELD_RW(x) case x: #include "vmx_shadow_fields.h" - /* - * The fields that can be updated by L1 without a vmexit are - * always updated in the vmcs02, the others go down the slow - * path of prepare_vmcs02. - */ - break; - default: - vmx->nested.dirty_vmcs12 = true; - break; + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. + */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } } nested_vmx_succeed(vcpu); -- cgit v1.2.1 From a7cde481b6e8506ee147cf5031ad1917fcc8bf9b Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:10 +0300 Subject: KVM: nVMX: Do not forward VMREAD/VMWRITE VMExits to L1 if required so by vmcs12 vmread/vmwrite bitmaps This is done as a preparation for VMCS shadowing emulation. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f91a1599738e..27fd3e1653af 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9107,6 +9107,30 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, return false; } +static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12, gpa_t bitmap) +{ + u32 vmx_instruction_info; + unsigned long field; + u8 b; + + if (!nested_cpu_has_shadow_vmcs(vmcs12)) + return true; + + /* Decode instruction info and find the field to access */ + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + + /* Out-of-range fields always cause a VM exit from L2 to L1 */ + if (field >> 15) + return true; + + if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) + return true; + + return 1 & (b >> (field & 7)); +} + /* * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we * should handle it ourselves in L0 (and then continue L2). Only call this @@ -9191,10 +9215,15 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); + case EXIT_REASON_VMREAD: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmread_bitmap); + case EXIT_REASON_VMWRITE: + return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, + vmcs12->vmwrite_bitmap); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: - case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: - case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* -- cgit v1.2.1 From 32c7acf044873c8782be45a03e46cf0ac579a459 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:11 +0300 Subject: KVM: nVMX: Expose VMCS shadowing to L1 guest Expose VMCS shadowing to L1 as a VMX capability of the virtual CPU, whether or not VMCS shadowing is supported by the physical CPU. (VMCS shadowing emulation) Shadowed VMREADs and VMWRITEs from L2 are handled by L0, without a VM-exit to L1. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 27fd3e1653af..7cf64cbc3afd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3308,6 +3308,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING; + /* + * We can emulate "VMCS shadowing," even if the hardware + * doesn't support it. + */ + msrs->secondary_ctls_high |= + SECONDARY_EXEC_SHADOW_VMCS; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -11550,6 +11556,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control |= vmcs12_exec_ctrl; } + /* VMCS shadowing for L2 is emulated for now */ + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); -- cgit v1.2.1 From 491a6038458fc07ec307447dd5e6722e34d67c04 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:12 +0300 Subject: KVM: VMX: Mark vmcs header as shadow in case alloc_vmcs_cpu() allocate shadow vmcs No functionality change. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7cf64cbc3afd..098926a07cf7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4418,7 +4418,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return 0; } -static struct vmcs *alloc_vmcs_cpu(int cpu) +static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu) { int node = cpu_to_node(cpu); struct page *pages; @@ -4436,6 +4436,8 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) else vmcs->hdr.revision_id = vmcs_config.revision_id; + if (shadow) + vmcs->hdr.shadow_vmcs = 1; return vmcs; } @@ -4459,14 +4461,14 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } -static struct vmcs *alloc_vmcs(void) +static struct vmcs *alloc_vmcs(bool shadow) { - return alloc_vmcs_cpu(raw_smp_processor_id()); + return alloc_vmcs_cpu(shadow, raw_smp_processor_id()); } static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) { - loaded_vmcs->vmcs = alloc_vmcs(); + loaded_vmcs->vmcs = alloc_vmcs(false); if (!loaded_vmcs->vmcs) return -ENOMEM; @@ -4597,7 +4599,7 @@ static __init int alloc_kvm_area(void) for_each_possible_cpu(cpu) { struct vmcs *vmcs; - vmcs = alloc_vmcs_cpu(cpu); + vmcs = alloc_vmcs_cpu(false, cpu); if (!vmcs) { free_kvm_area(); return -ENOMEM; @@ -7922,11 +7924,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) goto out_cached_shadow_vmcs12; if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(); + shadow_vmcs = alloc_vmcs(true); if (!shadow_vmcs) goto out_shadow_vmcs; - /* mark vmcs as shadow */ - shadow_vmcs->hdr.shadow_vmcs = 1; /* init shadow vmcs */ vmcs_clear(shadow_vmcs); vmx->vmcs01.shadow_vmcs = shadow_vmcs; -- cgit v1.2.1 From abfc52c612ddb101549846688bc87dfedbbfd4f4 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sat, 23 Jun 2018 02:35:13 +0300 Subject: KVM: nVMX: Separate logic allocating shadow vmcs to a function No functionality change. This is done as a preparation for VMCS shadowing virtualization. Signed-off-by: Liran Alon Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 098926a07cf7..cbe31b9d2b81 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7905,10 +7905,35 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) return 0; } +/* + * Allocate a shadow VMCS and associate it with the currently loaded + * VMCS, unless such a shadow VMCS already exists. The newly allocated + * VMCS is also VMCLEARed, so that it is ready for use. + */ +static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; + + /* + * We should allocate a shadow vmcs for vmcs01 only when L1 + * executes VMXON and free it when L1 executes VMXOFF. + * As it is invalid to execute VMXON twice, we shouldn't reach + * here when vmcs01 already have an allocated shadow vmcs. + */ + WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs); + + if (!loaded_vmcs->shadow_vmcs) { + loaded_vmcs->shadow_vmcs = alloc_vmcs(true); + if (loaded_vmcs->shadow_vmcs) + vmcs_clear(loaded_vmcs->shadow_vmcs); + } + return loaded_vmcs->shadow_vmcs; +} + static int enter_vmx_operation(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmcs *shadow_vmcs; int r; r = alloc_loaded_vmcs(&vmx->nested.vmcs02); @@ -7923,14 +7948,8 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; - if (enable_shadow_vmcs) { - shadow_vmcs = alloc_vmcs(true); - if (!shadow_vmcs) - goto out_shadow_vmcs; - /* init shadow vmcs */ - vmcs_clear(shadow_vmcs); - vmx->vmcs01.shadow_vmcs = shadow_vmcs; - } + if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) + goto out_shadow_vmcs; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); -- cgit v1.2.1 From 42522d08cdba6d8be4247e4f0770f39f4708b71f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 18 Jul 2018 15:57:50 +0800 Subject: KVM: MMU: drop vcpu param in gpte_access It's never used. Drop it. Signed-off-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6288e9d7068e..74996fc2fc97 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -181,7 +181,7 @@ no_present: * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK * to signify readability since it isn't used in the EPT case */ -static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) +static inline unsigned FNAME(gpte_access)(u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT @@ -394,8 +394,8 @@ retry_walk: accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ - walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask); - walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask); + walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask); + walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; @@ -508,7 +508,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); gfn = gpte_to_gfn(gpte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); @@ -1002,7 +1002,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte); + pte_access &= FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, -- cgit v1.2.1 From 5ce4786f75d16504223c7a65a42b200c2550fa29 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:04 -0700 Subject: kvm: x86: Make sync_page() flush remote TLBs once only sync_page() calls set_spte() from a loop across a page table. It would work better if set_spte() left the TLB flushing to its callers, so that sync_page() can aggregate into a single call. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 16 ++++++++++++---- arch/x86/kvm/paging_tmpl.h | 12 ++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d594690d8b95..75bc73e23df0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2724,6 +2724,10 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) return true; } +/* Bits which may be returned by set_spte() */ +#define SET_SPTE_WRITE_PROTECTED_PT BIT(0) +#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) + static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, @@ -2800,7 +2804,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); - ret = 1; + ret |= SET_SPTE_WRITE_PROTECTED_PT; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); } @@ -2816,7 +2820,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: if (mmu_spte_update(sptep, spte)) - kvm_flush_remote_tlbs(vcpu->kvm); + ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH; done: return ret; } @@ -2827,6 +2831,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, { int was_rmapped = 0; int rmap_count; + int set_spte_ret; int ret = RET_PF_RETRY; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, @@ -2854,12 +2859,15 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, was_rmapped = 1; } - if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, - true, host_writable)) { + set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn, + speculative, true, host_writable); + if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) { if (write_fault) ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) + kvm_flush_remote_tlbs(vcpu->kvm); if (unlikely(is_mmio_spte(*sptep))) ret = RET_PF_EMULATE; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 74996fc2fc97..208f7646ce0d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -968,6 +968,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) int i, nr_present = 0; bool host_writable; gpa_t first_pte_gpa; + int set_spte_ret = 0; /* direct kvm_mmu_page can not be unsync. */ BUG_ON(sp->role.direct); @@ -1024,12 +1025,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; - set_spte(vcpu, &sp->spt[i], pte_access, - PT_PAGE_TABLE_LEVEL, gfn, - spte_to_pfn(sp->spt[i]), true, false, - host_writable); + set_spte_ret |= set_spte(vcpu, &sp->spt[i], + pte_access, PT_PAGE_TABLE_LEVEL, + gfn, spte_to_pfn(sp->spt[i]), + true, false, host_writable); } + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) + kvm_flush_remote_tlbs(vcpu->kvm); + return nr_present; } -- cgit v1.2.1 From 578e1c4db22135d91bad6c79585c4d19252b8d81 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:05 -0700 Subject: kvm: x86: Avoid taking MMU lock in kvm_mmu_sync_roots if no sync is needed kvm_mmu_sync_roots() can locklessly check whether a sync is needed and just bail out if it isn't. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 75bc73e23df0..4b4452d0022e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2702,6 +2702,45 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_unsync_page(vcpu, sp); } + /* + * We need to ensure that the marking of unsync pages is visible + * before the SPTE is updated to allow writes because + * kvm_mmu_sync_roots() checks the unsync flags without holding + * the MMU lock and so can race with this. If the SPTE was updated + * before the page had been marked as unsync-ed, something like the + * following could happen: + * + * CPU 1 CPU 2 + * --------------------------------------------------------------------- + * 1.2 Host updates SPTE + * to be writable + * 2.1 Guest writes a GPTE for GVA X. + * (GPTE being in the guest page table shadowed + * by the SP from CPU 1.) + * This reads SPTE during the page table walk. + * Since SPTE.W is read as 1, there is no + * fault. + * + * 2.2 Guest issues TLB flush. + * That causes a VM Exit. + * + * 2.3 kvm_mmu_sync_pages() reads sp->unsync. + * Since it is false, so it just returns. + * + * 2.4 Guest accesses GVA X. + * Since the mapping in the SP was not updated, + * so the old mapping for GVA X incorrectly + * gets used. + * 1.1 Host marks SP + * as unsync + * (sp->unsync = true) + * + * The write barrier below ensures that 1.1 happens before 1.2 and thus + * the situation in 2.4 does not arise. The implicit barrier in 2.2 + * pairs with this write barrier. + */ + smp_wmb(); + return false; } @@ -3554,7 +3593,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) return mmu_alloc_shadow_roots(vcpu); } -static void mmu_sync_roots(struct kvm_vcpu *vcpu) +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; @@ -3566,14 +3605,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) return; vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + + /* + * Even if another CPU was marking the SP as unsync-ed + * simultaneously, any guest page table changes are not + * guaranteed to be visible anyway until this VCPU issues a TLB + * flush strictly after those changes are made. We only need to + * ensure that the other CPU sets these flags before any actual + * changes to the page tables are made. The comments in + * mmu_need_write_protect() describe what could go wrong if this + * requirement isn't satisfied. + */ + if (!smp_load_acquire(&sp->unsync) && + !smp_load_acquire(&sp->unsync_children)) + return; + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + mmu_sync_children(vcpu, sp); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); + spin_unlock(&vcpu->kvm->mmu_lock); return; } + + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -3583,13 +3647,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } - kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); -} -void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) -{ - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_roots(vcpu); + kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); spin_unlock(&vcpu->kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); -- cgit v1.2.1 From 7c390d350f8b677df3236afef4ced80dba6c3201 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:06 -0700 Subject: kvm: x86: Add fast CR3 switch code path When using shadow paging, a CR3 switch in the guest results in a VM Exit. In the common case, that VM exit doesn't require much processing by KVM. However, it does acquire the MMU lock, which can start showing signs of contention under some workloads even on a 2 VCPU VM when the guest is using KPTI. Therefore, we add a fast path that avoids acquiring the MMU lock in the most common cases e.g. when switching back and forth between the kernel and user mode CR3s used by KPTI with no guest page table changes in between. For now, this fast path is implemented only for 64-bit guests and hosts to avoid the handling of PDPTEs, but it can be extended later to 32-bit guests and/or hosts as well. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 13 +++++++-- arch/x86/kvm/mmu.c | 63 +++++++++++++++++++++++++++++++++++++---- arch/x86/kvm/x86.c | 3 +- 3 files changed, 71 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bd287b348751..290b7d05790a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -326,6 +326,14 @@ struct rsvd_bits_validate { u64 bad_mt_xwr; }; +struct kvm_mmu_root_info { + gpa_t cr3; + hpa_t hpa; +}; + +#define KVM_MMU_ROOT_INFO_INVALID \ + ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the @@ -354,6 +362,7 @@ struct kvm_mmu { u8 shadow_root_level; u8 ept_ad; bool direct_map; + struct kvm_mmu_root_info prev_root; /* * Bitmap; bit set = permission fault @@ -1288,7 +1297,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, @@ -1307,7 +1316,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3); void kvm_enable_tdp(void); void kvm_disable_tdp(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4b4452d0022e..7f490298c635 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3405,17 +3405,22 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, *root_hpa = INVALID_PAGE; } -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu) +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root) { int i; LIST_HEAD(invalid_list); struct kvm_mmu *mmu = &vcpu->arch.mmu; - if (!VALID_PAGE(mmu->root_hpa)) + if (!VALID_PAGE(mmu->root_hpa) && + (!VALID_PAGE(mmu->prev_root.hpa) || !free_prev_root)) return; spin_lock(&vcpu->kvm->mmu_lock); + if (free_prev_root) + mmu_free_root_page(vcpu->kvm, &mmu->prev_root.hpa, + &invalid_list); + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list); @@ -4015,13 +4020,56 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = true; context->nx = false; } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3) +{ + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + /* + * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid + * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs + * later if necessary. + */ + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + mmu->root_level >= PT64_ROOT_4LEVEL) { + gpa_t prev_cr3 = mmu->prev_root.cr3; + + if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT)) + return false; + + swap(mmu->root_hpa, mmu->prev_root.hpa); + mmu->prev_root.cr3 = kvm_read_cr3(vcpu); + + if (new_cr3 == prev_cr3 && VALID_PAGE(mmu->root_hpa)) { + /* + * It is possible that the cached previous root page is + * obsolete because of a change in the MMU + * generation number. However, that is accompanied by + * KVM_REQ_MMU_RELOAD, which will free the root that we + * have set here and allocate a new one. + */ + + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + __clear_sp_write_flooding_count( + page_header(mmu->root_hpa)); + + mmu->set_cr3(vcpu, mmu->root_hpa); + + return true; + } + } + + return false; +} + +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) { - kvm_mmu_free_roots(vcpu); + if (!fast_cr3_switch(vcpu, new_cr3)) + kvm_mmu_free_roots(vcpu, false); } static unsigned long get_cr3(struct kvm_vcpu *vcpu) @@ -4499,6 +4547,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->update_pte = paging64_update_pte; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; + context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; } @@ -4529,6 +4578,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; } @@ -4552,6 +4602,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu); context->root_hpa = INVALID_PAGE; + context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; @@ -4634,6 +4685,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->update_pte = ept_update_pte; context->root_level = PT64_ROOT_4LEVEL; context->root_hpa = INVALID_PAGE; + context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; context->base_role.ad_disabled = !accessed_dirty; context->base_role.guest_mode = 1; @@ -4736,7 +4788,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { - kvm_mmu_free_roots(vcpu); + kvm_mmu_free_roots(vcpu, true); WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); @@ -5116,6 +5168,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) { vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.mmu.translate_gpa = translate_gpa; vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b14c4a654c3..5a1e4f79398f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -867,9 +867,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; + kvm_mmu_new_cr3(vcpu, cr3); vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); - kvm_mmu_new_cr3(vcpu); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); -- cgit v1.2.1 From 9fa72119b24db78d665a4034b5a0d349b0289b08 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:07 -0700 Subject: kvm: x86: Introduce kvm_mmu_calc_root_page_role() These functions factor out the base role calculation from the corresponding kvm_init_*_mmu() functions. The new functions return what would be the role assigned to a root page in the current VCPU state. This can be masked with mmu_base_role_mask to derive the base role. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 112 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7f490298c635..afb865d054c2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -178,6 +178,17 @@ struct kvm_shadow_walk_iterator { unsigned index; }; +static const union kvm_mmu_page_role mmu_base_role_mask = { + .cr0_wp = 1, + .cr4_pae = 1, + .nxe = 1, + .smep_andnot_wp = 1, + .smap_andnot_wp = 1, + .smm = 1, + .guest_mode = 1, + .ad_disabled = 1, +}; + #define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)); \ @@ -222,6 +233,8 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; static void mmu_spte_set(u64 *sptep, u64 spte); +static union kvm_mmu_page_role +kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value) { @@ -4588,14 +4601,27 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu, paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } +static union kvm_mmu_page_role +kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_page_role role = {0}; + + role.guest_mode = is_guest_mode(vcpu); + role.smm = is_smm(vcpu); + role.ad_disabled = (shadow_accessed_mask == 0); + role.level = kvm_x86_ops->get_tdp_level(vcpu); + role.direct = true; + role.access = ACC_ALL; + + return role; +} + static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; - context->base_role.word = 0; - context->base_role.guest_mode = is_guest_mode(vcpu); - context->base_role.smm = is_smm(vcpu); - context->base_role.ad_disabled = (shadow_accessed_mask == 0); + context->base_role.word = mmu_base_role_mask.word & + kvm_calc_tdp_mmu_root_page_role(vcpu).word; context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -4637,10 +4663,35 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) reset_tdp_shadow_zero_bits_mask(vcpu, context); } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) +static union kvm_mmu_page_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu) { + union kvm_mmu_page_role role = {0}; bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); + + role.nxe = is_nx(vcpu); + role.cr4_pae = !!is_pae(vcpu); + role.cr0_wp = is_write_protection(vcpu); + role.smep_andnot_wp = smep && !is_write_protection(vcpu); + role.smap_andnot_wp = smap && !is_write_protection(vcpu); + role.guest_mode = is_guest_mode(vcpu); + role.smm = is_smm(vcpu); + role.direct = !is_paging(vcpu); + role.access = ACC_ALL; + + if (!is_long_mode(vcpu)) + role.level = PT32E_ROOT_LEVEL; + else if (is_la57_mode(vcpu)) + role.level = PT64_ROOT_5LEVEL; + else + role.level = PT64_ROOT_4LEVEL; + + return role; +} + +void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) +{ struct kvm_mmu *context = &vcpu->arch.mmu; MMU_WARN_ON(VALID_PAGE(context->root_hpa)); @@ -4654,23 +4705,32 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) else paging32_init_context(vcpu, context); - context->base_role.nxe = is_nx(vcpu); - context->base_role.cr4_pae = !!is_pae(vcpu); - context->base_role.cr0_wp = is_write_protection(vcpu); - context->base_role.smep_andnot_wp - = smep && !is_write_protection(vcpu); - context->base_role.smap_andnot_wp - = smap && !is_write_protection(vcpu); - context->base_role.guest_mode = is_guest_mode(vcpu); - context->base_role.smm = is_smm(vcpu); + context->base_role.word = mmu_base_role_mask.word & + kvm_calc_shadow_mmu_root_page_role(vcpu).word; reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); +static union kvm_mmu_page_role +kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) +{ + union kvm_mmu_page_role role = vcpu->arch.mmu.base_role; + + role.level = PT64_ROOT_4LEVEL; + role.direct = false; + role.ad_disabled = !accessed_dirty; + role.guest_mode = true; + role.access = ACC_ALL; + + return role; +} + void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty) { struct kvm_mmu *context = &vcpu->arch.mmu; + union kvm_mmu_page_role root_page_role = + kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); MMU_WARN_ON(VALID_PAGE(context->root_hpa)); @@ -4687,8 +4747,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->root_hpa = INVALID_PAGE; context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; - context->base_role.ad_disabled = !accessed_dirty; - context->base_role.guest_mode = 1; + context->base_role.word = root_page_role.word & mmu_base_role_mask.word; update_permission_bitmask(vcpu, context, true); update_pkru_bitmask(vcpu, context, true); update_last_nonleaf_level(vcpu, context); @@ -4761,6 +4820,15 @@ static void init_kvm_mmu(struct kvm_vcpu *vcpu) init_kvm_softmmu(vcpu); } +static union kvm_mmu_page_role +kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) +{ + if (tdp_enabled) + return kvm_calc_tdp_mmu_root_page_role(vcpu); + else + return kvm_calc_shadow_mmu_root_page_role(vcpu); +} + void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); @@ -4941,16 +5009,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush; - union kvm_mmu_page_role mask = { }; - - mask.cr0_wp = 1; - mask.cr4_pae = 1; - mask.nxe = 1; - mask.smep_andnot_wp = 1; - mask.smap_andnot_wp = 1; - mask.smm = 1; - mask.guest_mode = 1; - mask.ad_disabled = 1; /* * If we don't have indirect shadow pages, it means no page is @@ -4994,7 +5052,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_page_zap_pte(vcpu->kvm, sp, spte); if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) - & mask.word) && rmap_can_add(vcpu)) + & mmu_base_role_mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (need_remote_flush(entry, *spte)) remote_flush = true; -- cgit v1.2.1 From 6e42782f516f05c8030f63308f2457681b1c9919 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:08 -0700 Subject: kvm: x86: Introduce KVM_REQ_LOAD_CR3 The KVM_REQ_LOAD_CR3 request loads the hardware CR3 using the current root_hpa. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 3 +-- arch/x86/kvm/mmu.h | 7 +++++++ arch/x86/kvm/x86.c | 2 ++ 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 290b7d05790a..c2b4df8a03cd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -54,6 +54,7 @@ #define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) #define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) #define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) +#define KVM_REQ_LOAD_CR3 KVM_ARCH_REQ(5) #define KVM_REQ_EVENT KVM_ARCH_REQ(6) #define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index afb865d054c2..704f7df11f0b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4847,8 +4847,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (r) goto out; - /* set_cr3() should ensure TLB has been flushed */ - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + kvm_mmu_load_cr3(vcpu); out: return r; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 5b408c0ad612..16b7178853ac 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -85,6 +85,13 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } +static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) +{ + /* set_cr3() should ensure TLB has been flushed */ + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); +} + /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a1e4f79398f..7748037b17fd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7332,6 +7332,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); + if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu)) + kvm_mmu_load_cr3(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb(vcpu, true); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { -- cgit v1.2.1 From 0aab33e4f9459fc80378bc2a089d5784fe8ccd3b Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:09 -0700 Subject: kvm: x86: Add support for fast CR3 switch across different MMU modes This generalizes the lockless CR3 switch path to be able to work across different MMU modes (e.g. nested vs non-nested) by checking that the expected page role of the new root page matches the page role of the previously stored root page in addition to checking that the new CR3 matches the previous CR3. Furthermore, instead of loading the hardware CR3 in fast_cr3_switch(), it is now done in vcpu_enter_guest(), as by that time the MMU context would be up-to-date with the VCPU mode. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 704f7df11f0b..536473411930 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4038,7 +4038,8 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } -static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3) +static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role) { struct kvm_mmu *mmu = &vcpu->arch.mmu; @@ -4057,7 +4058,10 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3) swap(mmu->root_hpa, mmu->prev_root.hpa); mmu->prev_root.cr3 = kvm_read_cr3(vcpu); - if (new_cr3 == prev_cr3 && VALID_PAGE(mmu->root_hpa)) { + if (new_cr3 == prev_cr3 && + VALID_PAGE(mmu->root_hpa) && + page_header(mmu->root_hpa) != NULL && + new_role.word == page_header(mmu->root_hpa)->role.word) { /* * It is possible that the cached previous root page is * obsolete because of a change in the MMU @@ -4066,12 +4070,11 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3) * have set here and allocate a new one. */ + kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); - mmu->set_cr3(vcpu, mmu->root_hpa); - return true; } } @@ -4079,12 +4082,18 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3) return false; } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) +static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role) { - if (!fast_cr3_switch(vcpu, new_cr3)) + if (!fast_cr3_switch(vcpu, new_cr3, new_role)) kvm_mmu_free_roots(vcpu, false); } +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) +{ + __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu)); +} + static unsigned long get_cr3(struct kvm_vcpu *vcpu) { return kvm_read_cr3(vcpu); -- cgit v1.2.1 From 1c53da3fa3a333eb15ee5a154700e75d135c21c8 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:10 -0700 Subject: kvm: x86: Support resetting the MMU context without resetting roots This adds support for re-initializing the MMU context in a different mode while preserving the active root_hpa and the prev_root. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 26 +++++++++----------------- arch/x86/kvm/mmu.h | 1 + 2 files changed, 10 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 536473411930..3d822a97dfa1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4032,8 +4032,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = true; context->nx = false; } @@ -4568,8 +4566,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; context->shadow_root_level = level; - context->root_hpa = INVALID_PAGE; - context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; } @@ -4599,8 +4595,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; context->shadow_root_level = PT32E_ROOT_LEVEL; - context->root_hpa = INVALID_PAGE; - context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; } @@ -4636,8 +4630,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu); - context->root_hpa = INVALID_PAGE; - context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; @@ -4703,8 +4695,6 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = &vcpu->arch.mmu; - MMU_WARN_ON(VALID_PAGE(context->root_hpa)); - if (!is_paging(vcpu)) nonpaging_init_context(vcpu, context); else if (is_long_mode(vcpu)) @@ -4741,8 +4731,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, union kvm_mmu_page_role root_page_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); - MMU_WARN_ON(VALID_PAGE(context->root_hpa)); - context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; @@ -4753,8 +4741,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->invlpg = ept_invlpg; context->update_pte = ept_update_pte; context->root_level = PT64_ROOT_4LEVEL; - context->root_hpa = INVALID_PAGE; - context->prev_root = KVM_MMU_ROOT_INFO_INVALID; context->direct_map = false; context->base_role.word = root_page_role.word & mmu_base_role_mask.word; update_permission_bitmask(vcpu, context, true); @@ -4819,8 +4805,13 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) update_last_nonleaf_level(vcpu, g_context); } -static void init_kvm_mmu(struct kvm_vcpu *vcpu) +void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) { + if (reset_roots) { + vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID; + } + if (mmu_is_nested(vcpu)) init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -4828,6 +4819,7 @@ static void init_kvm_mmu(struct kvm_vcpu *vcpu) else init_kvm_softmmu(vcpu); } +EXPORT_SYMBOL_GPL(kvm_init_mmu); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) @@ -4841,7 +4833,7 @@ kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); - init_kvm_mmu(vcpu); + kvm_init_mmu(vcpu, true); } EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); @@ -5245,7 +5237,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) { MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - init_kvm_mmu(vcpu); + kvm_init_mmu(vcpu, true); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 16b7178853ac..11ab3d62ad65 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -61,6 +61,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value); void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); +void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty); -- cgit v1.2.1 From 50c28f21d045dde8c52548f8482d456b3f0956f5 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:11 -0700 Subject: kvm: x86: Use fast CR3 switch for nested VMX Use the fast CR3 switch mechanism to locklessly change the MMU root page when switching between L1 and L2. The switch from L2 to L1 should always go through the fast path, while the switch from L1 to L2 should go through the fast path if L1's CR3/EPTP for L2 hasn't changed since the last time. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 ++++-- arch/x86/kvm/mmu.h | 2 +- arch/x86/kvm/vmx.c | 16 ++++++++++------ 3 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3d822a97dfa1..9b73cfcef917 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4054,7 +4054,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, return false; swap(mmu->root_hpa, mmu->prev_root.hpa); - mmu->prev_root.cr3 = kvm_read_cr3(vcpu); + mmu->prev_root.cr3 = mmu->get_cr3(vcpu); if (new_cr3 == prev_cr3 && VALID_PAGE(mmu->root_hpa) && @@ -4091,6 +4091,7 @@ void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) { __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu)); } +EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); static unsigned long get_cr3(struct kvm_vcpu *vcpu) { @@ -4725,12 +4726,13 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty) } void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty) + bool accessed_dirty, gpa_t new_eptp) { struct kvm_mmu *context = &vcpu->arch.mmu; union kvm_mmu_page_role root_page_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); + __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role); context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 11ab3d62ad65..3177127cee96 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -64,7 +64,7 @@ reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, - bool accessed_dirty); + bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cbe31b9d2b81..c6ea892a610e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10741,11 +10741,11 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu))) return 1; - kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, - nested_ept_ad_enabled(vcpu)); + nested_ept_ad_enabled(vcpu), + nested_ept_get_cr3(vcpu)); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; @@ -11342,12 +11342,16 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 1; } } - - vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); } - kvm_mmu_reset_context(vcpu); + if (!nested_ept) + kvm_mmu_new_cr3(vcpu, cr3); + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + kvm_init_mmu(vcpu, false); + return 0; } -- cgit v1.2.1 From afe828d1de4047d26eb0cd0c0154f5ac3722bf63 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:12 -0700 Subject: kvm: x86: Add ability to skip TLB flush when switching CR3 Remove the implicit flush from the set_cr3 handlers, so that the callers are able to decide whether to flush the TLB or not. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 ++ arch/x86/kvm/mmu.h | 1 - arch/x86/kvm/svm.c | 4 ---- arch/x86/kvm/vmx.c | 1 - 4 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b73cfcef917..d3a04cf6514b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4070,6 +4070,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_x86_ops->tlb_flush(vcpu, true); __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); @@ -4851,6 +4852,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; kvm_mmu_load_cr3(vcpu); + kvm_x86_ops->tlb_flush(vcpu, true); out: return r; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 3177127cee96..6a2a97d8015b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -88,7 +88,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) { - /* set_cr3() should ensure TLB has been flushed */ if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f059a73f0fd0..be9931cad409 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2884,7 +2884,6 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); - svm_flush_tlb(vcpu, true); } static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -5766,7 +5765,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu, true); } static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) @@ -5779,8 +5777,6 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) /* Also sync guest cr3 here in case we live migrate */ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); mark_dirty(svm->vmcb, VMCB_CR); - - svm_flush_tlb(vcpu, true); } static int is_disabled(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c6ea892a610e..903c130bb811 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5029,7 +5029,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) ept_load_pdptrs(vcpu); } - vmx_flush_tlb(vcpu, true); vmcs_writel(GUEST_CR3, guest_cr3); } -- cgit v1.2.1 From c9470a2e28479e97eb44d926ea7bbb5709ad9d6b Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:13 -0700 Subject: kvm: x86: Propagate guest PCIDs to host PCIDs When using shadow paging mode, propagate the guest's PCID value to the shadow CR3 in the host instead of always using PCID 0. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 6a2a97d8015b..1fab69c0b2f3 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -86,10 +86,25 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } +static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3) +{ + BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0); + + return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE) + ? cr3 & X86_CR3_PCID_MASK + : 0; +} + +static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) +{ + return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu)); +} + static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) { if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) - vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa | + kvm_get_active_pcid(vcpu)); } /* -- cgit v1.2.1 From eb4b248e152d3ecf189b9d32c04961360dbd938a Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:14 -0700 Subject: kvm: vmx: Support INVPCID in shadow paging mode Implement support for INVPCID in shadow paging mode as well. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 18 ++++++++ arch/x86/kvm/vmx.c | 96 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 112 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c2b4df8a03cd..1c253f15f9cd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1317,6 +1317,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3); void kvm_enable_tdp(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d3a04cf6514b..2af15db12ccd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5187,6 +5187,24 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); +void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) +{ + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + if (pcid == kvm_get_active_pcid(vcpu)) { + mmu->invlpg(vcpu, gva); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + ++vcpu->stat.invlpg; + + /* + * Mappings not reachable via the current cr3 will be synced when + * switching to that cr3, so nothing needs to be done here for them. + */ +} +EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); + void kvm_enable_tdp(void) { tdp_enabled = true; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 903c130bb811..80be024cb979 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3065,7 +3065,7 @@ static bool vmx_rdtscp_supported(void) static bool vmx_invpcid_supported(void) { - return cpu_has_vmx_invpcid() && enable_ept; + return cpu_has_vmx_invpcid(); } /* @@ -6101,8 +6101,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; - /* Enable INVPCID for non-ept guests may cause performance regression. */ - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -8756,6 +8754,97 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } +static int handle_invpcid(struct kvm_vcpu *vcpu) +{ + u32 vmx_instruction_info; + unsigned long type; + bool pcid_enabled; + gva_t gva; + struct x86_exception e; + struct { + u64 pcid; + u64 gla; + } operand; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + if (type > 3) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + /* According to the Intel instruction reference, the memory operand + * is read even if it isn't needed (e.g., for type==all) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + + if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + /* + * If the current cr3 does not use the given PCID, then nothing + * needs to be done here because a resync will happen anyway + * before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + /* fall-through */ + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_mmu_unload(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} + static int handle_pml_full(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; @@ -8959,6 +9048,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmfunc, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; -- cgit v1.2.1 From ade61e2824443a208bb3aaafd8b345ce878298cd Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:15 -0700 Subject: kvm: x86: Skip TLB flush on fast CR3 switch when indicated by guest When PCIDs are enabled, the MSb of the source operand for a MOV-to-CR3 instruction indicates that the TLB doesn't need to be flushed. This change enables this optimization for MOV-to-CR3s in the guest that have been intercepted by KVM for shadow paging and are handled within the fast CR3 switch path. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 28 +++++++++++++++++++--------- arch/x86/kvm/vmx.c | 12 ++++++++---- arch/x86/kvm/x86.c | 11 ++++++++--- 4 files changed, 36 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1c253f15f9cd..8b4aa5e7ff92 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1318,7 +1318,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); void kvm_enable_tdp(void); void kvm_disable_tdp(void); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2af15db12ccd..2b7cfc8e41bb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4037,7 +4037,8 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, } static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, - union kvm_mmu_page_role new_role) + union kvm_mmu_page_role new_role, + bool skip_tlb_flush) { struct kvm_mmu *mmu = &vcpu->arch.mmu; @@ -4070,7 +4071,9 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_x86_ops->tlb_flush(vcpu, true); + if (!skip_tlb_flush) + kvm_x86_ops->tlb_flush(vcpu, true); + __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); @@ -4082,15 +4085,17 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, } static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, - union kvm_mmu_page_role new_role) + union kvm_mmu_page_role new_role, + bool skip_tlb_flush) { - if (!fast_cr3_switch(vcpu, new_cr3, new_role)) + if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) kvm_mmu_free_roots(vcpu, false); } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3) +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) { - __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu)); + __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu), + skip_tlb_flush); } EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); @@ -4733,7 +4738,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, union kvm_mmu_page_role root_page_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty); - __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role); + __kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false); context->shadow_root_level = PT64_ROOT_4LEVEL; context->nx = true; @@ -5196,11 +5201,16 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (VALID_PAGE(mmu->prev_root.hpa) && + pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + ++vcpu->stat.invlpg; /* - * Mappings not reachable via the current cr3 will be synced when - * switching to that cr3, so nothing needs to be done here for them. + * Mappings not reachable via the current cr3 or the prev_root.cr3 will + * be synced when switching to that cr3, so nothing needs to be done + * here for them. */ } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 80be024cb979..6151418cec32 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8819,10 +8819,14 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } + if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_root.cr3) + == operand.pcid) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + /* - * If the current cr3 does not use the given PCID, then nothing - * needs to be done here because a resync will happen anyway - * before switching to any other CR3. + * If neither the current cr3 nor the prev_root.cr3 use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. */ return kvm_skip_emulated_instruction(vcpu); @@ -11434,7 +11438,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne } if (!nested_ept) - kvm_mmu_new_cr3(vcpu, cr3); + kvm_mmu_new_cr3(vcpu, cr3, false); vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7748037b17fd..493afbf12e78 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -847,16 +847,21 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + bool skip_tlb_flush = false; #ifdef CONFIG_X86_64 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); - if (pcid_enabled) + if (pcid_enabled) { + skip_tlb_flush = cr3 & CR3_PCID_INVD; cr3 &= ~CR3_PCID_INVD; + } #endif if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + if (!skip_tlb_flush) + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); return 0; } @@ -867,7 +872,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_cr3(vcpu, cr3); + kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); vcpu->arch.cr3 = cr3; __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -- cgit v1.2.1 From 7eb77e9f5fcf652a21b2d12bff1cd509b6b14f21 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:16 -0700 Subject: kvm: x86: Add a root_hpa parameter to kvm_mmu->invlpg() This allows invlpg() to be called using either the active root_hpa or the prev_root_hpa. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 36 +++++++++++++++++++++++++++++------- arch/x86/kvm/paging_tmpl.h | 6 +++--- 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8b4aa5e7ff92..0b77c233e441 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -354,7 +354,7 @@ struct kvm_mmu { struct x86_exception *exception); int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); - void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa); void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2b7cfc8e41bb..6eeca915511e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -189,7 +189,13 @@ static const union kvm_mmu_page_role mmu_base_role_mask = { .ad_disabled = 1, }; -#define for_each_shadow_entry(_vcpu, _addr, _walker) \ +#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \ + for (shadow_walk_init_using_root(&(_walker), (_vcpu), \ + (_root), (_addr)); \ + shadow_walk_okay(&(_walker)); \ + shadow_walk_next(&(_walker))) + +#define for_each_shadow_entry(_vcpu, _addr, _walker) \ for (shadow_walk_init(&(_walker), _vcpu, _addr); \ shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) @@ -1999,7 +2005,7 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 0; } -static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root) { } @@ -2405,11 +2411,12 @@ out: return sp; } -static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, - struct kvm_vcpu *vcpu, u64 addr) +static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, hpa_t root, + u64 addr) { iterator->addr = addr; - iterator->shadow_addr = vcpu->arch.mmu.root_hpa; + iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu.shadow_root_level; if (iterator->level == PT64_ROOT_4LEVEL && @@ -2418,6 +2425,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, --iterator->level; if (iterator->level == PT32E_ROOT_LEVEL) { + /* + * prev_root is currently only used for 64-bit hosts. So only + * the active root_hpa is valid here. + */ + BUG_ON(root != vcpu->arch.mmu.root_hpa); + iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; iterator->shadow_addr &= PT64_BASE_ADDR_MASK; @@ -2427,6 +2440,13 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, } } +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, + struct kvm_vcpu *vcpu, u64 addr) +{ + shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa, + addr); +} + static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { if (iterator->level < PT_PAGE_TABLE_LEVEL) @@ -5186,7 +5206,9 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - vcpu->arch.mmu.invlpg(vcpu, gva); + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + mmu->invlpg(vcpu, gva, mmu->root_hpa); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); ++vcpu->stat.invlpg; } @@ -5197,7 +5219,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) struct kvm_mmu *mmu = &vcpu->arch.mmu; if (pcid == kvm_get_active_pcid(vcpu)) { - mmu->invlpg(vcpu, gva); + mmu->invlpg(vcpu, gva, mmu->root_hpa); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 208f7646ce0d..14ffd973df54 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -856,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -871,13 +871,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) */ mmu_topup_memory_caches(vcpu); - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + if (!VALID_PAGE(root_hpa)) { WARN_ON(1); return; } spin_lock(&vcpu->kvm->mmu_lock); - for_each_shadow_entry(vcpu, gva, iterator) { + for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) { level = iterator.level; sptep = iterator.sptep; -- cgit v1.2.1 From 08fb59d8a47d5e1f9de08659603a47f117fe60d5 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:17 -0700 Subject: kvm: x86: Support selectively freeing either current or previous MMU root kvm_mmu_free_roots() now takes a mask specifying which roots to free, so that either one of the roots (active/previous) can be individually freed when needed. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 +++++- arch/x86/kvm/mmu.c | 36 ++++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0b77c233e441..262b0bc64dfc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1287,6 +1287,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS BIT(1) +#define KVM_MMU_ROOTS_ALL (~0UL) + int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); @@ -1298,7 +1302,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root); +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free); gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6eeca915511e..0f6965ce016a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3438,14 +3438,18 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, *root_hpa = INVALID_PAGE; } -void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root) +/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ +void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) { int i; LIST_HEAD(invalid_list); struct kvm_mmu *mmu = &vcpu->arch.mmu; + bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; + bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS; - if (!VALID_PAGE(mmu->root_hpa) && - (!VALID_PAGE(mmu->prev_root.hpa) || !free_prev_root)) + /* Before acquiring the MMU lock, see if we need to do any real work. */ + if (!(free_active_root && VALID_PAGE(mmu->root_hpa)) && + !(free_prev_root && VALID_PAGE(mmu->prev_root.hpa))) return; spin_lock(&vcpu->kvm->mmu_lock); @@ -3454,15 +3458,19 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, bool free_prev_root) mmu_free_root_page(vcpu->kvm, &mmu->prev_root.hpa, &invalid_list); - if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && - (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { - mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list); - } else { - for (i = 0; i < 4; ++i) - if (mmu->pae_root[i] != 0) - mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i], - &invalid_list); - mmu->root_hpa = INVALID_PAGE; + if (free_active_root) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && + (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { + mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, + &invalid_list); + } else { + for (i = 0; i < 4; ++i) + if (mmu->pae_root[i] != 0) + mmu_free_root_page(vcpu->kvm, + &mmu->pae_root[i], + &invalid_list); + mmu->root_hpa = INVALID_PAGE; + } } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -4109,7 +4117,7 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) { if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush)) - kvm_mmu_free_roots(vcpu, false); + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT); } void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) @@ -4885,7 +4893,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load); void kvm_mmu_unload(struct kvm_vcpu *vcpu) { - kvm_mmu_free_roots(vcpu, true); + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL); WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); } EXPORT_SYMBOL_GPL(kvm_mmu_unload); -- cgit v1.2.1 From 956bf3531fba53c0501eda4fbc67950b0f7b913f Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:18 -0700 Subject: kvm: x86: Skip shadow page resync on CR3 switch when indicated by guest When the guest indicates that the TLB doesn't need to be flushed in a CR3 switch, we can also skip resyncing the shadow page tables since an out-of-sync shadow page table is equivalent to an out-of-sync TLB. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++--- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 6 +++--- 3 files changed, 34 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0f6965ce016a..9446a36a4ab7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4098,9 +4098,19 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, */ kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); - kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - if (!skip_tlb_flush) + if (!skip_tlb_flush) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_x86_ops->tlb_flush(vcpu, true); + } + + /* + * The last MMIO access's GVA and GPA are cached in the + * VCPU. When switching to a new CR3, that GVA->GPA + * mapping may no longer be valid. So clear any cached + * MMIO info even when we don't need to sync the shadow + * page tables. + */ + vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); __clear_sp_write_flooding_count( page_header(mmu->root_hpa)); @@ -5217,6 +5227,21 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) struct kvm_mmu *mmu = &vcpu->arch.mmu; mmu->invlpg(vcpu, gva, mmu->root_hpa); + + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. Since it would take us roughly similar amount + * of work to determine whether the prev_root mapping of the VA is + * marked global, or to just sync it blindly, so we might as well just + * always sync it. + * + * Mappings not reachable via the current cr3 or the prev_root.cr3 will + * be synced when switching to that cr3, so nothing needs to be done + * here for them. + */ + if (VALID_PAGE(mmu->prev_root.hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); ++vcpu->stat.invlpg; } @@ -5232,8 +5257,10 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (VALID_PAGE(mmu->prev_root.hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) + pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) { + mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } ++vcpu->stat.invlpg; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6151418cec32..b81210051133 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8821,7 +8821,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_root.cr3) == operand.pcid) - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_PREVIOUS); /* * If neither the current cr3 nor the prev_root.cr3 use the diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 493afbf12e78..aa5d96b4b386 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -858,10 +858,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) #endif if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { - kvm_mmu_sync_roots(vcpu); - - if (!skip_tlb_flush) + if (!skip_tlb_flush) { + kvm_mmu_sync_roots(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } return 0; } -- cgit v1.2.1 From faff87588d8bfd9e56e9203412f0bb80455da7b9 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Fri, 29 Jun 2018 13:10:05 -0700 Subject: kvm: x86: Flush only affected TLB entries in kvm_mmu_invlpg* This needs a minor bug fix. The updated patch is as follows. Thanks, Junaid ------------------------------------------------------------------------------ kvm_mmu_invlpg() and kvm_mmu_invpcid_gva() only need to flush the TLB entries for the specific guest virtual address, instead of flushing all TLB entries associated with the VM. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 ++++++++ arch/x86/kvm/mmu.c | 14 +++++++++++--- arch/x86/kvm/svm.c | 8 ++++++++ arch/x86/kvm/vmx.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 262b0bc64dfc..dcdc9150bb76 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -985,6 +985,14 @@ struct kvm_x86_ops { void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + /* + * Flush any TLB entries associated with the given GVA. + * Does not need to flush GPA->HPA mappings. + * Can potentially get non-canonical addresses through INVLPGs, which + * the implementation may choose to ignore if appropriate. + */ + void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); + void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9446a36a4ab7..f84c194e6db1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5226,6 +5226,10 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_mmu *mmu = &vcpu->arch.mmu; + /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ + if (is_noncanonical_address(gva, vcpu)) + return; + mmu->invlpg(vcpu, gva, mmu->root_hpa); /* @@ -5242,7 +5246,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) if (VALID_PAGE(mmu->prev_root.hpa)) mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_x86_ops->tlb_flush_gva(vcpu, gva); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -5250,18 +5254,22 @@ EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = &vcpu->arch.mmu; + bool tlb_flush = false; if (pcid == kvm_get_active_pcid(vcpu)) { mmu->invlpg(vcpu, gva, mmu->root_hpa); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + tlb_flush = true; } if (VALID_PAGE(mmu->prev_root.hpa) && pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) { mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + tlb_flush = true; } + if (tlb_flush) + kvm_x86_ops->tlb_flush_gva(vcpu, gva); + ++vcpu->stat.invlpg; /* diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index be9931cad409..73e27a98456f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5434,6 +5434,13 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) svm->asid_generation--; } +static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + invlpga(gva, svm->vmcb->control.asid); +} + static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { } @@ -7086,6 +7093,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .set_rflags = svm_set_rflags, .tlb_flush = svm_flush_tlb, + .tlb_flush_gva = svm_flush_tlb_gva, .run = svm_vcpu_run, .handle_exit = handle_exit, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b81210051133..5aea5af02386 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1992,6 +1992,19 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } +static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr) +{ + if (vpid == 0) + return true; + + if (cpu_has_vmx_invvpid_individual_addr()) { + __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr); + return true; + } + + return false; +} + static inline void vpid_sync_vcpu_single(int vpid) { if (vpid == 0) @@ -4833,6 +4846,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } +static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) +{ + int vpid = to_vmx(vcpu)->vpid; + + if (!vpid_sync_vcpu_addr(vpid, addr)) + vpid_sync_context(vpid); + + /* + * If VPIDs are not supported or enabled, then the above is a no-op. + * But we don't really need a TLB flush in that case anyway, because + * each VM entry/exit includes an implicit flush when VPID is 0. + */ +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -13603,6 +13630,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .set_rflags = vmx_set_rflags, .tlb_flush = vmx_flush_tlb, + .tlb_flush_gva = vmx_flush_tlb_gva, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, -- cgit v1.2.1 From b94742c958f0b97d304d4aecb4603a20ee9a2df3 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:20 -0700 Subject: kvm: x86: Add multi-entry LRU cache for previous CR3s Adds support for storing multiple previous CR3/root_hpa pairs maintained as an LRU cache, so that the lockless CR3 switch path can be used when switching back to any of them. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 10 ++-- arch/x86/kvm/mmu.c | 111 ++++++++++++++++++++++++++++------------ arch/x86/kvm/vmx.c | 12 +++-- 3 files changed, 92 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dcdc9150bb76..4f1983640bda 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -335,6 +335,8 @@ struct kvm_mmu_root_info { #define KVM_MMU_ROOT_INFO_INVALID \ ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) +#define KVM_MMU_NUM_PREV_ROOTS 3 + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the @@ -363,7 +365,7 @@ struct kvm_mmu { u8 shadow_root_level; u8 ept_ad; bool direct_map; - struct kvm_mmu_root_info prev_root; + struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; /* * Bitmap; bit set = permission fault @@ -1295,9 +1297,9 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state, return !!(*irq_state); } -#define KVM_MMU_ROOT_CURRENT BIT(0) -#define KVM_MMU_ROOT_PREVIOUS BIT(1) -#define KVM_MMU_ROOTS_ALL (~0UL) +#define KVM_MMU_ROOT_CURRENT BIT(0) +#define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) +#define KVM_MMU_ROOTS_ALL (~0UL) int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f84c194e6db1..687bfb03b9ca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3445,18 +3445,26 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free) LIST_HEAD(invalid_list); struct kvm_mmu *mmu = &vcpu->arch.mmu; bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; - bool free_prev_root = roots_to_free & KVM_MMU_ROOT_PREVIOUS; + + BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); /* Before acquiring the MMU lock, see if we need to do any real work. */ - if (!(free_active_root && VALID_PAGE(mmu->root_hpa)) && - !(free_prev_root && VALID_PAGE(mmu->prev_root.hpa))) - return; + if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && + VALID_PAGE(mmu->prev_roots[i].hpa)) + break; + + if (i == KVM_MMU_NUM_PREV_ROOTS) + return; + } spin_lock(&vcpu->kvm->mmu_lock); - if (free_prev_root) - mmu_free_root_page(vcpu->kvm, &mmu->prev_root.hpa, - &invalid_list); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) + mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa, + &invalid_list); if (free_active_root) { if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && @@ -4064,6 +4072,38 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } +/* + * Find out if a previously cached root matching the new CR3/role is available. + * The current root is also inserted into the cache. + * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is + * returned. + * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and + * false is returned. This root should now be freed by the caller. + */ +static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, + union kvm_mmu_page_role new_role) +{ + uint i; + struct kvm_mmu_root_info root; + struct kvm_mmu *mmu = &vcpu->arch.mmu; + + root.cr3 = mmu->get_cr3(vcpu); + root.hpa = mmu->root_hpa; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + swap(root, mmu->prev_roots[i]); + + if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) && + page_header(root.hpa) != NULL && + new_role.word == page_header(root.hpa)->role.word) + break; + } + + mmu->root_hpa = root.hpa; + + return i < KVM_MMU_NUM_PREV_ROOTS; +} + static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, union kvm_mmu_page_role new_role, bool skip_tlb_flush) @@ -4077,18 +4117,10 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && mmu->root_level >= PT64_ROOT_4LEVEL) { - gpa_t prev_cr3 = mmu->prev_root.cr3; - if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT)) return false; - swap(mmu->root_hpa, mmu->prev_root.hpa); - mmu->prev_root.cr3 = mmu->get_cr3(vcpu); - - if (new_cr3 == prev_cr3 && - VALID_PAGE(mmu->root_hpa) && - page_header(mmu->root_hpa) != NULL && - new_role.word == page_header(mmu->root_hpa)->role.word) { + if (cached_root_available(vcpu, new_cr3, new_role)) { /* * It is possible that the cached previous root page is * obsolete because of a change in the MMU @@ -4854,8 +4886,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots) { if (reset_roots) { + uint i; + vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; } if (mmu_is_nested(vcpu)) @@ -5225,6 +5261,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_mmu *mmu = &vcpu->arch.mmu; + int i; /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ if (is_noncanonical_address(gva, vcpu)) @@ -5235,16 +5272,17 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) /* * INVLPG is required to invalidate any global mappings for the VA, * irrespective of PCID. Since it would take us roughly similar amount - * of work to determine whether the prev_root mapping of the VA is - * marked global, or to just sync it blindly, so we might as well just - * always sync it. + * of work to determine whether any of the prev_root mappings of the VA + * is marked global, or to just sync it blindly, so we might as well + * just always sync it. * - * Mappings not reachable via the current cr3 or the prev_root.cr3 will - * be synced when switching to that cr3, so nothing needs to be done - * here for them. + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. */ - if (VALID_PAGE(mmu->prev_root.hpa)) - mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (VALID_PAGE(mmu->prev_roots[i].hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); kvm_x86_ops->tlb_flush_gva(vcpu, gva); ++vcpu->stat.invlpg; @@ -5255,16 +5293,19 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = &vcpu->arch.mmu; bool tlb_flush = false; + uint i; if (pcid == kvm_get_active_pcid(vcpu)) { mmu->invlpg(vcpu, gva, mmu->root_hpa); tlb_flush = true; } - if (VALID_PAGE(mmu->prev_root.hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_root.cr3)) { - mmu->invlpg(vcpu, gva, mmu->prev_root.hpa); - tlb_flush = true; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + if (VALID_PAGE(mmu->prev_roots[i].hpa) && + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) { + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + tlb_flush = true; + } } if (tlb_flush) @@ -5273,9 +5314,9 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) ++vcpu->stat.invlpg; /* - * Mappings not reachable via the current cr3 or the prev_root.cr3 will - * be synced when switching to that cr3, so nothing needs to be done - * here for them. + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. */ } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); @@ -5321,12 +5362,16 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) int kvm_mmu_create(struct kvm_vcpu *vcpu) { + uint i; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; - vcpu->arch.mmu.prev_root = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.mmu.translate_gpa = translate_gpa; vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; + return alloc_mmu_pages(vcpu); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5aea5af02386..97e4d71431a1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8788,6 +8788,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) bool pcid_enabled; gva_t gva; struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; struct { u64 pcid; u64 gla; @@ -8846,12 +8848,14 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } - if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_root.cr3) - == operand.pcid) - kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_PREVIOUS); + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + kvm_mmu_free_roots(vcpu, roots_to_free); /* - * If neither the current cr3 nor the prev_root.cr3 use the + * If neither the current cr3 nor any of the prev_roots use the * given PCID, then nothing needs to be done here because a * resync will happen anyway before switching to any other CR3. */ -- cgit v1.2.1 From 208320ba103e01fd2f3a7b81e97c9c5bc85f0612 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 27 Jun 2018 14:59:21 -0700 Subject: kvm: x86: Remove CR3_PCID_INVD flag It is a duplicate of X86_CR3_PCID_NOFLUSH. So just use that instead. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f1983640bda..75ab578c3f6a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -83,7 +83,6 @@ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR3_PCID_INVD BIT_64(63) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4c4f4263420c..106482da6388 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4191,7 +4191,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) maxphyaddr = 36; rsvd = rsvd_bits(maxphyaddr, 63); if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE) - rsvd &= ~CR3_PCID_INVD; + rsvd &= ~X86_CR3_PCID_NOFLUSH; } if (new_val & rsvd) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index aa5d96b4b386..6cc29dd21519 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -852,8 +852,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); if (pcid_enabled) { - skip_tlb_flush = cr3 & CR3_PCID_INVD; - cr3 &= ~CR3_PCID_INVD; + skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; + cr3 &= ~X86_CR3_PCID_NOFLUSH; } #endif -- cgit v1.2.1 From 450917b654c12075bd2b07d1e375dfed1bc9ca1e Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Wed, 18 Jul 2018 06:12:04 +0000 Subject: KVM/MMU: Simplify __kvm_sync_page() function Merge check of "sp->role.cr4_pae != !!is_pae(vcpu))" and "vcpu-> arch.mmu.sync_page(vcpu, sp) == 0". kvm_mmu_prepare_zap_page() is called under both these conditions. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 687bfb03b9ca..22a7984c68a5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2136,12 +2136,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list) { - if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); - return false; - } - - if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { + if (sp->role.cr4_pae != !!is_pae(vcpu) + || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return false; } -- cgit v1.2.1 From 3553ae5690a84a5baae5baa329467b3df2d99f72 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 Jul 2018 17:59:27 -0400 Subject: x86/kvm: Don't use pvqspinlock code if only 1 vCPU On a VM with only 1 vCPU, the locking fast path will always be successful. In this case, there is no need to use the the PV qspinlock code which has higher overhead on the unlock side than the native qspinlock code. Signed-off-by: Waiman Long Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5b2300b818af..575c9a51e6ba 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -748,6 +748,10 @@ void __init kvm_spinlock_init(void) if (kvm_para_has_hint(KVM_HINTS_REALTIME)) return; + /* Don't use the pvqspinlock code if there is only 1 vCPU. */ + if (num_possible_cpus() == 1) + return; + __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); -- cgit v1.2.1 From eb914cfe72f4c948b2318b1381f6d2e08d43b63c Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 19 Jul 2018 08:40:06 +0000 Subject: X86/Hyper-V: Add flush HvFlushGuestPhysicalAddressSpace hypercall support Hyper-V supports a pv hypercall HvFlushGuestPhysicalAddressSpace to flush nested VM address space mapping in l1 hypervisor and it's to reduce overhead of flushing ept tlb among vcpus. This patch is to implement it. Signed-off-by: Lan Tianyu Acked-by: K. Y. Srinivasan Signed-off-by: Paolo Bonzini --- arch/x86/hyperv/Makefile | 2 +- arch/x86/hyperv/nested.c | 53 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/hyperv-tlfs.h | 8 ++++++ arch/x86/include/asm/mshyperv.h | 2 ++ 4 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 arch/x86/hyperv/nested.c (limited to 'arch/x86') diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile index b173d404e3df..b21ee65c4101 100644 --- a/arch/x86/hyperv/Makefile +++ b/arch/x86/hyperv/Makefile @@ -1,2 +1,2 @@ -obj-y := hv_init.o mmu.o +obj-y := hv_init.o mmu.o nested.o obj-$(CONFIG_X86_64) += hv_apic.o diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c new file mode 100644 index 000000000000..08f914217518 --- /dev/null +++ b/arch/x86/hyperv/nested.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Hyper-V nested virtualization code. + * + * Copyright (C) 2018, Microsoft, Inc. + * + * Author : Lan Tianyu + */ + + +#include +#include +#include +#include + +int hyperv_flush_guest_mapping(u64 as) +{ + struct hv_guest_mapping_flush **flush_pcpu; + struct hv_guest_mapping_flush *flush; + u64 status; + unsigned long flags; + int ret = -ENOTSUPP; + + if (!hv_hypercall_pg) + goto fault; + + local_irq_save(flags); + + flush_pcpu = (struct hv_guest_mapping_flush **) + this_cpu_ptr(hyperv_pcpu_input_arg); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto fault; + } + + flush->address_space = as; + flush->flags = 0; + + status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE, + flush, NULL); + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + ret = 0; + +fault: + return ret; +} +EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index b8c89265baf0..08e24f552030 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -309,6 +309,7 @@ struct ms_hyperv_tsc_page { #define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 /* Nested features (CPUID 0x4000000A) EAX */ +#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18) #define HV_X64_NESTED_MSR_BITMAP BIT(19) struct hv_reenlightenment_control { @@ -350,6 +351,7 @@ struct hv_tsc_emulation_status { #define HVCALL_SEND_IPI_EX 0x0015 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 @@ -741,6 +743,12 @@ struct ipi_arg_ex { struct hv_vpset vp_set; }; +/* HvFlushGuestPhysicalAddressSpace hypercalls */ +struct hv_guest_mapping_flush { + u64 address_space; + u64 flags; +}; + /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ struct hv_tlb_flush { u64 address_space; diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 5a7375ed5f7c..2519f5de1e57 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -305,6 +305,7 @@ void hyperv_reenlightenment_intr(struct pt_regs *regs); void set_hv_tscchange_cb(void (*cb)(void)); void clear_hv_tscchange_cb(void); void hyperv_stop_tsc_emulation(void); +int hyperv_flush_guest_mapping(u64 as); #ifdef CONFIG_X86_64 void hv_apic_init(void); @@ -324,6 +325,7 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu) { return NULL; } +static inline int hyperv_flush_guest_mapping(u64 as) { return -1; } #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE -- cgit v1.2.1 From 60cfce4c4f6f6741882032ee6f895e835533a16b Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 19 Jul 2018 08:40:12 +0000 Subject: X86/Hyper-V: Add hyperv_nested_flush_guest_mapping ftrace support This patch is to add hyperv_nested_flush_guest_mapping support to trace hvFlushGuestPhysicalAddressSpace hypercall. Signed-off-by: Lan Tianyu Acked-by: K. Y. Srinivasan Signed-off-by: Paolo Bonzini --- arch/x86/hyperv/nested.c | 3 +++ arch/x86/include/asm/trace/hyperv.h | 14 ++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c index 08f914217518..b8e60cc50461 100644 --- a/arch/x86/hyperv/nested.c +++ b/arch/x86/hyperv/nested.c @@ -14,6 +14,8 @@ #include #include +#include + int hyperv_flush_guest_mapping(u64 as) { struct hv_guest_mapping_flush **flush_pcpu; @@ -48,6 +50,7 @@ int hyperv_flush_guest_mapping(u64 as) ret = 0; fault: + trace_hyperv_nested_flush_guest_mapping(as, ret); return ret; } EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping); diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h index 4253bca99989..e1ffe61de8d6 100644 --- a/arch/x86/include/asm/trace/hyperv.h +++ b/arch/x86/include/asm/trace/hyperv.h @@ -28,6 +28,20 @@ TRACE_EVENT(hyperv_mmu_flush_tlb_others, __entry->addr, __entry->end) ); +TRACE_EVENT(hyperv_nested_flush_guest_mapping, + TP_PROTO(u64 as, int ret), + TP_ARGS(as, ret), + + TP_STRUCT__entry( + __field(u64, as) + __field(int, ret) + ), + TP_fast_assign(__entry->as = as; + __entry->ret = ret; + ), + TP_printk("address space %llx ret %d", __entry->as, __entry->ret) + ); + #endif /* CONFIG_HYPERV */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.1 From b08660e59dbdb600c55953787ed2265a0b510f77 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 19 Jul 2018 08:40:17 +0000 Subject: KVM: x86: Add tlb remote flush callback in kvm_x86_ops. This patch is to provide a way for platforms to register hv tlb remote flush callback and this helps to optimize operation of tlb flush among vcpus for nested virtualization case. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 75ab578c3f6a..150937e64f63 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -985,6 +985,7 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + int (*tlb_remote_flush)(struct kvm *kvm); /* * Flush any TLB entries associated with the given GVA. @@ -1145,6 +1146,16 @@ static inline void kvm_arch_free_vm(struct kvm *kvm) return kvm_x86_ops->vm_free(kvm); } +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB +static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) +{ + if (kvm_x86_ops->tlb_remote_flush && + !kvm_x86_ops->tlb_remote_flush(kvm)) + return 0; + else + return -ENOTSUPP; +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); -- cgit v1.2.1 From 877ad952be3d51445f6a74dd63708a9327c8f19d Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 19 Jul 2018 08:40:23 +0000 Subject: KVM: vmx: Add tlb_remote_flush callback support Register tlb_remote_flush callback for vmx when hyperv capability of nested guest mapping flush is detected. The interface can help to reduce overhead when flush ept table among vcpus for nested VM. The tradition way is to send IPIs to all affected vcpus and executes INVEPT on each vcpus. It will trigger several vmexits for IPI and INVEPT emulation. Hyper-V provides such hypercall to do flush for all vcpus and call the hypercall when all ept table pointers of single VM are same. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 97e4d71431a1..6a95e6b1a43d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -188,12 +188,21 @@ module_param(ple_window_max, uint, 0444); extern const ulong vmx_return; +enum ept_pointers_status { + EPT_POINTERS_CHECK = 0, + EPT_POINTERS_MATCH = 1, + EPT_POINTERS_MISMATCH = 2 +}; + struct kvm_vmx { struct kvm kvm; unsigned int tss_addr; bool ept_identity_pagetable_done; gpa_t ept_identity_map_addr; + + enum ept_pointers_status ept_pointers_match; + spinlock_t ept_pointer_lock; }; #define NR_AUTOLOAD_MSRS 8 @@ -863,6 +872,7 @@ struct vcpu_vmx { */ u64 msr_ia32_feature_control; u64 msr_ia32_feature_control_valid_bits; + u64 ept_pointer; }; enum segment_cache_field { @@ -1357,6 +1367,48 @@ static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) * GUEST_IA32_RTIT_CTL = 0x00002814, */ } + +/* check_ept_pointer() should be under protection of ept_pointer_lock. */ +static void check_ept_pointer_match(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + u64 tmp_eptp = INVALID_PAGE; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!VALID_PAGE(tmp_eptp)) { + tmp_eptp = to_vmx(vcpu)->ept_pointer; + } else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) { + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_MISMATCH; + return; + } + } + + to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH; +} + +static int vmx_hv_remote_flush_tlb(struct kvm *kvm) +{ + int ret; + + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + + if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK) + check_ept_pointer_match(kvm); + + if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) { + ret = -ENOTSUPP; + goto out; + } + + ret = hyperv_flush_guest_mapping( + to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer); + +out: + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + return ret; +} #else /* !IS_ENABLED(CONFIG_HYPERV) */ static inline void evmcs_write64(unsigned long field, u64 value) {} static inline void evmcs_write32(unsigned long field, u32 value) {} @@ -5041,6 +5093,7 @@ static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + struct kvm *kvm = vcpu->kvm; unsigned long guest_cr3; u64 eptp; @@ -5048,11 +5101,20 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (enable_ept) { eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); + + if (kvm_x86_ops->tlb_remote_flush) { + spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); + to_vmx(vcpu)->ept_pointer = eptp; + to_kvm_vmx(kvm)->ept_pointers_match + = EPT_POINTERS_CHECK; + spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); + } + if (enable_unrestricted_guest || is_paging(vcpu) || is_guest_mode(vcpu)) guest_cr3 = kvm_read_cr3(vcpu); else - guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr; + guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; ept_load_pdptrs(vcpu); } @@ -7622,6 +7684,12 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); +#if IS_ENABLED(CONFIG_HYPERV) + if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH + && enable_ept) + kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb; +#endif + if (!cpu_has_vmx_ple()) { ple_gap = 0; ple_window = 0; @@ -10665,6 +10733,8 @@ free_vcpu: static int vmx_vm_init(struct kvm *kvm) { + spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock); + if (!ple_gap) kvm->arch.pause_in_guest = true; return 0; -- cgit v1.2.1 From 36090bf43a6b835a42f515cb515ff6fa293a25fe Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 27 Jul 2018 09:18:50 -0700 Subject: kvm: nVMX: Fix fault vector for VMX operation at CPL > 0 The fault that should be raised for a privilege level violation is #GP rather than #UD. Fixes: 727ba748e110b4 ("kvm: nVMX: Enforce cpl=0 for VMX instructions") Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6a95e6b1a43d..d19bb602ff07 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8096,7 +8096,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) /* CPL=0 must be checked manually. */ if (vmx_get_cpl(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); + kvm_inject_gp(vcpu, 0); return 1; } @@ -8160,7 +8160,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { if (vmx_get_cpl(vcpu)) { - kvm_queue_exception(vcpu, UD_VECTOR); + kvm_inject_gp(vcpu, 0); return 0; } -- cgit v1.2.1 From e49fcb8b9ef26dfb2d02b173d790e1ef41177121 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 27 Jul 2018 13:44:45 -0700 Subject: kvm: nVMX: Fix fault priority for VMX operations When checking emulated VMX instructions for faults, the #UD for "IF (not in VMX operation)" should take precedence over the #GP for "ELSIF CPL > 0." Suggested-by: Eric Northup Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d19bb602ff07..ccc9d75124a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8159,15 +8159,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu) */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); + if (!to_vmx(vcpu)->nested.vmxon) { + kvm_queue_exception(vcpu, UD_VECTOR); return 0; } - if (!to_vmx(vcpu)->nested.vmxon) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (vmx_get_cpl(vcpu)) { + kvm_inject_gp(vcpu, 0); return 0; } + return 1; } -- cgit v1.2.1 From e368b875a8a91125f1bd0ffac1100c305c0bdff4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:41 -0700 Subject: KVM: vmx: refactor segmentation code in vmx_save_host_state() Use local variables in vmx_save_host_state() to temporarily track the selector and base values for FS and GS, and reorganize the code so that the 64-bit vs 32-bit portions are contained within a single #ifdef. This refactoring paves the way for future patches to modify the updating of VMCS state with minimal changes to the code, and (hopefully) simplifies resolving a likely conflict with another in-flight patch[1] by being the whipping boy for future patches. [1] https://www.spinics.net/lists/kvm/msg171647.html Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 53 ++++++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ccc9d75124a6..06fd598ee8a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2662,8 +2662,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); - unsigned long fs_base, kernel_gs_base; #endif + unsigned long fs_base, gs_base; + u16 fs_sel, gs_sel; int i; if (vmx->host_state.loaded) @@ -2678,49 +2679,51 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; #ifdef CONFIG_X86_64 + savesegment(ds, vmx->host_state.ds_sel); + savesegment(es, vmx->host_state.es_sel); + + gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { save_fsgs_for_kvm(); - vmx->host_state.fs_sel = current->thread.fsindex; - vmx->host_state.gs_sel = current->thread.gsindex; + fs_sel = current->thread.fsindex; + gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; - kernel_gs_base = current->thread.gsbase; + vmx->msr_host_kernel_gs_base = current->thread.gsbase; } else { -#endif - savesegment(fs, vmx->host_state.fs_sel); - savesegment(gs, vmx->host_state.gs_sel); -#ifdef CONFIG_X86_64 + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); fs_base = read_msr(MSR_FS_BASE); - kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); + vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE); } + + if (is_long_mode(&vmx->vcpu)) + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#else + savesegment(fs, fs_sel); + savesegment(gs, gs_sel); + fs_base = segment_base(fs_sel); + gs_base = segment_base(gs_sel); #endif - if (!(vmx->host_state.fs_sel & 7)) { - vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); + + vmx->host_state.fs_sel = fs_sel; + if (!(fs_sel & 7)) { + vmcs_write16(HOST_FS_SELECTOR, fs_sel); vmx->host_state.fs_reload_needed = 0; } else { vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - if (!(vmx->host_state.gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); + vmx->host_state.gs_sel = gs_sel; + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); else { vmcs_write16(HOST_GS_SELECTOR, 0); vmx->host_state.gs_ldt_reload_needed = 1; } -#ifdef CONFIG_X86_64 - savesegment(ds, vmx->host_state.ds_sel); - savesegment(es, vmx->host_state.es_sel); - vmcs_writel(HOST_FS_BASE, fs_base); - vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu)); + vmcs_writel(HOST_GS_BASE, gs_base); - vmx->msr_host_kernel_gs_base = kernel_gs_base; - if (is_long_mode(&vmx->vcpu)) - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); -#else - vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel)); - vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel)); -#endif for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, vmx->guest_msrs[i].data, -- cgit v1.2.1 From bd9966de4e14fb559e89a06f7f5c9aab2cc028b9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:42 -0700 Subject: KVM: vmx: track host_state.loaded using a loaded_vmcs pointer Using 'struct loaded_vmcs*' to track whether the CPU registers contain host or guest state kills two birds with one stone. 1. The (effective) boolean host_state.loaded is poorly named. It does not track whether or not host state is loaded into the CPU registers (which most readers would expect), but rather tracks if host state has been saved AND guest state is loaded. 2. Using a loaded_vmcs pointer provides a more robust framework for the optimized guest/host state switching, especially when consideration per-VMCS enhancements. To that end, WARN_ONCE if we try to switch to host state with a different VMCS than was last used to save host state. Resolve an occurrence of the new WARN by setting loaded_vmcs after the call to vmx_vcpu_put() in vmx_switch_vmcs(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06fd598ee8a6..935b84f40f65 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -804,18 +804,22 @@ struct vcpu_vmx { /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested - * guest (L2), it points to a different VMCS. + * guest (L2), it points to a different VMCS. loaded_cpu_state points + * to the VMCS whose state is loaded into the CPU registers that only + * need to be switched when transitioning to/from the kernel; a NULL + * value indicates that host state is loaded. */ struct loaded_vmcs vmcs01; struct loaded_vmcs *loaded_vmcs; + struct loaded_vmcs *loaded_cpu_state; bool __launched; /* temporary, used in vmx_vcpu_run */ struct msr_autoload { unsigned nr; struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; } msr_autoload; + struct { - int loaded; u16 fs_sel, gs_sel, ldt_sel; #ifdef CONFIG_X86_64 u16 ds_sel, es_sel; @@ -2667,10 +2671,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; - if (vmx->host_state.loaded) + if (vmx->loaded_cpu_state) return; - vmx->host_state.loaded = 1; + vmx->loaded_cpu_state = vmx->loaded_vmcs; + /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. @@ -2732,11 +2737,14 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - if (!vmx->host_state.loaded) + if (!vmx->loaded_cpu_state) return; + WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); + ++vmx->vcpu.stat.host_state_reload; - vmx->host_state.loaded = 0; + vmx->loaded_cpu_state = NULL; + #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); @@ -10596,8 +10604,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); + vmx->loaded_vmcs = vmcs; vmx_vcpu_load(vcpu, cpu); put_cpu(); } -- cgit v1.2.1 From 678e315e78a780dbef384b92339c8414309dbc11 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:43 -0700 Subject: KVM: vmx: add dedicated utility to access guest's kernel_gs_base When lazy save/restore of MSR_KERNEL_GS_BASE was introduced[1], the MSR was intercepted in all modes and was only restored for the host when the guest is in 64-bit mode. So at the time, going through the full host restore prior to accessing MSR_KERNEL_GS_BASE was necessary to load host state and was not a significant waste of cycles. Later, MSR_KERNEL_GS_BASE interception was disabled for a 64-bit guest[2], and then unconditionally saved/restored for the host[3]. As a result, loading full host state is overkill for accesses to MSR_KERNEL_GS_BASE, and completely unnecessary when the guest is not in 64-bit mode. Add a dedicated utility to read/write the guest's MSR_KERNEL_GS_BASE (outside of the save/restore flow) to minimize the overhead incurred when accessing the MSR. When setting EFER, only decache the MSR if the new EFER will disable long mode. Removing out-of-band usage of vmx_load_host_state() also eliminates, or at least reduces, potential corner cases in its usage, which in turn will (hopefuly) make it easier to reason about future changes to the save/restore flow, e.g. optimization of saving host state. [1] commit 44ea2b1758d8 ("KVM: VMX: Move MSR_KERNEL_GS_BASE out of the vmx autoload msr area") [2] commit 5897297bc228 ("KVM: VMX: Don't intercept MSR_KERNEL_GS_BASE") [3] commit c8770e7ba63b ("KVM: VMX: Fix host userspace gsbase corruption") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 935b84f40f65..72c392525f61 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2772,13 +2772,31 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) load_fixmap_gdt(raw_smp_processor_id()); } -static void vmx_load_host_state(struct vcpu_vmx *vmx) +#ifdef CONFIG_X86_64 +static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx) { - preempt_disable(); - __vmx_load_host_state(vmx); - preempt_enable(); + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + rdmsrl(MSR_KERNEL_GS_BASE, + vmx->msr_guest_kernel_gs_base); + preempt_enable(); + } + return vmx->msr_guest_kernel_gs_base; } +static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data) +{ + if (is_long_mode(&vmx->vcpu)) { + preempt_disable(); + if (vmx->loaded_cpu_state) + wrmsrl(MSR_KERNEL_GS_BASE, data); + preempt_enable(); + } + vmx->msr_guest_kernel_gs_base = data; +} +#endif + static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); @@ -3857,8 +3875,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - msr_info->data = vmx->msr_guest_kernel_gs_base; + msr_info->data = vmx_read_guest_kernel_gs_base(vmx); break; #endif case MSR_EFER: @@ -3958,8 +3975,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(vmx); - vmx->msr_guest_kernel_gs_base = data; + vmx_write_guest_kernel_gs_base(vmx, data); break; #endif case MSR_IA32_SYSENTER_CS: @@ -4849,10 +4865,18 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) return; /* - * Force kernel_gs_base reloading before EFER changes, as control - * of this msr depends on is_long_mode(). + * MSR_KERNEL_GS_BASE is not intercepted when the guest is in + * 64-bit mode as a 64-bit kernel may frequently access the + * MSR. This means we need to manually save/restore the MSR + * when switching between guest and host state, but only if + * the guest is in 64-bit mode. Sync our cached value if the + * guest is transitioning to 32-bit mode and the CPU contains + * guest state, i.e. the cache is stale. */ - vmx_load_host_state(to_vmx(vcpu)); +#ifdef CONFIG_X86_64 + if (!(efer & EFER_LMA)) + (void)vmx_read_guest_kernel_gs_base(vmx); +#endif vcpu->arch.efer = efer; if (efer & EFER_LMA) { vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); -- cgit v1.2.1 From 6d6095bd2c915f4c2ba6f25e7d16e38b06c904de Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:44 -0700 Subject: KVM: vmx: rename __vmx_load_host_state() and vmx_save_host_state() Now that the vmx_load_host_state() wrapper is gone, i.e. the only time we call the core functions is when we're actually about to switch between guest/host, rename the functions that handle lazy state switching to vmx_prepare_switch_to_{guest,host}_state() to better document the full extent of their functionality. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 72c392525f61..ab27f9734ac0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2661,7 +2661,7 @@ static unsigned long segment_base(u16 selector) } #endif -static void vmx_save_host_state(struct kvm_vcpu *vcpu) +static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); #ifdef CONFIG_X86_64 @@ -2735,7 +2735,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmx->guest_msrs[i].mask); } -static void __vmx_load_host_state(struct vcpu_vmx *vmx) +static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { if (!vmx->loaded_cpu_state) return; @@ -2938,7 +2938,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { vmx_vcpu_pi_put(vcpu); - __vmx_load_host_state(to_vmx(vcpu)); + vmx_prepare_switch_to_host(to_vmx(vcpu)); } static bool emulation_required(struct kvm_vcpu *vcpu) @@ -6099,8 +6099,8 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) #ifdef CONFIG_X86_64 /* * Load null selectors, so we can avoid reloading them in - * __vmx_load_host_state(), in case userspace uses the null selectors - * too (the expected case). + * vmx_prepare_switch_to_host(), in case userspace uses + * the null selectors too (the expected case). */ vmcs_write16(HOST_DS_SELECTOR, 0); vmcs_write16(HOST_ES_SELECTOR, 0); @@ -10565,9 +10565,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * The sysexit path does not restore ds/es, so we must set them to * a reasonable value ourselves. * - * We can't defer this to vmx_load_host_state() since that function - * may be executed in interrupt context, which saves and restore segments - * around it, nullifying its effect. + * We can't defer this to vmx_prepare_switch_to_host() since that + * function may be executed in interrupt context, which saves and + * restore segments around it, nullifying its effect. */ loadsegment(ds, __USER_DS); loadsegment(es, __USER_DS); @@ -11668,7 +11668,8 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) * Set host-state according to L0's settings (vmcs12 is irrelevant here) * Some constant fields are set here by vmx_set_constant_host_state(). * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. + * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest() + * is called. */ vmx_set_constant_host_state(vmx); @@ -13707,7 +13708,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .vcpu_free = vmx_free_vcpu, .vcpu_reset = vmx_vcpu_reset, - .prepare_guest_switch = vmx_save_host_state, + .prepare_guest_switch = vmx_prepare_switch_to_guest, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, -- cgit v1.2.1 From fd1ec7723fbd560f924769b43cbdfe82dfd6a98e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:45 -0700 Subject: KVM: nVMX: remove a misleading comment regarding vmcs02 fields prepare_vmcs02() has an odd comment that says certain fields are "not in vmcs02". AFAICT the intent of the comment is to document that various VMCS fields are not handled by prepare_vmcs02(), e.g. HOST_{FS,GS}_{BASE,SELECTOR}. While technically true, the comment is misleading, e.g. it can lead the reader to think that KVM never writes those fields to vmcs02. Remove the comment altogether as the handling of FS and GS is not specific to nested VMX, and GUEST_PML_INDEX has been written by prepare_vmcs02() since commit "4e59516a12a6 (kvm: vmx: ensure VMCS is current while enabling PML)" Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ab27f9734ac0..a135c91d44f8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11741,11 +11741,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); - /* - * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR, - * HOST_FS_BASE, HOST_GS_BASE. - */ - if (vmx->nested.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); -- cgit v1.2.1 From e920de8507c6c8760c775cd718627e7cbf57c3b7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:46 -0700 Subject: KVM: vmx: compute need to reload FS/GS/LDT on demand Remove fs_reload_needed and gs_ldt_reload_needed from host_state and instead compute whether we need to reload various state at the time we actually do the reload. The state that is tracked by the *_reload_needed variables is not any more volatile than the trackers themselves. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a135c91d44f8..c8a583ff7bf2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -824,8 +824,6 @@ struct vcpu_vmx { #ifdef CONFIG_X86_64 u16 ds_sel, es_sel; #endif - int gs_ldt_reload_needed; - int fs_reload_needed; } host_state; struct { int vm86_active; @@ -2681,7 +2679,6 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * allow segment selectors with cpl > 0 or ti == 1. */ vmx->host_state.ldt_sel = kvm_read_ldt(); - vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; #ifdef CONFIG_X86_64 savesegment(ds, vmx->host_state.ds_sel); @@ -2711,20 +2708,15 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) #endif vmx->host_state.fs_sel = fs_sel; - if (!(fs_sel & 7)) { + if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); - vmx->host_state.fs_reload_needed = 0; - } else { + else vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.fs_reload_needed = 1; - } vmx->host_state.gs_sel = gs_sel; if (!(gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else { + else vmcs_write16(HOST_GS_SELECTOR, 0); - vmx->host_state.gs_ldt_reload_needed = 1; - } vmcs_writel(HOST_FS_BASE, fs_base); vmcs_writel(HOST_GS_BASE, gs_base); @@ -2749,7 +2741,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.gs_ldt_reload_needed) { + if (vmx->host_state.ldt_sel || (vmx->host_state.gs_sel & 7)) { kvm_load_ldt(vmx->host_state.ldt_sel); #ifdef CONFIG_X86_64 load_gs_index(vmx->host_state.gs_sel); @@ -2757,7 +2749,7 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) loadsegment(gs, vmx->host_state.gs_sel); #endif } - if (vmx->host_state.fs_reload_needed) + if (vmx->host_state.fs_sel & 7) loadsegment(fs, vmx->host_state.fs_sel); #ifdef CONFIG_X86_64 if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { -- cgit v1.2.1 From d7ee039e2bab6341cdabf42d3036063cf91b40ea Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:47 -0700 Subject: KVM: vmx: move struct host_state usage to struct loaded_vmcs Make host_state a property of a loaded_vmcs so that it can be used as a cache of the VMCS fields, e.g. to lazily VMWRITE the corresponding VMCS field. Treating host_state as a cache does not work if it's not VMCS specific as the cache would become incoherent when switching between vmcs01 and vmcs02. Move vmcs_host_cr3 and vmcs_host_cr4 into host_state. Explicitly zero out host_state when allocating a new VMCS for a loaded_vmcs. Unlike the pre-existing vmcs_host_cr{3,4} usage, the segment information is not guaranteed to be (re)initialized when running a new nested VMCS, e.g. HOST_FS_BASE is not written in vmx_set_constant_host_state(). Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 72 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c8a583ff7bf2..d0d2374a3170 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -218,6 +218,21 @@ struct vmcs { char data[0]; }; +/* + * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT + * and whose values change infrequently, but are not constant. I.e. this is + * used as a write-through cache of the corresponding VMCS fields. + */ +struct vmcs_host_state { + unsigned long cr3; /* May not match real cr3 */ + unsigned long cr4; /* May not match real cr4 */ + + u16 fs_sel, gs_sel, ldt_sel; +#ifdef CONFIG_X86_64 + u16 ds_sel, es_sel; +#endif +}; + /* * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs @@ -229,14 +244,13 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; - unsigned long vmcs_host_cr3; /* May not match real cr3 */ - unsigned long vmcs_host_cr4; /* May not match real cr4 */ /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; s64 vnmi_blocked_time; unsigned long *msr_bitmap; struct list_head loaded_vmcss_on_cpu_link; + struct vmcs_host_state host_state; }; struct shared_msr_entry { @@ -819,12 +833,6 @@ struct vcpu_vmx { struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; } msr_autoload; - struct { - u16 fs_sel, gs_sel, ldt_sel; -#ifdef CONFIG_X86_64 - u16 ds_sel, es_sel; -#endif - } host_state; struct { int vm86_active; ulong save_rflags; @@ -2662,6 +2670,7 @@ static unsigned long segment_base(u16 selector) static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs_host_state *host_state; #ifdef CONFIG_X86_64 int cpu = raw_smp_processor_id(); #endif @@ -2673,16 +2682,17 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) return; vmx->loaded_cpu_state = vmx->loaded_vmcs; + host_state = &vmx->loaded_cpu_state->host_state; /* * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - vmx->host_state.ldt_sel = kvm_read_ldt(); + host_state->ldt_sel = kvm_read_ldt(); #ifdef CONFIG_X86_64 - savesegment(ds, vmx->host_state.ds_sel); - savesegment(es, vmx->host_state.es_sel); + savesegment(ds, host_state->ds_sel); + savesegment(es, host_state->es_sel); gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { @@ -2707,12 +2717,12 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - vmx->host_state.fs_sel = fs_sel; + host_state->fs_sel = fs_sel; if (!(fs_sel & 7)) vmcs_write16(HOST_FS_SELECTOR, fs_sel); else vmcs_write16(HOST_FS_SELECTOR, 0); - vmx->host_state.gs_sel = gs_sel; + host_state->gs_sel = gs_sel; if (!(gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, gs_sel); else @@ -2729,10 +2739,13 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) { + struct vmcs_host_state *host_state; + if (!vmx->loaded_cpu_state) return; WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs); + host_state = &vmx->loaded_cpu_state->host_state; ++vmx->vcpu.stat.host_state_reload; vmx->loaded_cpu_state = NULL; @@ -2741,20 +2754,20 @@ static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx) if (is_long_mode(&vmx->vcpu)) rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); #endif - if (vmx->host_state.ldt_sel || (vmx->host_state.gs_sel & 7)) { - kvm_load_ldt(vmx->host_state.ldt_sel); + if (host_state->ldt_sel || (host_state->gs_sel & 7)) { + kvm_load_ldt(host_state->ldt_sel); #ifdef CONFIG_X86_64 - load_gs_index(vmx->host_state.gs_sel); + load_gs_index(host_state->gs_sel); #else - loadsegment(gs, vmx->host_state.gs_sel); + loadsegment(gs, host_state->gs_sel); #endif } - if (vmx->host_state.fs_sel & 7) - loadsegment(fs, vmx->host_state.fs_sel); + if (host_state->fs_sel & 7) + loadsegment(fs, host_state->fs_sel); #ifdef CONFIG_X86_64 - if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) { - loadsegment(ds, vmx->host_state.ds_sel); - loadsegment(es, vmx->host_state.es_sel); + if (unlikely(host_state->ds_sel | host_state->es_sel)) { + loadsegment(ds, host_state->ds_sel); + loadsegment(es, host_state->es_sel); } #endif invalidate_tss_limit(); @@ -4574,6 +4587,9 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) evmcs->hv_enlightenments_control.msr_bitmap = 1; } } + + memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state)); + return 0; out_vmcs: @@ -6080,12 +6096,12 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) */ cr3 = __read_cr3(); vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */ - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; /* Save the most likely value for this task's CR4 in the VMCS. */ cr4 = cr4_read_shadow(); vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ #ifdef CONFIG_X86_64 @@ -10356,15 +10372,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); - if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) { + if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { vmcs_writel(HOST_CR3, cr3); - vmx->loaded_vmcs->vmcs_host_cr3 = cr3; + vmx->loaded_vmcs->host_state.cr3 = cr3; } cr4 = cr4_read_shadow(); - if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) { + if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { vmcs_writel(HOST_CR4, cr4); - vmx->loaded_vmcs->vmcs_host_cr4 = cr4; + vmx->loaded_vmcs->host_state.cr4 = cr4; } /* When single-stepping over STI and MOV SS, we must clear the -- cgit v1.2.1 From f3bbc0dcedf50d29d2d6fdfb16c5ec3e2368837f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:48 -0700 Subject: KVM: vmx: always initialize HOST_{FS,GS}_BASE to zero during setup The HOST_{FS,GS}_BASE fields are guaranteed to be written prior to VMENTER, by way of vmx_prepare_switch_to_guest(). Initialize the fields to zero for 64-bit kernels instead of pulling the base values from their respective MSRs. In addition to eliminating two RDMSRs, vmx_prepare_switch_to_guest() can safely assume the initial value of the fields is zero in all cases. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d0d2374a3170..60bf9a95219f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6361,9 +6361,6 @@ static void ept_set_mmio_spte_mask(void) */ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) { -#ifdef CONFIG_X86_64 - unsigned long a; -#endif int i; if (enable_shadow_vmcs) { @@ -6418,15 +6415,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmx_set_constant_host_state(vmx); -#ifdef CONFIG_X86_64 - rdmsrl(MSR_FS_BASE, a); - vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */ - rdmsrl(MSR_GS_BASE, a); - vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */ -#else vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */ vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */ -#endif if (cpu_has_vmx_vmfunc()) vmcs_write64(VM_FUNCTION_CONTROL, 0); -- cgit v1.2.1 From 8f21a0bbf36f58334695ae6e46c0cf906a217154 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:49 -0700 Subject: KVM: vmx: skip VMWRITE of HOST_{FS,GS}_SEL when possible On a 64-bit host, FS.sel and GS.sel are all but guaranteed to be 0, which in turn means they'll rarely change. Skip the VMWRITE for the associated VMCS fields when loading host state if the selector hasn't changed since the last VMWRITE. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 60bf9a95219f..4937e11f971a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2717,16 +2717,20 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = segment_base(gs_sel); #endif - host_state->fs_sel = fs_sel; - if (!(fs_sel & 7)) - vmcs_write16(HOST_FS_SELECTOR, fs_sel); - else - vmcs_write16(HOST_FS_SELECTOR, 0); - host_state->gs_sel = gs_sel; - if (!(gs_sel & 7)) - vmcs_write16(HOST_GS_SELECTOR, gs_sel); - else - vmcs_write16(HOST_GS_SELECTOR, 0); + if (unlikely(fs_sel != host_state->fs_sel)) { + if (!(fs_sel & 7)) + vmcs_write16(HOST_FS_SELECTOR, fs_sel); + else + vmcs_write16(HOST_FS_SELECTOR, 0); + host_state->fs_sel = fs_sel; + } + if (unlikely(gs_sel != host_state->gs_sel)) { + if (!(gs_sel & 7)) + vmcs_write16(HOST_GS_SELECTOR, gs_sel); + else + vmcs_write16(HOST_GS_SELECTOR, 0); + host_state->gs_sel = gs_sel; + } vmcs_writel(HOST_FS_BASE, fs_base); vmcs_writel(HOST_GS_BASE, gs_base); -- cgit v1.2.1 From 5e079c7ece1060491ef4a45414823162cce91b3d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 23 Jul 2018 12:32:50 -0700 Subject: KVM: vmx: skip VMWRITE of HOST_{FS,GS}_BASE when possible The host's FS.base and GS.base rarely change, e.g. ~0.1% of host/guest swaps on my system. Cache the last value written to the VMCS and skip the VMWRITE to the associated VMCS fields when loading host state if the value hasn't changed since the last VMWRITE. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4937e11f971a..b23ecc04ca51 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -226,6 +226,8 @@ struct vmcs { struct vmcs_host_state { unsigned long cr3; /* May not match real cr3 */ unsigned long cr4; /* May not match real cr4 */ + unsigned long gs_base; + unsigned long fs_base; u16 fs_sel, gs_sel, ldt_sel; #ifdef CONFIG_X86_64 @@ -2731,9 +2733,14 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) vmcs_write16(HOST_GS_SELECTOR, 0); host_state->gs_sel = gs_sel; } - - vmcs_writel(HOST_FS_BASE, fs_base); - vmcs_writel(HOST_GS_BASE, gs_base); + if (unlikely(fs_base != host_state->fs_base)) { + vmcs_writel(HOST_FS_BASE, fs_base); + host_state->fs_base = fs_base; + } + if (unlikely(gs_base != host_state->gs_base)) { + vmcs_writel(HOST_GS_BASE, gs_base); + host_state->gs_base = gs_base; + } for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, -- cgit v1.2.1 From c2a4eadf7747a1359a80ede64e4ae0e0ba64ca08 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Tue, 24 Jul 2018 08:17:07 +0000 Subject: KVM/MMU: Combine flushing remote tlb in mmu_set_spte() mmu_set_spte() flushes remote tlbs for drop_parent_pte/drop_spte() and set_spte() separately. This may introduce redundant flush. This patch is to combine these flushes and check flush request after calling set_spte(). Signed-off-by: Lan Tianyu Reviewed-by: Junaid Shahid Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 22a7984c68a5..8f216321d33b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2901,6 +2901,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int rmap_count; int set_spte_ret; int ret = RET_PF_RETRY; + bool flush = false; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2917,12 +2918,12 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, child = page_header(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); + flush = true; } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %llx new %llx\n", spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep); - kvm_flush_remote_tlbs(vcpu->kvm); + flush = true; } else was_rmapped = 1; } @@ -2934,7 +2935,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, ret = RET_PF_EMULATE; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } - if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH) + if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush) kvm_flush_remote_tlbs(vcpu->kvm); if (unlikely(is_mmio_spte(*sptep))) -- cgit v1.2.1 From ee6268ba3a6861b2806e569bff7fe91fbdf846dd Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Wed, 25 Jul 2018 16:32:14 +0800 Subject: KVM: x86: Skip pae_root shadow allocation if tdp enabled Considering the fact that the pae_root shadow is not needed when tdp is in use, skip the pae_root shadow page allocation to allow mmu creation even not being able to obtain memory from DMA32 zone when particular cgroup cpuset.mems or mempolicy control is applied. Signed-off-by: Liang Chen Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8f216321d33b..f5aef52b148b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5341,6 +5341,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; + if (tdp_enabled) + return 0; + /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first -- cgit v1.2.1 From 74fec5b9dbaa5e6fe776f6c73e6c00fb23dca844 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Mon, 23 Jul 2018 12:31:21 +0000 Subject: KVM/x86: Move X86_CR4_OSXSAVE check into kvm_valid_sregs() X86_CR4_OSXSAVE check belongs to sregs check and so move into kvm_valid_sregs(). Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6cc29dd21519..6b974802cadb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8049,6 +8049,10 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8079,10 +8083,6 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) struct desc_ptr dt; int ret = -EINVAL; - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - goto out; - if (kvm_valid_sregs(vcpu, sregs)) goto out; -- cgit v1.2.1 From 4180bf1b655a791a0a6ef93a2ffffc762722c782 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 23 Jul 2018 14:39:54 +0800 Subject: KVM: X86: Implement "send IPI" hypercall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using hypercall to send IPIs by one vmexit instead of one by one for xAPIC/x2APIC physical mode and one vmexit per-cluster for x2APIC cluster mode. Intel guest can enter x2apic cluster mode when interrupt remmaping is enabled in qemu, however, latest AMD EPYC still just supports xapic mode which can get great improvement by Exit-less IPIs. This patchset lets a guest send multicast IPIs, with at most 128 destinations per hypercall in 64-bit mode and 64 vCPUs per hypercall in 32-bit mode. Hardware: Xeon Skylake 2.5GHz, 2 sockets, 40 cores, 80 threads, the VM is 80 vCPUs, IPI microbenchmark(https://lkml.org/lkml/2017/12/19/141): x2apic cluster mode, vanilla Dry-run: 0, 2392199 ns Self-IPI: 6907514, 15027589 ns Normal IPI: 223910476, 251301666 ns Broadcast IPI: 0, 9282161150 ns Broadcast lock: 0, 8812934104 ns x2apic cluster mode, pv-ipi Dry-run: 0, 2449341 ns Self-IPI: 6720360, 15028732 ns Normal IPI: 228643307, 255708477 ns Broadcast IPI: 0, 7572293590 ns => 22% performance boost Broadcast lock: 0, 8316124651 ns x2apic physical mode, vanilla Dry-run: 0, 3135933 ns Self-IPI: 8572670, 17901757 ns Normal IPI: 226444334, 255421709 ns Broadcast IPI: 0, 19845070887 ns Broadcast lock: 0, 19827383656 ns x2apic physical mode, pv-ipi Dry-run: 0, 2446381 ns Self-IPI: 6788217, 15021056 ns Normal IPI: 219454441, 249583458 ns Broadcast IPI: 0, 7806540019 ns => 154% performance boost Broadcast lock: 0, 9143618799 ns Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Vitaly Kuznetsov Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/lapic.c | 40 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 3 +++ 4 files changed, 49 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 150937e64f63..c18958ef17d2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1457,6 +1457,10 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu); +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit); + void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7e042e3d47fd..7bcfa61375c0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -621,7 +621,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | (1 << KVM_FEATURE_PV_UNHALT) | (1 << KVM_FEATURE_PV_TLB_FLUSH) | - (1 << KVM_FEATURE_ASYNC_PF_VMEXIT); + (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | + (1 << KVM_FEATURE_PV_SEND_IPI); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b5cd8465d44f..f0d693122c24 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -547,6 +547,46 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, irq->level, irq->trig_mode, dest_map); } +int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, + unsigned long ipi_bitmap_high, int min, + unsigned long icr, int op_64_bit) +{ + int i; + struct kvm_apic_map *map; + struct kvm_vcpu *vcpu; + struct kvm_lapic_irq irq = {0}; + int cluster_size = op_64_bit ? 64 : 32; + int count = 0; + + irq.vector = icr & APIC_VECTOR_MASK; + irq.delivery_mode = icr & APIC_MODE_MASK; + irq.level = (icr & APIC_INT_ASSERT) != 0; + irq.trig_mode = icr & APIC_INT_LEVELTRIG; + + if (icr & APIC_DEST_MASK) + return -KVM_EINVAL; + if (icr & APIC_SHORT_MASK) + return -KVM_EINVAL; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + /* Bits above cluster_size are masked in the caller. */ + for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } + + min += cluster_size; + for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, &irq, NULL); + } + + rcu_read_unlock(); + return count; +} + static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6b974802cadb..3c83711c0ebe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6802,6 +6802,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_CLOCK_PAIRING: ret = kvm_pv_clock_pairing(vcpu, a0, a1); break; + case KVM_HC_SEND_IPI: + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); + break; #endif default: ret = -KVM_ENOSYS; -- cgit v1.2.1 From d63bae079b6426afc998c5ea76d9cde8d8c98303 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 23 Jul 2018 14:39:51 +0800 Subject: KVM: X86: Add kvm hypervisor init time platform setup callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add kvm hypervisor init time platform setup callback which will be used to replace native apic hooks by pararvirtual hooks. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Vitaly Kuznetsov Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 575c9a51e6ba..39d79720380f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -624,12 +624,22 @@ static uint32_t __init kvm_detect(void) return kvm_cpuid_base(); } +static void __init kvm_apic_init(void) +{ +} + +static void __init kvm_init_platform(void) +{ + x86_platform.apic_post_init = kvm_apic_init; +} + const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, + .init.init_platform = kvm_init_platform, }; static __init int activate_jump_labels(void) -- cgit v1.2.1 From aaffcfd1e82d3378538408d0310b7424b98d8f81 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 23 Jul 2018 14:39:52 +0800 Subject: KVM: X86: Implement PV IPIs in linux guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement paravirtual apic hooks to enable PV IPIs for KVM if the "send IPI" hypercall is available. The hypercall lets a guest send IPIs, with at most 128 destinations per hypercall in 64-bit mode and 64 vCPUs per hypercall in 32-bit mode. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Vitaly Kuznetsov Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm_para.h | 1 + arch/x86/kernel/kvm.c | 96 ++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 0ede697c3961..19980ec1a316 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -28,6 +28,7 @@ #define KVM_FEATURE_PV_UNHALT 7 #define KVM_FEATURE_PV_TLB_FLUSH 9 #define KVM_FEATURE_ASYNC_PF_VMEXIT 10 +#define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 39d79720380f..62cbd089f709 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -454,6 +454,98 @@ static void __init sev_map_percpu_data(void) } #ifdef CONFIG_SMP +#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) + +static void __send_ipi_mask(const struct cpumask *mask, int vector) +{ + unsigned long flags; + int cpu, apic_id, icr; + int min = 0, max = 0; +#ifdef CONFIG_X86_64 + __uint128_t ipi_bitmap = 0; +#else + u64 ipi_bitmap = 0; +#endif + + if (cpumask_empty(mask)) + return; + + local_irq_save(flags); + + switch (vector) { + default: + icr = APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr = APIC_DM_NMI; + break; + } + + for_each_cpu(cpu, mask) { + apic_id = per_cpu(x86_cpu_to_apicid, cpu); + if (!ipi_bitmap) { + min = max = apic_id; + } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { + ipi_bitmap <<= min - apic_id; + min = apic_id; + } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + max = apic_id < max ? max : apic_id; + } else { + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + min = max = apic_id; + ipi_bitmap = 0; + } + __set_bit(apic_id - min, (unsigned long *)&ipi_bitmap); + } + + if (ipi_bitmap) { + kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, + (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); + } + + local_irq_restore(flags); +} + +static void kvm_send_ipi_mask(const struct cpumask *mask, int vector) +{ + __send_ipi_mask(mask, vector); +} + +static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) +{ + unsigned int this_cpu = smp_processor_id(); + struct cpumask new_mask; + const struct cpumask *local_mask; + + cpumask_copy(&new_mask, mask); + cpumask_clear_cpu(this_cpu, &new_mask); + local_mask = &new_mask; + __send_ipi_mask(local_mask, vector); +} + +static void kvm_send_ipi_allbutself(int vector) +{ + kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); +} + +static void kvm_send_ipi_all(int vector) +{ + __send_ipi_mask(cpu_online_mask, vector); +} + +/* + * Set the IPI entry points + */ +static void kvm_setup_pv_ipi(void) +{ + apic->send_IPI_mask = kvm_send_ipi_mask; + apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; + apic->send_IPI_allbutself = kvm_send_ipi_allbutself; + apic->send_IPI_all = kvm_send_ipi_all; + pr_info("KVM setup pv IPIs\n"); +} + static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) { native_smp_prepare_cpus(max_cpus); @@ -626,6 +718,10 @@ static uint32_t __init kvm_detect(void) static void __init kvm_apic_init(void) { +#if defined(CONFIG_SMP) + if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI)) + kvm_setup_pv_ipi(); +#endif } static void __init kvm_init_platform(void) -- cgit v1.2.1 From fd8ca6dac9b45db8503cf508880edd63e039e2f2 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 6 Aug 2018 16:42:49 +0200 Subject: KVM/x86: Use CC_SET()/CC_OUT in arch/x86/kvm/vmx.c Remove open-coded uses of set instructions to use CC_SET()/CC_OUT() in arch/x86/kvm/vmx.c. Signed-off-by: Uros Bizjak [Mark error paths as unlikely while touching this. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b23ecc04ca51..16f9373c01de 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -38,6 +38,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include #include #include #include @@ -1916,11 +1917,12 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) u64 rsvd : 48; u64 gva; } operand = { vpid, 0, gva }; + bool error; - asm volatile (__ex(ASM_VMX_INVVPID) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:" - : : "a"(&operand), "c"(ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na) + : CC_OUT(na) (error) : "a"(&operand), "c"(ext) + : "memory"); + BUG_ON(error); } static inline void __invept(int ext, u64 eptp, gpa_t gpa) @@ -1928,11 +1930,12 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) struct { u64 eptp, gpa; } operand = {eptp, gpa}; + bool error; - asm volatile (__ex(ASM_VMX_INVEPT) - /* CF==1 or ZF==1 --> rc = -1 */ - "; ja 1f ; ud2 ; 1:\n" - : : "a" (&operand), "c" (ext) : "cc", "memory"); + asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na) + : CC_OUT(na) (error) : "a" (&operand), "c" (ext) + : "memory"); + BUG_ON(error); } static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) @@ -1948,12 +1951,12 @@ static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) static void vmcs_clear(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", vmcs, phys_addr); } @@ -1970,15 +1973,15 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) static void vmcs_load(struct vmcs *vmcs) { u64 phys_addr = __pa(vmcs); - u8 error; + bool error; if (static_branch_unlikely(&enable_evmcs)) return evmcs_load(phys_addr); - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc", "memory"); - if (error) + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na) + : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr) + : "memory"); + if (unlikely(error)) printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n", vmcs, phys_addr); } @@ -2203,10 +2206,10 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value) static __always_inline void __vmcs_writel(unsigned long field, unsigned long value) { - u8 error; + bool error; - asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" - : "=q"(error) : "a"(value), "d"(field) : "cc"); + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na) + : CC_OUT(na) (error) : "a"(value), "d"(field)); if (unlikely(error)) vmwrite_error(field, value); } -- cgit v1.2.1 From 28a1f3ac1d0c8558ee4453d9634dad891a6e922e Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Tue, 14 Aug 2018 10:15:34 -0700 Subject: kvm: x86: Set highest physical address bits in non-present/reserved SPTEs Always set the 5 upper-most supported physical address bits to 1 for SPTEs that are marked as non-present or reserved, to make them unusable for L1TF attacks from the guest. Currently, this just applies to MMIO SPTEs. (We do not need to mark PTEs that are completely 0 as physical page 0 is already reserved.) This allows mitigation of L1TF without disabling hyper-threading by using shadow paging mode instead of EPT. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 43 ++++++++++++++++++++++++++++++++++++++----- arch/x86/kvm/x86.c | 8 ++++++-- 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f5aef52b148b..27c2ab079a76 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -238,6 +238,17 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | PT64_EPT_EXECUTABLE_MASK; static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; +/* + * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order + * to guard against L1TF attacks. + */ +static u64 __read_mostly shadow_nonpresent_or_rsvd_mask; + +/* + * The number of high-order 1 bits to use in the mask above. + */ +static const u64 shadow_nonpresent_or_rsvd_mask_len = 5; + static void mmu_spte_set(u64 *sptep, u64 spte); static union kvm_mmu_page_role kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); @@ -327,9 +338,13 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, { unsigned int gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); + u64 gpa = gfn << PAGE_SHIFT; access &= ACC_WRITE_MASK | ACC_USER_MASK; - mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT; + mask |= shadow_mmio_value | access; + mask |= gpa | shadow_nonpresent_or_rsvd_mask; + mask |= (gpa & shadow_nonpresent_or_rsvd_mask) + << shadow_nonpresent_or_rsvd_mask_len; trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); @@ -342,8 +357,14 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; - return (spte & ~mask) >> PAGE_SHIFT; + u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask | + shadow_nonpresent_or_rsvd_mask; + u64 gpa = spte & ~mask; + + gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len) + & shadow_nonpresent_or_rsvd_mask; + + return gpa >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) @@ -400,7 +421,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static void kvm_mmu_clear_all_pte_masks(void) +static void kvm_mmu_reset_all_pte_masks(void) { shadow_user_mask = 0; shadow_accessed_mask = 0; @@ -410,6 +431,18 @@ static void kvm_mmu_clear_all_pte_masks(void) shadow_mmio_mask = 0; shadow_present_mask = 0; shadow_acc_track_mask = 0; + + /* + * If the CPU has 46 or less physical address bits, then set an + * appropriate mask to guard against L1TF attacks. Otherwise, it is + * assumed that the CPU is not vulnerable to L1TF. + */ + if (boot_cpu_data.x86_phys_bits < + 52 - shadow_nonpresent_or_rsvd_mask_len) + shadow_nonpresent_or_rsvd_mask = + rsvd_bits(boot_cpu_data.x86_phys_bits - + shadow_nonpresent_or_rsvd_mask_len, + boot_cpu_data.x86_phys_bits - 1); } static int is_cpuid_PSE36(void) @@ -5819,7 +5852,7 @@ int kvm_mmu_module_init(void) { int ret = -ENOMEM; - kvm_mmu_clear_all_pte_masks(); + kvm_mmu_reset_all_pte_masks(); pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3c83711c0ebe..d294983ee1c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6536,8 +6536,12 @@ static void kvm_set_mmio_spte_mask(void) * Set the reserved bits and the present bit of an paging-structure * entry to generate page fault with PFER.RSV = 1. */ - /* Mask the reserved physical address bits. */ - mask = rsvd_bits(maxphyaddr, 51); + + /* + * Mask the uppermost physical address bit, which would be reserved as + * long as the supported physical address width is less than 52. + */ + mask = 1ull << 51; /* Set the present bit. */ mask |= 1ull; -- cgit v1.2.1