diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/cpufeatures.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_emulate.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 25 | ||||
-rw-r--r-- | arch/x86/include/asm/kvmclock.h | 6 | ||||
-rw-r--r-- | arch/x86/include/asm/vmx.h | 28 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/kvm_para.h | 9 | ||||
-rw-r--r-- | arch/x86/kernel/fpu/xstate.c | 1 | ||||
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 8 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 20 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 16 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 19 | ||||
-rw-r--r-- | arch/x86/kvm/irq_comm.c | 29 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 135 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 12 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 509 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 81 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 161 |
20 files changed, 743 insertions, 330 deletions
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index eafee3161d1c..d9d7136edf05 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -288,6 +288,7 @@ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ @@ -320,5 +321,4 @@ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ - #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e9cd7befcb76..3e8c287090e4 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt); void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt); +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a7066dc1a7e9..417502cf42b6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -115,7 +115,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64 -#define KVM_MMU_HASH_SHIFT 10 +#define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 @@ -208,6 +208,13 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting + * with the SVE bit in EPT PTEs. + */ +#define SPTE_SPECIAL_MASK (1ULL << 62) + /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -668,6 +675,9 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; + + /* GPA available (AMD only) */ + bool gpa_available; }; struct kvm_lpage_info { @@ -716,6 +726,12 @@ struct kvm_hv { HV_REFERENCE_TSC_PAGE tsc_ref; }; +enum kvm_irqchip_mode { + KVM_IRQCHIP_NONE, + KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ + KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -788,7 +804,7 @@ struct kvm_arch { u64 disabled_quirks; - bool irqchip_split; + enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; @@ -815,6 +831,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong max_mmu_page_hash_collisions; }; struct kvm_vcpu_stat { @@ -844,6 +861,7 @@ struct kvm_vcpu_stat { u64 hypercalls; u64 irq_injections; u64 nmi_injections; + u64 req_event; }; struct x86_instruction_info; @@ -1050,7 +1068,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h new file mode 100644 index 000000000000..f260bef63591 --- /dev/null +++ b/arch/x86/include/asm/kvmclock.h @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_KVM_CLOCK_H +#define _ASM_X86_KVM_CLOCK_H + +extern struct clocksource kvm_clock; + +#endif /* _ASM_X86_KVM_CLOCK_H */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2b5b2d4b924e..cc54b7026567 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -467,8 +467,16 @@ enum vmcs_field { #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) -#define VMX_EPT_ACCESS_BIT (1ull << 8) -#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) +#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \ + VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) +#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT) + +/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ +#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ + VMX_EPT_EXECUTABLE_MASK) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul @@ -500,6 +508,22 @@ struct vmx_msr_entry { #define ENTRY_FAIL_VMCS_LINK_PTR 4 /* + * Exit Qualifications for EPT Violations + */ +#define EPT_VIOLATION_ACC_READ_BIT 0 +#define EPT_VIOLATION_ACC_WRITE_BIT 1 +#define EPT_VIOLATION_ACC_INSTR_BIT 2 +#define EPT_VIOLATION_READABLE_BIT 3 +#define EPT_VIOLATION_WRITABLE_BIT 4 +#define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) +#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) +#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) +#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) +#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) +#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) + +/* * VM-instruction error numbers */ enum vm_instruction_error_number { diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 1421a6585126..cff0bb6556f8 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -50,6 +50,15 @@ struct kvm_steal_time { __u32 pad[11]; }; +#define KVM_CLOCK_PAIRING_WALLCLOCK 0 +struct kvm_clock_pairing { + __s64 sec; + __s64 nsec; + __u64 tsc; + __u32 flags; + __u32 pad[9]; +}; + #define KVM_STEAL_ALIGNMENT_BITS 5 #define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) #define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1d7770447b3e..35f7024aace5 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -78,6 +78,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_PKU); setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); + setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..995fa260a6da 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -28,6 +28,7 @@ #include <asm/x86_init.h> #include <asm/reboot.h> +#include <asm/kvmclock.h> static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; @@ -49,6 +50,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) { return hv_clock; } +EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va); /* * The wallclock is the time of day when we booted. Since then, some time may @@ -174,13 +176,14 @@ bool kvm_check_and_clear_guest_paused(void) return ret; } -static struct clocksource kvm_clock = { +struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +EXPORT_SYMBOL_GPL(kvm_clock); int kvm_register_clock(char *txt) { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e85f6bd7b9d5..c0e2036217ad 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -383,7 +383,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/; + F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = @@ -861,12 +861,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) if (!best) best = check_cpuid_limit(vcpu, function, index); - /* - * Perfmon not yet supported for L2 guest. - */ - if (is_guest_mode(vcpu) && function == 0xa) - best = NULL; - if (best) { *eax = best->eax; *ebx = best->ebx; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cedbba0f3402..45c7306c8780 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -173,6 +173,7 @@ #define NearBranch ((u64)1 << 52) /* Near branches */ #define No16 ((u64)1 << 53) /* No 16 bit operand */ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ +#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -4298,7 +4299,7 @@ static const struct opcode group1[] = { }; static const struct opcode group1A[] = { - I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N, + I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N, }; static const struct opcode group2[] = { @@ -4336,7 +4337,7 @@ static const struct opcode group5[] = { I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), - I(SrcMem | Stack, em_push), D(Undefined), + I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), }; static const struct opcode group6[] = { @@ -4556,8 +4557,8 @@ static const struct opcode opcode_table[256] = { /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), - I2bv(SrcSI | DstDI | Mov | String, em_mov), - F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r), + I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov), + F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r), /* 0xA8 - 0xAF */ F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), @@ -5671,3 +5672,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt) { writeback_registers(ctxt); } + +bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->rep_prefix && (ctxt->d & String)) + return false; + + if (ctxt->d & TwoMemOp) + return false; + + return true; +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1572c35b4f1a..08b27e0c7b71 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return -ENOENT; memset(&irq, 0, sizeof(irq)); - irq.dest_id = kvm_apic_id(vcpu->arch.apic); + irq.shorthand = APIC_DEST_SELF; irq.dest_mode = APIC_DEST_PHYSICAL; irq.delivery_mode = APIC_DM_FIXED; irq.vector = vector; irq.level = 1; - ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL); + ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL); trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret); return ret; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 7cc2360f1848..73ea24d4f119 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = { .write = picdev_eclr_write, }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm) +int kvm_pic_init(struct kvm *kvm) { struct kvm_pic *s; int ret; s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) - return NULL; + return -ENOMEM; spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; @@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); - return s; + kvm->arch.vpic = s; + + return 0; fail_unreg_1: kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); @@ -648,13 +650,17 @@ fail_unlock: kfree(s); - return NULL; + return ret; } -void kvm_destroy_pic(struct kvm_pic *vpic) +void kvm_pic_destroy(struct kvm *kvm) { + struct kvm_pic *vpic = kvm->arch.vpic; + kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + + kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 035731eb3897..40d5b2cf6061 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -73,8 +73,8 @@ struct kvm_pic { unsigned long irq_states[PIC_NUM_PINS]; }; -struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_destroy_pic(struct kvm_pic *vpic); +int kvm_pic_init(struct kvm *kvm); +void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); @@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm) static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_split; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; } -static inline int irqchip_in_kernel(struct kvm *kvm) +static inline int irqchip_kernel(struct kvm *kvm) { - struct kvm_pic *vpic = pic_irqchip(kvm); - bool ret; + return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; +} - ret = (vpic != NULL); - ret |= irqchip_split(kvm); +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; - /* Read vpic before kvm->irq_routing. */ + /* Matches with wmb after initializing kvm->irq_routing. */ smp_rmb(); return ret; } diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6c0191615f23..b96d3893f121 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -41,15 +41,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_pic *pic = pic_irqchip(kvm); - - /* - * XXX: rejecting pic routes when pic isn't in use would be better, - * but the default routing table is installed while kvm->arch.vpic is - * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE. - */ - if (!pic) - return -1; - return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -58,10 +49,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, bool line_status) { struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) - return -1; - return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level, line_status); } @@ -297,16 +284,20 @@ int kvm_set_routing_entry(struct kvm *kvm, case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; switch (ue->u.irqchip.irqchip) { - case KVM_IRQCHIP_PIC_MASTER: - e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; - break; case KVM_IRQCHIP_PIC_SLAVE: + delta = 8; + /* fall through */ + case KVM_IRQCHIP_PIC_MASTER: + if (!pic_in_kernel(kvm)) + goto out; + e->set = kvm_set_pic_irq; max_pin = PIC_NUM_PINS; - delta = 8; break; case KVM_IRQCHIP_IOAPIC: + if (!ioapic_in_kernel(kvm)) + goto out; + max_pin = KVM_IOAPIC_NUM_PINS; e->set = kvm_set_ioapic_irq; break; @@ -409,7 +400,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) void kvm_arch_post_irq_routing_update(struct kvm *kvm) { - if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + if (!irqchip_split(kvm)) return; kvm_make_scan_ioapic_request(kvm); } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2f6ef5121a4c..33b799fd3a6e 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + +static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) +{ + return apic->vcpu->vcpu_id; +} + static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { switch (map->mode) { @@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - u32 max_id = 255; + u32 max_id = 255; /* enough space for any xAPIC ID */ mutex_lock(&kvm->arch.apic_map_lock); kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) - max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic)); new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); @@ -179,16 +189,28 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_lapic *apic = vcpu->arch.apic; struct kvm_lapic **cluster; u16 mask; - u32 ldr, aid; + u32 ldr; + u8 xapic_id; + u32 x2apic_id; if (!kvm_apic_present(vcpu)) continue; - aid = kvm_apic_id(apic); - ldr = kvm_lapic_get_reg(apic, APIC_LDR); + xapic_id = kvm_xapic_id(apic); + x2apic_id = kvm_x2apic_id(apic); - if (aid <= new->max_apic_id) - new->phys_map[aid] = apic; + /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */ + if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) && + x2apic_id <= new->max_apic_id) + new->phys_map[x2apic_id] = apic; + /* + * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around, + * prevent them from masking VCPUs with APIC ID <= 0xff. + */ + if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id]) + new->phys_map[xapic_id] = apic; + + ldr = kvm_lapic_get_reg(apic, APIC_LDR); if (apic_x2apic_mode(apic)) { new->mode |= KVM_APIC_MODE_X2APIC; @@ -250,6 +272,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + WARN_ON_ONCE(id != apic->vcpu->vcpu_id); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); @@ -546,7 +570,15 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -static void apic_update_ppr(struct kvm_lapic *apic) +static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) +{ + int highest_irr = apic_find_highest_irr(apic); + if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) + return -1; + return highest_irr; +} + +static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) { u32 tpr, isrv, ppr, old_ppr; int isr; @@ -564,12 +596,27 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - if (old_ppr != ppr) { + *new_ppr = ppr; + if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); - if (ppr < old_ppr) - kvm_make_request(KVM_REQ_EVENT, apic->vcpu); - } + + return ppr < old_ppr; +} + +static void apic_update_ppr(struct kvm_lapic *apic) +{ + u32 ppr; + + if (__apic_update_ppr(apic, &ppr) && + apic_has_interrupt_for_ppr(apic, ppr) != -1) + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); +} + +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu) +{ + apic_update_ppr(vcpu->arch.apic); } +EXPORT_SYMBOL_GPL(kvm_apic_update_ppr); static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) { @@ -579,10 +626,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda) { - if (apic_x2apic_mode(apic)) - return mda == X2APIC_BROADCAST; - - return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST; + return mda == (apic_x2apic_mode(apic) ? + X2APIC_BROADCAST : APIC_BROADCAST); } static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) @@ -591,9 +636,18 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda) return true; if (apic_x2apic_mode(apic)) - return mda == kvm_apic_id(apic); + return mda == kvm_x2apic_id(apic); + + /* + * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if + * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and + * this allows unique addressing of VCPUs with APIC ID over 0xff. + * The 0xff condition is needed because writeable xAPIC ID. + */ + if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic)) + return true; - return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic)); + return mda == kvm_xapic_id(apic); } static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) @@ -610,7 +664,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) && (logical_id & mda & 0xffff) != 0; logical_id = GET_APIC_LOGICAL_ID(logical_id); - mda = GET_APIC_DEST_FIELD(mda); switch (kvm_lapic_get_reg(apic, APIC_DFR)) { case APIC_DFR_FLAT: @@ -627,9 +680,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) /* The KVM local APIC implementation has two quirks: * - * - the xAPIC MDA stores the destination at bits 24-31, while this - * is not true of struct kvm_lapic_irq's dest_id field. This is - * just a quirk in the API and is not problematic. + * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs + * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID. + * KVM doesn't do that aliasing. * * - in-kernel IOAPIC messages have to be delivered directly to * x2APIC, because the kernel does not support interrupt remapping. @@ -645,13 +698,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; - bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && - !ipi && dest_id == APIC_BROADCAST && x2apic_mda) + !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target)) return X2APIC_BROADCAST; - return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); + return dest_id; } bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, @@ -1907,9 +1959,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug("%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=0x%x, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, - vcpu, kvm_apic_id(apic), + vcpu, kvm_lapic_get_reg(apic, APIC_ID), vcpu->arch.apic_base, apic->base_address); } @@ -2021,17 +2073,13 @@ nomem: int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - int highest_irr; + u32 ppr; if (!apic_enabled(apic)) return -1; - apic_update_ppr(apic); - highest_irr = apic_find_highest_irr(apic); - if ((highest_irr == -1) || - ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI))) - return -1; - return highest_irr; + __apic_update_ppr(apic, &ppr); + return apic_has_interrupt_for_ppr(apic, ppr); } int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) @@ -2067,6 +2115,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) { int vector = kvm_apic_has_interrupt(vcpu); struct kvm_lapic *apic = vcpu->arch.apic; + u32 ppr; if (vector == -1) return -1; @@ -2078,13 +2127,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) * because the process would deliver it through the IDT. */ - apic_set_isr(vector, apic); - apic_update_ppr(apic); apic_clear_irr(vector, apic); - if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) { - apic_clear_isr(vector, apic); + /* + * For auto-EOI interrupts, there might be another pending + * interrupt above PPR, so check whether to raise another + * KVM_REQ_EVENT. + */ apic_update_ppr(apic); + } else { + /* + * For normal interrupts, PPR has been raised and there cannot + * be a higher-priority pending interrupt---except if there was + * a concurrent interrupt injection, but that would have + * triggered KVM_REQ_EVENT already. + */ + apic_set_isr(vector, apic); + __apic_update_ppr(apic, &ppr); } return vector; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ff8039d61672..05abd837b78a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -73,6 +73,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); @@ -203,17 +204,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline u32 kvm_apic_id(struct kvm_lapic *apic) -{ - /* To avoid a race between apic_base and following APIC_ID update when - * switching to x2apic_mode, the x2apic mode returns initial x2apic id. - */ - if (apic_x2apic_mode(apic)) - return apic->vcpu->vcpu_id; - - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7012de4a1fed..2fd7586aad4d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -37,6 +37,8 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/hash.h> +#include <linux/kern_levels.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -129,6 +131,10 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +/* The mask for the R/X bits in EPT PTEs */ +#define PT64_EPT_READABLE_MASK 0x1ull +#define PT64_EPT_EXECUTABLE_MASK 0x4ull + #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -178,15 +184,40 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static u64 __read_mostly shadow_present_mask; +/* + * The mask/value to distinguish a PTE that has been marked not-present for + * access tracking purposes. + * The mask would be either 0 if access tracking is disabled, or + * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled. + */ +static u64 __read_mostly shadow_acc_track_mask; +static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; + +/* + * The mask/shift to use for saving the original R/X bits when marking the PTE + * as not-present for access tracking purposes. We do not save the W bit as the + * PTEs being access tracked also need to be dirty tracked, so the W bit will be + * restored only when a write is attempted to the page. + */ +static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK | + PT64_EPT_EXECUTABLE_MASK; +static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT; + static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { - shadow_mmio_mask = mmio_mask; + shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); +static inline bool is_access_track_spte(u64 spte) +{ + /* Always false if shadow_acc_track_mask is zero. */ + return (spte & shadow_acc_track_mask) == shadow_acc_track_value; +} + /* * the low bit of the generation number is always presumed to be zero. * This disables mmio caching during memslot updates. The concept is @@ -284,17 +315,35 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, + u64 acc_track_mask) { + if (acc_track_mask != 0) + acc_track_mask |= SPTE_SPECIAL_MASK; + shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; shadow_present_mask = p_mask; + shadow_acc_track_mask = acc_track_mask; + WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); +void kvm_mmu_clear_all_pte_masks(void) +{ + shadow_user_mask = 0; + shadow_accessed_mask = 0; + shadow_dirty_mask = 0; + shadow_nx_mask = 0; + shadow_x_mask = 0; + shadow_mmio_mask = 0; + shadow_present_mask = 0; + shadow_acc_track_mask = 0; +} + static int is_cpuid_PSE36(void) { return 1; @@ -307,7 +356,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); + return (pte != 0) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -324,6 +373,11 @@ static int is_last_spte(u64 pte, int level) return 0; } +static bool is_executable_pte(u64 spte) +{ + return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask; +} + static kvm_pfn_t spte_to_pfn(u64 pte) { return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -473,7 +527,7 @@ retry: } #endif -static bool spte_is_locklessly_modifiable(u64 spte) +static bool spte_can_locklessly_be_made_writable(u64 spte) { return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); @@ -481,36 +535,38 @@ static bool spte_is_locklessly_modifiable(u64 spte) static bool spte_has_volatile_bits(u64 spte) { + if (!is_shadow_present_pte(spte)) + return false; + /* * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ - if (spte_is_locklessly_modifiable(spte)) + if (spte_can_locklessly_be_made_writable(spte) || + is_access_track_spte(spte)) return true; - if (!shadow_accessed_mask) - return false; - - if (!is_shadow_present_pte(spte)) - return false; - - if ((spte & shadow_accessed_mask) && - (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) - return false; + if (shadow_accessed_mask) { + if ((spte & shadow_accessed_mask) == 0 || + (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0)) + return true; + } - return true; + return false; } -static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_accessed_spte(u64 spte) { - return (old_spte & bit_mask) && !(new_spte & bit_mask); + return shadow_accessed_mask ? spte & shadow_accessed_mask + : !is_access_track_spte(spte); } -static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) +static bool is_dirty_spte(u64 spte) { - return (old_spte & bit_mask) != (new_spte & bit_mask); + return shadow_dirty_mask ? spte & shadow_dirty_mask + : spte & PT_WRITABLE_MASK; } /* Rules for using mmu_spte_set: @@ -525,25 +581,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) __set_spte(sptep, new_spte); } -/* Rules for using mmu_spte_update: - * Update the state bits, it means the mapped pfn is not changed. - * - * Whenever we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB, the return value indicates this - * case. +/* + * Update the SPTE (excluding the PFN), but do not track changes in its + * accessed/dirty status. */ -static bool mmu_spte_update(u64 *sptep, u64 new_spte) +static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte) { u64 old_spte = *sptep; - bool ret = false; WARN_ON(!is_shadow_present_pte(new_spte)); if (!is_shadow_present_pte(old_spte)) { mmu_spte_set(sptep, new_spte); - return ret; + return old_spte; } if (!spte_has_volatile_bits(old_spte)) @@ -551,45 +601,62 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); + WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte)); + + return old_spte; +} + +/* Rules for using mmu_spte_update: + * Update the state bits, it means the mapped pfn is not changed. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. + * + * Returns true if the TLB needs to be flushed + */ +static bool mmu_spte_update(u64 *sptep, u64 new_spte) +{ + bool flush = false; + u64 old_spte = mmu_spte_update_no_track(sptep, new_spte); + + if (!is_shadow_present_pte(old_spte)) + return false; + /* * For the spte updated out of mmu-lock is safe, since * we always atomically update it, see the comments in * spte_has_volatile_bits(). */ - if (spte_is_locklessly_modifiable(old_spte) && + if (spte_can_locklessly_be_made_writable(old_spte) && !is_writable_pte(new_spte)) - ret = true; - - if (!shadow_accessed_mask) { - /* - * We don't set page dirty when dropping non-writable spte. - * So do it now if the new spte is becoming non-writable. - */ - if (ret) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); - return ret; - } + flush = true; /* - * Flush TLB when accessed/dirty bits are changed in the page tables, + * Flush TLB when accessed/dirty states are changed in the page tables, * to guarantee consistency between TLB and page tables. */ - if (spte_is_bit_changed(old_spte, new_spte, - shadow_accessed_mask | shadow_dirty_mask)) - ret = true; - if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { + flush = true; kvm_set_pfn_accessed(spte_to_pfn(old_spte)); - if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + } + + if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { + flush = true; kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + } - return ret; + return flush; } /* * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. + * Returns non-zero if the PTE was previously valid. */ static int mmu_spte_clear_track_bits(u64 *sptep) { @@ -613,11 +680,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep) */ WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); - if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) + if (is_accessed_spte(old_spte)) kvm_set_pfn_accessed(pfn); - if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask : - PT_WRITABLE_MASK)) + + if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); + return 1; } @@ -636,6 +704,78 @@ static u64 mmu_spte_get_lockless(u64 *sptep) return __get_spte_lockless(sptep); } +static u64 mark_spte_for_access_track(u64 spte) +{ + if (shadow_accessed_mask != 0) + return spte & ~shadow_accessed_mask; + + if (shadow_acc_track_mask == 0 || is_access_track_spte(spte)) + return spte; + + /* + * Making an Access Tracking PTE will result in removal of write access + * from the PTE. So, verify that we will be able to restore the write + * access in the fast page fault path later on. + */ + WARN_ONCE((spte & PT_WRITABLE_MASK) && + !spte_can_locklessly_be_made_writable(spte), + "kvm: Writable SPTE is not locklessly dirty-trackable\n"); + + WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift), + "kvm: Access Tracking saved bit locations are not zero\n"); + + spte |= (spte & shadow_acc_track_saved_bits_mask) << + shadow_acc_track_saved_bits_shift; + spte &= ~shadow_acc_track_mask; + spte |= shadow_acc_track_value; + + return spte; +} + +/* Restore an acc-track PTE back to a regular PTE */ +static u64 restore_acc_track_spte(u64 spte) +{ + u64 new_spte = spte; + u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift) + & shadow_acc_track_saved_bits_mask; + + WARN_ON_ONCE(!is_access_track_spte(spte)); + + new_spte &= ~shadow_acc_track_mask; + new_spte &= ~(shadow_acc_track_saved_bits_mask << + shadow_acc_track_saved_bits_shift); + new_spte |= saved_bits; + + return new_spte; +} + +/* Returns the Accessed status of the PTE and resets it at the same time. */ +static bool mmu_spte_age(u64 *sptep) +{ + u64 spte = mmu_spte_get_lockless(sptep); + + if (!is_accessed_spte(spte)) + return false; + + if (shadow_accessed_mask) { + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); + } else { + /* + * Capture the dirty status of the page, so that it doesn't get + * lost when the SPTE is marked for access tracking. + */ + if (is_writable_pte(spte)) + kvm_set_pfn_dirty(spte_to_pfn(spte)); + + spte = mark_spte_for_access_track(spte); + mmu_spte_update_no_track(sptep, spte); + } + + return true; +} + static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { /* @@ -1212,7 +1352,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect) u64 spte = *sptep; if (!is_writable_pte(spte) && - !(pt_protect && spte_is_locklessly_modifiable(spte))) + !(pt_protect && spte_can_locklessly_be_made_writable(spte))) return false; rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); @@ -1420,7 +1560,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + sptep, *sptep, gfn, level); need_flush = 1; @@ -1433,7 +1573,8 @@ restart: new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - new_spte &= ~shadow_accessed_mask; + + new_spte = mark_spte_for_access_track(new_spte); mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); @@ -1595,15 +1736,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, struct rmap_iterator uninitialized_var(iter); int young = 0; - BUG_ON(!shadow_accessed_mask); - - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - clear_bit((ffs(shadow_accessed_mask) - 1), - (unsigned long *)sptep); - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + young |= mmu_spte_age(sptep); trace_kvm_age_page(gfn, level, slot, young); return young; @@ -1615,24 +1749,20 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, { u64 *sptep; struct rmap_iterator iter; - int young = 0; /* - * If there's no access bit in the secondary pte set by the - * hardware it's up to gup-fast/gup to set the access bit in - * the primary pte or in the page structure. + * If there's no access bit in the secondary pte set by the hardware and + * fast access tracking is also not enabled, it's up to gup-fast/gup to + * set the access bit in the primary pte or in the page structure. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) goto out; - for_each_rmap_spte(rmap_head, &iter, sptep) { - if (*sptep & shadow_accessed_mask) { - young = 1; - break; - } - } + for_each_rmap_spte(rmap_head, &iter, sptep) + if (is_accessed_spte(*sptep)) + return 1; out: - return young; + return 0; } #define RMAP_RECYCLE_THRESHOLD 1000 @@ -1660,7 +1790,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) + if (!shadow_accessed_mask && !shadow_acc_track_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); @@ -1713,7 +1843,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp) static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); + return hash_64(gfn, KVM_MMU_HASH_SHIFT); } static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, @@ -1904,17 +2034,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, * since it has been deleted from active_mmu_pages but still can be found * at hast list. * - * for_each_gfn_valid_sp() has skipped that kind of pages. + * for_each_valid_sp() has skipped that kind of pages. */ -#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ +#define for_each_valid_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ - if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \ - || (_sp)->role.invalid) {} else + if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \ + } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_gfn_valid_sp(_kvm, _sp, _gfn) \ - if ((_sp)->role.direct) {} else + for_each_valid_sp(_kvm, _sp, _gfn) \ + if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else /* @sp->gfn should be write-protected at the call site */ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -2116,6 +2246,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; bool need_sync = false; bool flush = false; + int collisions = 0; LIST_HEAD(invalid_list); role = vcpu->arch.mmu.base_role; @@ -2130,7 +2261,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) { + for_each_valid_sp(vcpu->kvm, sp, gfn) { + if (sp->gfn != gfn) { + collisions++; + continue; + } + if (!need_sync && sp->unsync) need_sync = true; @@ -2153,7 +2289,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, __clear_sp_write_flooding_count(sp); trace_kvm_mmu_get_page(sp, false); - return sp; + goto out; } ++vcpu->kvm->stat.mmu_cache_miss; @@ -2183,6 +2319,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); +out: + if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) + vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; return sp; } @@ -2583,6 +2722,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_dirty_mask; } + if (speculative) + spte = mark_spte_for_access_track(spte); + set_pte: if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2636,7 +2778,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, pgprintk("%s: setting spte %llx\n", __func__, *sptep); pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? "2MB" : "4kB", - *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, + *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn, *sptep, sptep); if (!was_rmapped && is_large_pte(*sptep)) ++vcpu->kvm->stat.lpages; @@ -2869,33 +3011,43 @@ static bool page_fault_can_be_fast(u32 error_code) if (unlikely(error_code & PFERR_RSVD_MASK)) return false; + /* See if the page fault is due to an NX violation */ + if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)) + == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK)))) + return false; + /* - * #PF can be fast only if the shadow page table is present and it - * is caused by write-protect, that means we just need change the - * W bit of the spte which can be done out of mmu-lock. + * #PF can be fast if: + * 1. The shadow page table entry is not present, which could mean that + * the fault is potentially caused by access tracking (if enabled). + * 2. The shadow page table entry is present and the fault + * is caused by write-protect, that means we just need change the W + * bit of the spte which can be done out of mmu-lock. + * + * However, if access tracking is disabled we know that a non-present + * page must be a genuine page fault where we have to create a new SPTE. + * So, if access tracking is disabled, we return true only for write + * accesses to a present page. */ - if (!(error_code & PFERR_PRESENT_MASK) || - !(error_code & PFERR_WRITE_MASK)) - return false; - return true; + return shadow_acc_track_mask != 0 || + ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)) + == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK)); } +/* + * Returns true if the SPTE was fixed successfully. Otherwise, + * someone else modified the SPTE from its original value. + */ static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *sptep, u64 spte) + u64 *sptep, u64 old_spte, u64 new_spte) { gfn_t gfn; WARN_ON(!sp->role.direct); /* - * The gfn of direct spte is stable since it is calculated - * by sp->gfn. - */ - gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - - /* * Theoretically we could also set dirty bit (and flush TLB) here in * order to eliminate unnecessary PML logging. See comments in * set_spte. But fast_page_fault is very unlikely to happen with PML @@ -2907,12 +3059,33 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * * Compare with set_spte where instead shadow_dirty_mask is set. */ - if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) + if (cmpxchg64(sptep, old_spte, new_spte) != old_spte) + return false; + + if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) { + /* + * The gfn of direct spte is stable since it is + * calculated by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); kvm_vcpu_mark_page_dirty(vcpu, gfn); + } return true; } +static bool is_access_allowed(u32 fault_err_code, u64 spte) +{ + if (fault_err_code & PFERR_FETCH_MASK) + return is_executable_pte(spte); + + if (fault_err_code & PFERR_WRITE_MASK) + return is_writable_pte(spte); + + /* Fault was on Read access */ + return spte & PT_PRESENT_MASK; +} + /* * Return value: * - true: let the vcpu to access on the same address again. @@ -2923,8 +3096,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; - bool ret = false; + bool fault_handled = false; u64 spte = 0ull; + uint retry_count = 0; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return false; @@ -2933,66 +3107,93 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, return false; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) - if (!is_shadow_present_pte(spte) || iterator.level < level) + + do { + u64 new_spte; + + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || + iterator.level < level) + break; + + sp = page_header(__pa(iterator.sptep)); + if (!is_last_spte(spte, sp->role.level)) break; - /* - * If the mapping has been changed, let the vcpu fault on the - * same address again. - */ - if (!is_shadow_present_pte(spte)) { - ret = true; - goto exit; - } + /* + * Check whether the memory access that caused the fault would + * still cause it if it were to be performed right now. If not, + * then this is a spurious fault caused by TLB lazily flushed, + * or some other CPU has already fixed the PTE after the + * current CPU took the fault. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_access_allowed(error_code, spte)) { + fault_handled = true; + break; + } - sp = page_header(__pa(iterator.sptep)); - if (!is_last_spte(spte, sp->role.level)) - goto exit; + new_spte = spte; - /* - * Check if it is a spurious fault caused by TLB lazily flushed. - * - * Need not check the access of upper level table entries since - * they are always ACC_ALL. - */ - if (is_writable_pte(spte)) { - ret = true; - goto exit; - } + if (is_access_track_spte(spte)) + new_spte = restore_acc_track_spte(new_spte); - /* - * Currently, to simplify the code, only the spte write-protected - * by dirty-log can be fast fixed. - */ - if (!spte_is_locklessly_modifiable(spte)) - goto exit; + /* + * Currently, to simplify the code, write-protection can + * be removed in the fast path only if the SPTE was + * write-protected for dirty-logging or access tracking. + */ + if ((error_code & PFERR_WRITE_MASK) && + spte_can_locklessly_be_made_writable(spte)) + { + new_spte |= PT_WRITABLE_MASK; - /* - * Do not fix write-permission on the large spte since we only dirty - * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() - * that means other pages are missed if its slot is dirty-logged. - * - * Instead, we let the slow page fault path create a normal spte to - * fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). - */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) - goto exit; + /* + * Do not fix write-permission on the large spte. Since + * we only dirty the first page into the dirty-bitmap in + * fast_pf_fix_direct_spte(), other pages are missed + * if its slot has dirty logging enabled. + * + * Instead, we let the slow page fault path create a + * normal spte to fix the access. + * + * See the comments in kvm_arch_commit_memory_region(). + */ + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + break; + } + + /* Verify that the fault can be handled in the fast path */ + if (new_spte == spte || + !is_access_allowed(error_code, new_spte)) + break; + + /* + * Currently, fast page fault only works for direct mapping + * since the gfn is not stable for indirect shadow page. See + * Documentation/virtual/kvm/locking.txt to get more detail. + */ + fault_handled = fast_pf_fix_direct_spte(vcpu, sp, + iterator.sptep, spte, + new_spte); + if (fault_handled) + break; + + if (++retry_count > 4) { + printk_once(KERN_WARNING + "kvm: Fast #PF retrying more than 4 times.\n"); + break; + } + + } while (true); - /* - * Currently, fast page fault only works for direct mapping since - * the gfn is not stable for indirect shadow page. - * See Documentation/virtual/kvm/locking.txt to get more detail. - */ - ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); -exit: trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, - spte, ret); + spte, fault_handled); walk_shadow_page_lockless_end(vcpu); - return ret; + return fault_handled; } static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, @@ -5063,6 +5264,8 @@ static void mmu_destroy_caches(void) int kvm_mmu_module_init(void) { + kvm_mmu_clear_all_pte_masks(); + pte_list_desc_cache = kmem_cache_create("pte_list_desc", sizeof(struct pte_list_desc), 0, 0, NULL); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 08a4d3ab3455..d0414f054bdf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4182,6 +4182,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); + vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF); + if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a236decb81e4..7c3e42623090 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4953,7 +4953,7 @@ static bool vmx_get_enable_apicv(void) return enable_apicv; } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -4964,19 +4964,15 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmx->nested.pi_pending) { vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; + return; max_irr = find_last_bit( (unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr == 256) - return 0; + return; vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -4987,7 +4983,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } - return 0; } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) @@ -5236,10 +5231,8 @@ static void ept_set_mmio_spte_mask(void) /* * EPT Misconfigurations can be generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). - * Also, magic bits (0x3ull << 62) is set to quickly identify mmio - * spte. */ - kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull); + kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE); } #define VMX_XSS_EXIT_BITMAP 0 @@ -6152,7 +6145,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_apic_update_ppr(vcpu); return 1; } @@ -6374,14 +6367,20 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* it is a read fault? */ - error_code = (exit_qualification << 2) & PFERR_USER_MASK; - /* it is a write fault? */ - error_code |= exit_qualification & PFERR_WRITE_MASK; - /* It is a fetch fault? */ - error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; - /* ept page table is present? */ - error_code |= (exit_qualification & 0x38) != 0; + /* Is it a read fault? */ + error_code = (exit_qualification & EPT_VIOLATION_ACC_READ) + ? PFERR_USER_MASK : 0; + /* Is it a write fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE) + ? PFERR_WRITE_MASK : 0; + /* Is it a fetch fault? */ + error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR) + ? PFERR_FETCH_MASK : 0; + /* ept page table entry is present? */ + error_code |= (exit_qualification & + (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE | + EPT_VIOLATION_EXECUTABLE)) + ? PFERR_PRESENT_MASK : 0; vcpu->arch.exit_qualification = exit_qualification; @@ -6572,6 +6571,19 @@ static void wakeup_handler(void) spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); } +void vmx_enable_tdp(void) +{ + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull, + enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, + enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK); + + ept_set_mmio_spte_mask(); + kvm_enable_tdp(); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6697,16 +6709,9 @@ static __init int hardware_setup(void) /* SELF-IPI */ vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); - if (enable_ept) { - kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, - (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, - (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK, - cpu_has_vmx_ept_execute_only() ? - 0ull : VMX_EPT_READABLE_MASK); - ept_set_mmio_spte_mask(); - kvm_enable_tdp(); - } else + if (enable_ept) + vmx_enable_tdp(); + else kvm_disable_tdp(); update_ple_window_actual_max(); @@ -7168,9 +7173,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) - return 1; - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7182,6 +7184,9 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) + return 1; + if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); @@ -8191,8 +8196,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_TASK_SWITCH: return true; case EXIT_REASON_CPUID: - if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) - return false; return true; case EXIT_REASON_HLT: return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); @@ -9730,11 +9733,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; } msr_bitmap_l1 = (unsigned long *)kmap(page); - if (!msr_bitmap_l1) { - nested_release_page_clean(page); - WARN_ON(1); - return false; - } memset(msr_bitmap_l0, 0xff, PAGE_SIZE); @@ -10696,7 +10694,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); + vmx_complete_nested_posted_interrupt(vcpu); + return 0; } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d153be8929a6..500008f800dc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, + { "req_event", VCPU_STAT(req_event) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -190,6 +191,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, { NULL } }; @@ -1139,6 +1142,7 @@ struct pvclock_gtod_data { u64 boot_ns; u64 nsec_base; + u64 wall_time_sec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1162,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->wall_time_sec = tk->xtime_sec; + write_seqcount_end(&vdata->seq); } #endif @@ -1623,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } +static int do_realtime(struct timespec *ts, u64 *cycle_now) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->nsec_base; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) { @@ -1632,6 +1660,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_walltime_and_clockread(struct timespec *ts, + u64 *cycle_now) +{ + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + return do_realtime(ts, cycle_now) == VCLOCK_TSC; +} #endif /* @@ -3896,7 +3935,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. */ smp_wmb(); - kvm->arch.irqchip_split = true; + kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: @@ -3959,40 +3998,41 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; case KVM_CREATE_IRQCHIP: { - struct kvm_pic *vpic; - mutex_lock(&kvm->lock); + r = -EEXIST; - if (kvm->arch.vpic) + if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; - r = -ENOMEM; - vpic = kvm_create_pic(kvm); - if (vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - mutex_lock(&kvm->slots_lock); - kvm_destroy_pic(vpic); - mutex_unlock(&kvm->slots_lock); - goto create_irqchip_unlock; - } - } else + + r = kvm_pic_init(kvm); + if (r) goto create_irqchip_unlock; + + r = kvm_ioapic_init(kvm); + if (r) { + mutex_lock(&kvm->slots_lock); + kvm_pic_destroy(kvm); + mutex_unlock(&kvm->slots_lock); + goto create_irqchip_unlock; + } + r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(vpic); + kvm_pic_destroy(kvm); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } - /* Write kvm->irq_routing before kvm->arch.vpic. */ + /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); - kvm->arch.vpic = vpic; + kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -4028,7 +4068,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -4052,7 +4092,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) @@ -4461,6 +4501,21 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t gpa, bool write) +{ + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + if (vcpu_match_mmio_gpa(vcpu, gpa)) { + trace_vcpu_match_mmio(gva, gpa, write, true); + return 1; + } + + return 0; +} + static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) @@ -4487,16 +4542,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if (*gpa == UNMAPPED_GVA) return -1; - /* For APIC access vmexit */ - if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - return 1; - - if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); - return 1; - } - - return 0; + return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4593,6 +4639,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * If the exit was due to a NPF we may already have a GPA. + * If the GPA is present, use it to avoid the GVA to GPA table walk. + * Note, this cannot be used on string operations since string + * operation using rep will only have the initial GPA from the NPF + * occurred. + */ + if (vcpu->arch.gpa_available && + emulator_can_use_gpa(ctxt) && + vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && + (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { + gpa = exception->address; + goto mmio; + } ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5609,6 +5671,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2; + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) @@ -5923,9 +5988,6 @@ static void kvm_set_mmio_spte_mask(void) /* Mask the reserved physical address bits. */ mask = rsvd_bits(maxphyaddr, 51); - /* Bit 62 is always reserved for 32bit host. */ - mask |= 0x3ull << 62; - /* Set the present bit. */ mask |= 1ull; @@ -6024,7 +6086,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK); + PT_PRESENT_MASK, 0); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -6086,6 +6148,33 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, + unsigned long clock_type) +{ + struct kvm_clock_pairing clock_pairing; + struct timespec ts; + u64 cycle; + int ret; + + if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) + return -KVM_EOPNOTSUPP; + + if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + return -KVM_EOPNOTSUPP; + + clock_pairing.sec = ts.tv_sec; + clock_pairing.nsec = ts.tv_nsec; + clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); + clock_pairing.flags = 0; + + ret = 0; + if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, + sizeof(struct kvm_clock_pairing))) + ret = -KVM_EFAULT; + + return ret; +} + /* * kvm_pv_kick_cpu_op: Kick a vcpu. * @@ -6150,6 +6239,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); ret = 0; break; + case KVM_HC_CLOCK_PAIRING: + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; default: ret = -KVM_ENOSYS; break; @@ -6732,6 +6824,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { r = 1; |