diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/iommu.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/irq.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/irq_comm.c | 49 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 476 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 14 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 30 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/mtrr.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 10 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 27 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 343 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 124 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 7 |
16 files changed, 632 insertions, 471 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7597b42a8a88..3235e0fe7792 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -13,7 +13,7 @@ */ #include <linux/kvm_host.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */ @@ -366,7 +366,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index e17a74b1d852..35058c2c0eea 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -144,14 +144,6 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_RTM)); } -static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) -{ - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); -} - static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a2f24af3c999..4e95d3eb2955 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -22,7 +22,6 @@ #include <linux/kvm_host.h> #include "kvm_cache_regs.h" -#include <linux/module.h> #include <asm/kvm_emulate.h> #include <linux/stringify.h> #include <asm/debugreg.h> diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 4f2010c5feba..b181426f67b4 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c @@ -25,7 +25,7 @@ #include <linux/list.h> #include <linux/kvm_host.h> -#include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/pci.h> #include <linux/stat.h> #include <linux/iommu.h> diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 95fcc7b13866..60d91c9d160c 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -20,7 +20,7 @@ * */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/kvm_host.h> #include "irq.h" diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index dfb4c6476877..25810b144b58 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, +void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); + trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? + (u64)e->msi.address_hi << 32 : 0), + e->msi.data); irq->dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; + if (kvm->arch.x2apic_format) + irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); irq->vector = (e->msi.data & MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; @@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, } EXPORT_SYMBOL_GPL(kvm_set_msi_irq); +static inline bool kvm_msi_route_invalid(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e) +{ + return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff); +} + int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { struct kvm_lapic_irq irq; + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + if (!level) return -1; - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL); } @@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) return -EWOULDBLOCK; - kvm_set_msi_irq(e, &irq); + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + + kvm_set_msi_irq(kvm, e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) return r; @@ -248,7 +264,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e, return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint); } -int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { int r = -EINVAL; @@ -285,6 +302,9 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, e->msi.address_lo = ue->u.msi.address_lo; e->msi.address_hi = ue->u.msi.address_hi; e->msi.data = ue->u.msi.data; + + if (kvm_msi_route_invalid(kvm, e)) + goto out; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -388,21 +408,16 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm->arch.nr_reserved_ioapic_pins); for (i = 0; i < nr_ioapic_pins; ++i) { hlist_for_each_entry(entry, &table->map[i], link) { - u32 dest_id, dest_mode; - bool level; + struct kvm_lapic_irq irq; if (entry->type != KVM_IRQ_ROUTING_MSI) continue; - dest_id = (entry->msi.address_lo >> 12) & 0xff; - dest_mode = (entry->msi.address_lo >> 2) & 0x1; - level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; - if (level && kvm_apic_match_dest(vcpu, NULL, 0, - dest_id, dest_mode)) { - u32 vector = entry->msi.data & 0xff; - - __set_bit(vector, - ioapic_handled_vectors); - } + + kvm_set_msi_irq(vcpu->kvm, entry, &irq); + + if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0, + irq.dest_id, irq.dest_mode)) + __set_bit(irq.vector, ioapic_handled_vectors); } } srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fdc05ae08bac..730cf174090a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -25,7 +25,7 @@ #include <linux/smp.h> #include <linux/hrtimer.h> #include <linux/io.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/math64.h> #include <linux/slab.h> #include <asm/processor.h> @@ -115,26 +115,43 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -/* The logical map is definitely wrong if we have multiple - * modes at the same time. (Physical map is always right.) - */ -static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map) -{ - return !(map->mode & (map->mode - 1)); +static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, + u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) { + switch (map->mode) { + case KVM_APIC_MODE_X2APIC: { + u32 offset = (dest_id >> 16) * 16; + u32 max_apic_id = map->max_apic_id; + + if (offset <= max_apic_id) { + u8 cluster_size = min(max_apic_id - offset + 1, 16U); + + *cluster = &map->phys_map[offset]; + *mask = dest_id & (0xffff >> (16 - cluster_size)); + } else { + *mask = 0; + } + + return true; + } + case KVM_APIC_MODE_XAPIC_FLAT: + *cluster = map->xapic_flat_map; + *mask = dest_id & 0xff; + return true; + case KVM_APIC_MODE_XAPIC_CLUSTER: + *cluster = map->xapic_cluster_map[dest_id >> 4]; + *mask = dest_id & 0xf; + return true; + default: + /* Not optimized. */ + return false; + } } -static inline void -apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid) +static void kvm_apic_map_free(struct rcu_head *rcu) { - unsigned lid_bits; - - BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4); - BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8); - BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16); - lid_bits = map->mode; + struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu); - *cid = dest_id >> lid_bits; - *lid = dest_id & ((1 << lid_bits) - 1); + kvfree(map); } static void recalculate_apic_map(struct kvm *kvm) @@ -142,17 +159,26 @@ static void recalculate_apic_map(struct kvm *kvm) struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; - - new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL); + u32 max_id = 255; mutex_lock(&kvm->arch.apic_map_lock); + kvm_for_each_vcpu(i, vcpu, kvm) + if (kvm_apic_present(vcpu)) + max_id = max(max_id, kvm_apic_id(vcpu->arch.apic)); + + new = kvm_kvzalloc(sizeof(struct kvm_apic_map) + + sizeof(struct kvm_lapic *) * ((u64)max_id + 1)); + if (!new) goto out; + new->max_apic_id = max_id; + kvm_for_each_vcpu(i, vcpu, kvm) { struct kvm_lapic *apic = vcpu->arch.apic; - u16 cid, lid; + struct kvm_lapic **cluster; + u16 mask; u32 ldr, aid; if (!kvm_apic_present(vcpu)) @@ -161,7 +187,7 @@ static void recalculate_apic_map(struct kvm *kvm) aid = kvm_apic_id(apic); ldr = kvm_lapic_get_reg(apic, APIC_LDR); - if (aid < ARRAY_SIZE(new->phys_map)) + if (aid <= new->max_apic_id) new->phys_map[aid] = apic; if (apic_x2apic_mode(apic)) { @@ -174,13 +200,11 @@ static void recalculate_apic_map(struct kvm *kvm) new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER; } - if (!kvm_apic_logical_map_valid(new)) + if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask)) continue; - apic_logical_id(new, ldr, &cid, &lid); - - if (lid && cid < ARRAY_SIZE(new->logical_map)) - new->logical_map[cid][ffs(lid) - 1] = apic; + if (mask) + cluster[ffs(mask) - 1] = apic; } out: old = rcu_dereference_protected(kvm->arch.apic_map, @@ -189,7 +213,7 @@ out: mutex_unlock(&kvm->arch.apic_map_lock); if (old) - kfree_rcu(old, rcu); + call_rcu(&old->rcu, kvm_apic_map_free); kvm_make_scan_ioapic_request(kvm); } @@ -210,7 +234,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) } } -static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) +static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); recalculate_apic_map(apic->vcpu->kvm); @@ -222,11 +246,11 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } -static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) +static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) { u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_lapic_set_reg(apic, APIC_ID, id << 24); + kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); recalculate_apic_map(apic->vcpu->kvm); } @@ -599,17 +623,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) } } -/* KVM APIC implementation has two quirks - * - dest always begins at 0 while xAPIC MDA has offset 24, - * - IOxAPIC messages have to be delivered (directly) to x2APIC. +/* The KVM local APIC implementation has two quirks: + * + * - the xAPIC MDA stores the destination at bits 24-31, while this + * is not true of struct kvm_lapic_irq's dest_id field. This is + * just a quirk in the API and is not problematic. + * + * - in-kernel IOAPIC messages have to be delivered directly to + * x2APIC, because the kernel does not support interrupt remapping. + * In order to support broadcast without interrupt remapping, x2APIC + * rewrites the destination of non-IPI messages from APIC_BROADCAST + * to X2APIC_BROADCAST. + * + * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is + * important when userspace wants to use x2APIC-format MSIs, because + * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7". */ -static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source, - struct kvm_lapic *target) +static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id, + struct kvm_lapic *source, struct kvm_lapic *target) { bool ipi = source != NULL; bool x2apic_mda = apic_x2apic_mode(ipi ? source : target); - if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda) + if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled && + !ipi && dest_id == APIC_BROADCAST && x2apic_mda) return X2APIC_BROADCAST; return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id); @@ -619,7 +656,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode) { struct kvm_lapic *target = vcpu->arch.apic; - u32 mda = kvm_apic_mda(dest, source, target); + u32 mda = kvm_apic_mda(vcpu, dest, source, target); apic_debug("target %p, source %p, dest 0x%x, " "dest_mode 0x%x, short_hand 0x%x\n", @@ -671,102 +708,126 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm) } } -bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, - struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) +static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src, + struct kvm_lapic_irq *irq, struct kvm_apic_map *map) { - struct kvm_apic_map *map; - unsigned long bitmap = 1; - struct kvm_lapic **dst; - int i; - bool ret, x2apic_ipi; + if (kvm->arch.x2apic_broadcast_quirk_disabled) { + if ((irq->dest_id == APIC_BROADCAST && + map->mode != KVM_APIC_MODE_X2APIC)) + return true; + if (irq->dest_id == X2APIC_BROADCAST) + return true; + } else { + bool x2apic_ipi = src && *src && apic_x2apic_mode(*src); + if (irq->dest_id == (x2apic_ipi ? + X2APIC_BROADCAST : APIC_BROADCAST)) + return true; + } - *r = -1; + return false; +} - if (irq->shorthand == APIC_DEST_SELF) { - *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); - return true; - } +/* Return true if the interrupt can be handled by using *bitmap as index mask + * for valid destinations in *dst array. + * Return false if kvm_apic_map_get_dest_lapic did nothing useful. + * Note: we may have zero kvm_lapic destinations when we return true, which + * means that the interrupt should be dropped. In this case, *bitmap would be + * zero and *dst undefined. + */ +static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, + struct kvm_lapic **src, struct kvm_lapic_irq *irq, + struct kvm_apic_map *map, struct kvm_lapic ***dst, + unsigned long *bitmap) +{ + int i, lowest; - if (irq->shorthand) + if (irq->shorthand == APIC_DEST_SELF && src) { + *dst = src; + *bitmap = 1; + return true; + } else if (irq->shorthand) return false; - x2apic_ipi = src && apic_x2apic_mode(src); - if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) + if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map)) return false; - ret = true; - rcu_read_lock(); - map = rcu_dereference(kvm->arch.apic_map); - - if (!map) { - ret = false; - goto out; + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id > map->max_apic_id) { + *bitmap = 0; + } else { + *dst = &map->phys_map[irq->dest_id]; + *bitmap = 1; + } + return true; } - if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) - goto out; + *bitmap = 0; + if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst, + (u16 *)bitmap)) + return false; - dst = &map->phys_map[irq->dest_id]; - } else { - u16 cid; + if (!kvm_lowest_prio_delivery(irq)) + return true; - if (!kvm_apic_logical_map_valid(map)) { - ret = false; - goto out; + if (!kvm_vector_hashing_enabled()) { + lowest = -1; + for_each_set_bit(i, bitmap, 16) { + if (!(*dst)[i]) + continue; + if (lowest < 0) + lowest = i; + else if (kvm_apic_compare_prio((*dst)[i]->vcpu, + (*dst)[lowest]->vcpu) < 0) + lowest = i; } + } else { + if (!*bitmap) + return true; - apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); + lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap), + bitmap, 16); - if (cid >= ARRAY_SIZE(map->logical_map)) - goto out; + if (!(*dst)[lowest]) { + kvm_apic_disabled_lapic_found(kvm); + *bitmap = 0; + return true; + } + } - dst = map->logical_map[cid]; + *bitmap = (lowest >= 0) ? 1 << lowest : 0; - if (!kvm_lowest_prio_delivery(irq)) - goto set_irq; + return true; +} - if (!kvm_vector_hashing_enabled()) { - int l = -1; - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (l < 0) - l = i; - else if (kvm_apic_compare_prio(dst[i]->vcpu, - dst[l]->vcpu) < 0) - l = i; - } - bitmap = (l >= 0) ? 1 << l : 0; - } else { - int idx; - unsigned int dest_vcpus; +bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, + struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map) +{ + struct kvm_apic_map *map; + unsigned long bitmap; + struct kvm_lapic **dst = NULL; + int i; + bool ret; - dest_vcpus = hweight16(bitmap); - if (dest_vcpus == 0) - goto out; + *r = -1; - idx = kvm_vector_to_index(irq->vector, - dest_vcpus, &bitmap, 16); + if (irq->shorthand == APIC_DEST_SELF) { + *r = kvm_apic_set_irq(src->vcpu, irq, dest_map); + return true; + } - if (!dst[idx]) { - kvm_apic_disabled_lapic_found(kvm); - goto out; - } + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); - bitmap = (idx >= 0) ? 1 << idx : 0; + ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap); + if (ret) + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (*r < 0) + *r = 0; + *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); } - } -set_irq: - for_each_set_bit(i, &bitmap, 16) { - if (!dst[i]) - continue; - if (*r < 0) - *r = 0; - *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map); - } -out: rcu_read_unlock(); return ret; } @@ -789,8 +850,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu) { struct kvm_apic_map *map; + unsigned long bitmap; + struct kvm_lapic **dst = NULL; bool ret = false; - struct kvm_lapic *dst = NULL; if (irq->shorthand) return false; @@ -798,69 +860,16 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) - goto out; - - if (irq->dest_mode == APIC_DEST_PHYSICAL) { - if (irq->dest_id == 0xFF) - goto out; - - if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) - goto out; - - dst = map->phys_map[irq->dest_id]; - if (dst && kvm_apic_present(dst->vcpu)) - *dest_vcpu = dst->vcpu; - else - goto out; - } else { - u16 cid; - unsigned long bitmap = 1; - int i, r = 0; - - if (!kvm_apic_logical_map_valid(map)) - goto out; - - apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); - - if (cid >= ARRAY_SIZE(map->logical_map)) - goto out; - - if (kvm_vector_hashing_enabled() && - kvm_lowest_prio_delivery(irq)) { - int idx; - unsigned int dest_vcpus; + if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) && + hweight16(bitmap) == 1) { + unsigned long i = find_first_bit(&bitmap, 16); - dest_vcpus = hweight16(bitmap); - if (dest_vcpus == 0) - goto out; - - idx = kvm_vector_to_index(irq->vector, dest_vcpus, - &bitmap, 16); - - dst = map->logical_map[cid][idx]; - if (!dst) { - kvm_apic_disabled_lapic_found(kvm); - goto out; - } - - *dest_vcpu = dst->vcpu; - } else { - for_each_set_bit(i, &bitmap, 16) { - dst = map->logical_map[cid][i]; - if (++r == 2) - goto out; - } - - if (dst && kvm_apic_present(dst->vcpu)) - *dest_vcpu = dst->vcpu; - else - goto out; + if (dst[i]) { + *dest_vcpu = dst[i]->vcpu; + ret = true; } } - ret = true; -out: rcu_read_unlock(); return ret; } @@ -1127,12 +1136,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) return 0; switch (offset) { - case APIC_ID: - if (apic_x2apic_mode(apic)) - val = kvm_apic_id(apic); - else - val = kvm_apic_id(apic) << 24; - break; case APIC_ARBPRI: apic_debug("Access APIC ARBPRI register which is for P6\n"); break; @@ -1310,7 +1313,8 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ if (guest_tsc < tsc_deadline) - __delay(tsc_deadline - guest_tsc); + __delay(min(tsc_deadline - guest_tsc, + nsec_to_cycles(vcpu, lapic_timer_advance_ns))); } static void start_sw_tscdeadline(struct kvm_lapic *apic) @@ -1349,41 +1353,52 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); +static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +{ + kvm_x86_ops->cancel_hv_timer(apic->vcpu); + apic->lapic_timer.hv_timer_in_use = false; +} + void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(!apic->lapic_timer.hv_timer_in_use); WARN_ON(swait_active(&vcpu->wq)); - kvm_x86_ops->cancel_hv_timer(vcpu); - apic->lapic_timer.hv_timer_in_use = false; + cancel_hv_tscdeadline(apic); apic_timer_expired(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); +static bool start_hv_tscdeadline(struct kvm_lapic *apic) +{ + u64 tscdeadline = apic->lapic_timer.tscdeadline; + + if (atomic_read(&apic->lapic_timer.pending) || + kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { + if (apic->lapic_timer.hv_timer_in_use) + cancel_hv_tscdeadline(apic); + } else { + apic->lapic_timer.hv_timer_in_use = true; + hrtimer_cancel(&apic->lapic_timer.timer); + + /* In case the sw timer triggered in the window */ + if (atomic_read(&apic->lapic_timer.pending)) + cancel_hv_tscdeadline(apic); + } + trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, + apic->lapic_timer.hv_timer_in_use); + return apic->lapic_timer.hv_timer_in_use; +} + void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; WARN_ON(apic->lapic_timer.hv_timer_in_use); - if (apic_lvtt_tscdeadline(apic) && - !atomic_read(&apic->lapic_timer.pending)) { - u64 tscdeadline = apic->lapic_timer.tscdeadline; - - if (!kvm_x86_ops->set_hv_timer(vcpu, tscdeadline)) { - apic->lapic_timer.hv_timer_in_use = true; - hrtimer_cancel(&apic->lapic_timer.timer); - - /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) { - apic->lapic_timer.hv_timer_in_use = false; - kvm_x86_ops->cancel_hv_timer(apic->vcpu); - } - } - trace_kvm_hv_timer_state(vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - } + if (apic_lvtt_tscdeadline(apic)) + start_hv_tscdeadline(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1395,8 +1410,7 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) if (!apic->lapic_timer.hv_timer_in_use) return; - kvm_x86_ops->cancel_hv_timer(vcpu); - apic->lapic_timer.hv_timer_in_use = false; + cancel_hv_tscdeadline(apic); if (atomic_read(&apic->lapic_timer.pending)) return; @@ -1451,15 +1465,7 @@ static void start_apic_timer(struct kvm_lapic *apic) ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); } else if (apic_lvtt_tscdeadline(apic)) { - /* lapic timer in tsc deadline mode */ - u64 tscdeadline = apic->lapic_timer.tscdeadline; - - if (kvm_x86_ops->set_hv_timer && - !kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { - apic->lapic_timer.hv_timer_in_use = true; - trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, - apic->lapic_timer.hv_timer_in_use); - } else + if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) start_sw_tscdeadline(apic); } } @@ -1488,7 +1494,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) switch (reg) { case APIC_ID: /* Local APIC ID */ if (!apic_x2apic_mode(apic)) - kvm_apic_set_id(apic, val >> 24); + kvm_apic_set_xapic_id(apic, val >> 24); else ret = 1; break; @@ -1749,9 +1755,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { - if (value & MSR_IA32_APICBASE_ENABLE) + if (value & MSR_IA32_APICBASE_ENABLE) { + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_key_slow_dec_deferred(&apic_hw_disabled); - else + } else static_key_slow_inc(&apic_hw_disabled.key); recalculate_apic_map(vcpu->kvm); } @@ -1791,8 +1798,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - if (!init_event) - kvm_apic_set_id(apic, vcpu->vcpu_id); + if (!init_event) { + kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE); + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); + } kvm_apic_set_version(apic->vcpu); for (i = 0; i < KVM_APIC_LVT_NUM; i++) @@ -1931,9 +1941,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) * thinking that APIC satet has changed. */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; - kvm_lapic_set_base(vcpu, - APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); - static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); @@ -2013,17 +2020,48 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) return vector; } -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s) +static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu, + struct kvm_lapic_state *s, bool set) +{ + if (apic_x2apic_mode(vcpu->arch.apic)) { + u32 *id = (u32 *)(s->regs + APIC_ID); + + if (vcpu->kvm->arch.x2apic_format) { + if (*id != vcpu->vcpu_id) + return -EINVAL; + } else { + if (set) + *id >>= 24; + else + *id <<= 24; + } + } + + return 0; +} + +int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) +{ + memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s)); + return kvm_apic_state_fixup(vcpu, s, false); +} + +int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { struct kvm_lapic *apic = vcpu->arch.apic; + int r; + kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); + + r = kvm_apic_state_fixup(vcpu, s, true); + if (r) + return r; memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); - /* call kvm_apic_set_id() to put apic into apic_map */ - kvm_apic_set_id(apic, kvm_apic_id(apic)); + + recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); apic_update_ppr(apic); @@ -2049,6 +2087,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_rtc_eoi_tracking_restore_one(vcpu); vcpu->arch.apic_arb_prio = 0; + + return 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 336ba51bb16e..f60d01c29d51 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -81,8 +81,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); -void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, - struct kvm_lapic_state *s); +int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); +int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); @@ -200,9 +200,15 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } -static inline int kvm_apic_id(struct kvm_lapic *apic) +static inline u32 kvm_apic_id(struct kvm_lapic *apic) { - return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff; + /* To avoid a race between apic_base and following APIC_ID update when + * switching to x2apic_mode, the x2apic mode returns initial x2apic id. + */ + if (apic_x2apic_mode(apic)) + return apic->vcpu->vcpu_id; + + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 837bf23c5b06..3d4cc8cc56a3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -29,7 +29,8 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/highmem.h> -#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/export.h> #include <linux/swap.h> #include <linux/hugetlb.h> #include <linux/compiler.h> @@ -175,6 +176,7 @@ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; +static u64 __read_mostly shadow_present_mask; static void mmu_spte_set(u64 *sptep, u64 spte); static void mmu_free_roots(struct kvm_vcpu *vcpu); @@ -282,13 +284,14 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) } void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; + shadow_present_mask = p_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -304,7 +307,7 @@ static int is_nx(struct kvm_vcpu *vcpu) static int is_shadow_present_pte(u64 pte) { - return pte & PT_PRESENT_MASK && !is_mmio_spte(pte); + return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte); } static int is_large_pte(u64 pte) @@ -2245,10 +2248,9 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, { u64 spte; - BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || - VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); + BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); - spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | + spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | shadow_user_mask | shadow_x_mask | shadow_accessed_mask; mmu_spte_set(sptep, spte); @@ -2515,13 +2517,19 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte; + u64 spte = 0; int ret = 0; if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; - spte = PT_PRESENT_MASK; + /* + * For the EPT case, shadow_present_mask is 0 if hardware + * supports exec-only page table entries. In that case, + * ACC_USER_MASK and shadow_user_mask are used to represent + * read access. See FNAME(gpte_access) in paging_tmpl.h. + */ + spte |= shadow_present_mask; if (!speculative) spte |= shadow_accessed_mask; @@ -3189,7 +3197,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) MMU_WARN_ON(VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); - if (!is_present_gpte(pdptr)) { + if (!(pdptr & PT_PRESENT_MASK)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } @@ -3914,9 +3922,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, * clearer. */ smap = cr4_smap && u && !uf && !ff; - } else - /* Not really needed: no U/S accesses on ept */ - u = 1; + } fault = (ff && !x) || (uf && !u) || (wf && !w) || (smapf && smap); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 66b33b96a31b..ddc56e91f2e4 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -93,11 +93,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) return kvm_mmu_load(vcpu); } -static inline int is_present_gpte(unsigned long pte) -{ - return pte & PT_PRESENT_MASK; -} - /* * Currently, we have two sorts of write-protection, a) the first one * write-protects guest page to sync the guest modification, b) another one is diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index c146f3c262c3..0149ac59c273 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -539,6 +539,7 @@ static void mtrr_lookup_var_start(struct mtrr_iter *iter) iter->fixed = false; iter->start_max = iter->start; + iter->range = NULL; iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node); __mtrr_lookup_var_next(iter); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bc019f70e0b6..a01105485315 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -131,7 +131,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) static inline int FNAME(is_present_gpte)(unsigned long pte) { #if PTTYPE != PTTYPE_EPT - return is_present_gpte(pte); + return pte & PT_PRESENT_MASK; #else return pte & 7; #endif @@ -181,13 +181,19 @@ no_present: return true; } +/* + * For PTTYPE_EPT, a page table can be executable but not readable + * on supported processors. Therefore, set_spte does not automatically + * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK + * to signify readability since it isn't used in the EPT case + */ static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | - ACC_USER_MASK; + ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0); #else BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); BUILD_BUG_ON(ACC_EXEC_MASK != 1); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5ff292778110..af523d84d102 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -238,7 +238,9 @@ module_param(nested, int, S_IRUGO); /* enable / disable AVIC */ static int avic; +#ifdef CONFIG_X86_LOCAL_APIC module_param(avic, int, S_IRUGO); +#endif static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); @@ -981,11 +983,14 @@ static __init int svm_hardware_setup(void) } else kvm_disable_tdp(); - if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC))) - avic = false; - - if (avic) - pr_info("AVIC enabled\n"); + if (avic) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_AVIC) || + !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) + avic = false; + else + pr_info("AVIC enabled\n"); + } return 0; @@ -1324,7 +1329,7 @@ free_avic: static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) { u64 entry; - int h_physical_id = __default_cpu_present_to_apicid(vcpu->cpu); + int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); if (!kvm_vcpu_apicv_active(vcpu)) @@ -1349,7 +1354,7 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; /* ID = 0xff (broadcast), ID > 0xff (reserved) */ - int h_physical_id = __default_cpu_present_to_apicid(cpu); + int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); if (!kvm_vcpu_apicv_active(vcpu)) @@ -4236,7 +4241,7 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) if (avic_vcpu_is_running(vcpu)) wrmsrl(SVM_AVIC_DOORBELL, - __default_cpu_present_to_apicid(vcpu->cpu)); + kvm_cpu_get_apicid(vcpu->cpu)); else kvm_vcpu_wake_up(vcpu); } @@ -4935,6 +4940,12 @@ out: static void svm_handle_external_intr(struct kvm_vcpu *vcpu) { local_irq_enable(); + /* + * We must have an instruction with interrupts enabled, so + * the timer interrupt isn't delayed by the interrupt shadow. + */ + asm("nop"); + local_irq_disable(); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e185649fb8b7..bc354f003ce1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -405,6 +405,12 @@ struct nested_vmx { /* The host-usable pointer to the above */ struct page *current_vmcs12_page; struct vmcs12 *current_vmcs12; + /* + * Cache of the guest's VMCS, existing outside of guest memory. + * Loaded from guest memory during VMPTRLD. Flushed to guest + * memory during VMXOFF, VMCLEAR, VMPTRLD. + */ + struct vmcs12 *cached_vmcs12; struct vmcs *current_shadow_vmcs; /* * Indicates if the shadow vmcs must be updated with the @@ -428,7 +434,6 @@ struct nested_vmx { struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; - u64 msr_ia32_feature_control; struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -612,6 +617,14 @@ struct vcpu_vmx { bool guest_pkru_valid; u32 guest_pkru; u32 host_pkru; + + /* + * Only bits masked by msr_ia32_feature_control_valid_bits can be set in + * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included + * in msr_ia32_feature_control_valid_bits. + */ + u64 msr_ia32_feature_control; + u64 msr_ia32_feature_control_valid_bits; }; enum segment_cache_field { @@ -851,7 +864,7 @@ static inline short vmcs_field_to_offset(unsigned long field) static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { - return to_vmx(vcpu)->nested.current_vmcs12; + return to_vmx(vcpu)->nested.cached_vmcs12; } static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) @@ -1105,7 +1118,7 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) /* Clear the reserved bits */ eax &= ~(0x3U << 14 | 0xfU << 28); - for (i = 0; i < sizeof(vmx_preemption_cpu_tfms)/sizeof(u32); i++) + for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++) if (eax == vmx_preemption_cpu_tfms[i]) return true; @@ -1114,9 +1127,6 @@ static inline bool cpu_has_broken_vmx_preemption_timer(void) static inline bool cpu_has_vmx_preemption_timer(void) { - if (cpu_has_broken_vmx_preemption_timer()) - return false; - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER; } @@ -1668,6 +1678,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask) __vmcs_writel(field, __vmcs_readl(field) | mask); } +static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS); +} + static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_ENTRY_CONTROLS, val); @@ -1696,6 +1711,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val) vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val); } +static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx) +{ + vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS); +} + static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val) { vmcs_write32(VM_EXIT_CONTROLS, val); @@ -2137,7 +2157,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) unsigned int dest; if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) return; do { @@ -2185,22 +2206,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + bool already_loaded = vmx->loaded_vmcs->cpu == cpu; if (!vmm_exclusive) kvm_cpu_vmxon(phys_addr); - else if (vmx->loaded_vmcs->cpu != cpu) + else if (!already_loaded) loaded_vmcs_clear(vmx->loaded_vmcs); - if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { - per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; - vmcs_load(vmx->loaded_vmcs->vmcs); - } - - if (vmx->loaded_vmcs->cpu != cpu) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - unsigned long sysenter_esp; - - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + if (!already_loaded) { local_irq_disable(); crash_disable_local_vmclear(cpu); @@ -2215,6 +2228,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) &per_cpu(loaded_vmcss_on_cpu, cpu)); crash_enable_local_vmclear(cpu); local_irq_enable(); + } + + if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { + per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; + vmcs_load(vmx->loaded_vmcs->vmcs); + } + + if (!already_loaded) { + struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); + unsigned long sysenter_esp; + + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); /* * Linux uses per-cpu TSS and GDT, so set these when switching @@ -2245,7 +2270,8 @@ static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) return; /* Set SN when the vCPU is preempted */ @@ -2770,8 +2796,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_PCOMMIT; + SECONDARY_EXEC_XSAVES; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -2780,6 +2805,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | VMX_EPT_INVEPT_BIT; + if (cpu_has_vmx_ept_execute_only()) + vmx->nested.nested_vmx_ept_caps |= + VMX_EPT_EXECUTE_ONLY_BIT; vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; /* * For nested guests, we don't do anything specific @@ -2928,6 +2956,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return 0; } +static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, + uint64_t val) +{ + uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits; + + return !(val & ~valid_bits); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -2969,10 +3005,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; - case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu)) + case MSR_IA32_MCG_EXT_CTL: + if (!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) return 1; - msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + msr_info->data = vcpu->arch.mcg_ext_ctl; + break; + case MSR_IA32_FEATURE_CONTROL: + msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) @@ -3062,12 +3103,20 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC_ADJUST: ret = kvm_set_msr_common(vcpu, msr_info); break; + case MSR_IA32_MCG_EXT_CTL: + if ((!msr_info->host_initiated && + !(to_vmx(vcpu)->msr_ia32_feature_control & + FEATURE_CONTROL_LMCE)) || + (data & ~MCG_EXT_CTL_LMCE_EN)) + return 1; + vcpu->arch.mcg_ext_ctl = data; + break; case MSR_IA32_FEATURE_CONTROL: - if (!nested_vmx_allowed(vcpu) || - (to_vmx(vcpu)->nested.msr_ia32_feature_control & + if (!vmx_feature_control_msr_valid(vcpu, data) || + (to_vmx(vcpu)->msr_ia32_feature_control & FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) return 1; - vmx->nested.msr_ia32_feature_control = data; + vmx->msr_ia32_feature_control = data; if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; @@ -3333,7 +3382,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_PCOMMIT | SECONDARY_EXEC_TSC_SCALING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, @@ -3362,12 +3410,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) vmx_capability.ept, vmx_capability.vpid); } - min = VM_EXIT_SAVE_DEBUG_CONTROLS; + min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT; #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | - VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; @@ -3379,9 +3427,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_pin_based_exec_control) < 0) return -EIO; + if (cpu_has_broken_vmx_preemption_timer()) + _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; if (!(_cpu_based_2nd_exec_control & - SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || - !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)) _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; min = VM_ENTRY_LOAD_DEBUG_CONTROLS; @@ -4924,9 +4973,6 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_pml) exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - /* Currently, we allow L1 guest to directly run pcommit instruction. */ - exec_control &= ~SECONDARY_EXEC_PCOMMIT; - return exec_control; } @@ -4971,9 +5017,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) + if (cpu_has_secondary_exec_ctrls()) { vmcs_write32(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); + } if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -5046,6 +5093,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (vmx_xsaves_supported()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); + if (enable_pml) { + ASSERT(vmx->pml_pg); + vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); + vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); + } + return 0; } @@ -6081,12 +6134,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - /* It is a write fault? */ - error_code = exit_qualification & PFERR_WRITE_MASK; + /* it is a read fault? */ + error_code = (exit_qualification << 2) & PFERR_USER_MASK; + /* it is a write fault? */ + error_code |= exit_qualification & PFERR_WRITE_MASK; /* It is a fetch fault? */ error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; /* ept page table is present? */ - error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; + error_code |= (exit_qualification & 0x38) != 0; vcpu->arch.exit_qualification = exit_qualification; @@ -6420,9 +6475,6 @@ static __init int hardware_setup(void) for (msr = 0x800; msr <= 0x8ff; msr++) vmx_disable_intercept_msr_read_x2apic(msr); - /* According SDM, in x2apic mode, the whole id reg is used. But in - * KVM, it only use the highest eight bits. Need to intercept it */ - vmx_enable_intercept_msr_read_x2apic(0x802); /* TMCCT */ vmx_enable_intercept_msr_read_x2apic(0x839); /* TPR */ @@ -6433,10 +6485,12 @@ static __init int hardware_setup(void) vmx_disable_intercept_msr_write_x2apic(0x83f); if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, + kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, - 0ull, VMX_EPT_EXECUTABLE_MASK); + 0ull, VMX_EPT_EXECUTABLE_MASK, + cpu_has_vmx_ept_execute_only() ? + 0ull : VMX_EPT_READABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else @@ -6471,6 +6525,8 @@ static __init int hardware_setup(void) kvm_set_posted_intr_wakeup_handler(wakeup_handler); + kvm_mce_cap_supported |= MCG_LMCE_P; + return alloc_kvm_area(); out8: @@ -6749,7 +6805,13 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, /* Checks for #GP/#SS exceptions. */ exn = false; - if (is_protmode(vcpu)) { + if (is_long_mode(vcpu)) { + /* Long mode: #GP(0)/#SS(0) if the memory address is in a + * non-canonical form. This is the only check on the memory + * destination for long mode! + */ + exn = is_noncanonical_address(*ret); + } else if (is_protmode(vcpu)) { /* Protected mode: apply checks for segment validity in the * following order: * - segment type check (#GP(0) may be thrown) @@ -6766,17 +6828,10 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu, * execute-only code segment */ exn = ((s.type & 0xa) == 8); - } - if (exn) { - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - if (is_long_mode(vcpu)) { - /* Long mode: #GP(0)/#SS(0) if the memory address is in a - * non-canonical form. This is an only check for long mode. - */ - exn = is_noncanonical_address(*ret); - } else if (is_protmode(vcpu)) { + if (exn) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. */ exn = (s.unusable != 0); @@ -6939,16 +6994,22 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } - if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) + if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { kvm_inject_gp(vcpu, 0); return 1; } + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); + if (!vmx->nested.cached_vmcs12) + return -ENOMEM; + if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) + if (!shadow_vmcs) { + kfree(vmx->nested.cached_vmcs12); return -ENOMEM; + } /* mark vmcs as shadow */ shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ @@ -7019,6 +7080,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; + + /* Flush VMCS12 to guest memory */ + memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12, + VMCS12_SIZE); + kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); vmx->nested.current_vmptr = -1ull; @@ -7039,6 +7105,7 @@ static void free_nested(struct vcpu_vmx *vmx) nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); + kfree(vmx->nested.cached_vmcs12); /* Unpin physical memory we referred to in current vmcs02 */ if (vmx->nested.apic_access_page) { nested_release_page(vmx->nested.apic_access_page); @@ -7442,6 +7509,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; + /* + * Load VMCS12 from guest memory since it is not already + * cached. + */ + memcpy(vmx->nested.cached_vmcs12, + vmx->nested.current_vmcs12, VMCS12_SIZE); + if (enable_shadow_vmcs) { vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); @@ -7637,13 +7711,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } -static int handle_pcommit(struct kvm_vcpu *vcpu) -{ - /* we never catch pcommit instruct for L1 guest. */ - WARN_ON(1); - return 1; -} - static int handle_preemption_timer(struct kvm_vcpu *vcpu) { kvm_lapic_expired_hv_timer(vcpu); @@ -7700,7 +7767,6 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, - [EXIT_REASON_PCOMMIT] = handle_pcommit, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, }; @@ -8010,8 +8076,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); - case EXIT_REASON_PCOMMIT: - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); + case EXIT_REASON_PREEMPTION_TIMER: + return false; default: return true; } @@ -8023,22 +8089,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static int vmx_create_pml_buffer(struct vcpu_vmx *vmx) -{ - struct page *pml_pg; - - pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!pml_pg) - return -ENOMEM; - - vmx->pml_pg = pml_pg; - - vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); - vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - - return 0; -} - static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { if (vmx->pml_pg) { @@ -8310,6 +8360,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_PML_FULL && exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; @@ -8412,7 +8463,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) * the next L2->L1 exit. */ if (!is_guest_mode(vcpu) || - !nested_cpu_has2(vmx->nested.current_vmcs12, + !nested_cpu_has2(get_vmcs12(&vmx->vcpu), SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) vmcs_write64(APIC_ACCESS_ADDR, hpa); } @@ -8545,7 +8596,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) "push %[sp]\n\t" #endif "pushf\n\t" - "orl $0x200, (%%" _ASM_SP ")\n\t" __ASM_SIZE(push) " $%c[cs]\n\t" "call *%[entry]\n\t" : @@ -8558,8 +8608,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); - } else - local_irq_enable(); + } } static bool vmx_has_high_real_mode_segbase(void) @@ -8962,6 +9011,22 @@ static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) put_cpu(); } +/* + * Ensure that the current vmcs of the logical processor is the + * vmcs01 of the vcpu before calling free_nested(). + */ +static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int r; + + r = vcpu_load(vcpu); + BUG_ON(r); + vmx_load_vmcs01(vcpu); + free_nested(vmx); + vcpu_put(vcpu); +} + static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -8970,8 +9035,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); - free_nested(vmx); + vmx_free_vcpu_nested(vcpu); free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); @@ -8993,14 +9057,26 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_vcpu; + err = -ENOMEM; + + /* + * If PML is turned on, failure on enabling PML just results in failure + * of creating the vcpu, therefore we can simplify PML logic (by + * avoiding dealing with cases, such as enabling PML partially on vcpus + * for the guest, etc. + */ + if (enable_pml) { + vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!vmx->pml_pg) + goto uninit_vcpu; + } + vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) > PAGE_SIZE); - err = -ENOMEM; - if (!vmx->guest_msrs) { - goto uninit_vcpu; - } + if (!vmx->guest_msrs) + goto free_pml; vmx->loaded_vmcs = &vmx->vmcs01; vmx->loaded_vmcs->vmcs = alloc_vmcs(); @@ -9044,17 +9120,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->nested.current_vmptr = -1ull; vmx->nested.current_vmcs12 = NULL; - /* - * If PML is turned on, failure on enabling PML just results in failure - * of creating the vcpu, therefore we can simplify PML logic (by - * avoiding dealing with cases, such as enabling PML partially on vcpus - * for the guest, etc. - */ - if (enable_pml) { - err = vmx_create_pml_buffer(vmx); - if (err) - goto free_vmcs; - } + vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED; return &vmx->vcpu; @@ -9063,6 +9129,8 @@ free_vmcs: free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); +free_pml: + vmx_destroy_pml_buffer(vmx); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: @@ -9195,14 +9263,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (cpu_has_secondary_exec_ctrls()) vmcs_set_secondary_exec_control(secondary_exec_ctl); - if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { - if (guest_cpuid_has_pcommit(vcpu)) - vmx->nested.nested_vmx_secondary_ctls_high |= - SECONDARY_EXEC_PCOMMIT; - else - vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_PCOMMIT; - } + if (nested_vmx_allowed(vcpu)) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9759,9 +9825,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; - exec_control |= vmcs_config.pin_based_exec_ctrl; + + /* Preemption timer setting is only taken from vmcs01. */ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control |= vmcs_config.pin_based_exec_ctrl; + if (vmx->hv_deadline_tsc == -1) + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { /* * Note that we use L0's vector here and in @@ -9815,8 +9886,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT | - SECONDARY_EXEC_PCOMMIT); + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -10680,8 +10750,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); - vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); + vm_entry_controls_reset_shadow(vmx); + vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); /* if no vmcs02 cache requested, remove the one we used */ @@ -10690,8 +10760,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); - /* Update TSC_OFFSET if TSC was changed while L2 ran */ + /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (vmx->hv_deadline_tsc == -1) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + else + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); /* This is needed for same reason as it was needed in prepare_vmcs02 */ vmx->host_rsp = 0; @@ -10793,9 +10869,9 @@ static inline int u64_shl_div_u64(u64 a, unsigned int shift, static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tscl = rdtsc(), delta_tsc; - - delta_tsc = guest_deadline_tsc - kvm_read_l1_tsc(vcpu, tscl); + u64 tscl = rdtsc(); + u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl); + u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; /* Convert to host delta tsc if tsc scaling is enabled */ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio && @@ -10881,7 +10957,8 @@ static int pi_pre_block(struct kvm_vcpu *vcpu) struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) return 0; vcpu->pre_pcpu = vcpu->cpu; @@ -10958,7 +11035,8 @@ static void pi_post_block(struct kvm_vcpu *vcpu) unsigned long flags; if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(vcpu)) return; do { @@ -11019,7 +11097,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, int idx, ret = -EINVAL; if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + !irq_remapping_cap(IRQ_POSTING_CAP) || + !kvm_vcpu_apicv_active(kvm->vcpus[0])) return 0; idx = srcu_read_lock(&kvm->irq_srcu); @@ -11042,7 +11121,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, * We will support full lowest-priority interrupt later. */ - kvm_set_msi_irq(e, &irq); + kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { /* * Make sure the IRTE is in remapped mode if @@ -11087,6 +11166,16 @@ out: return ret; } +static void vmx_setup_mce(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mcg_cap & MCG_LMCE_P) + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_LMCE; + else + to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= + ~FEATURE_CONTROL_LMCE; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -11216,6 +11305,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_hv_timer = vmx_set_hv_timer, .cancel_hv_timer = vmx_cancel_hv_timer, #endif + + .setup_mce = vmx_setup_mce, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 299219630c94..19f9f9e05c2a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -36,7 +36,8 @@ #include <linux/kvm.h> #include <linux/fs.h> #include <linux/vmalloc.h> -#include <linux/module.h> +#include <linux/export.h> +#include <linux/moduleparam.h> #include <linux/mman.h> #include <linux/highmem.h> #include <linux/iommu.h> @@ -55,9 +56,6 @@ #include <linux/irqbypass.h> #include <trace/events/kvm.h> -#define CREATE_TRACE_POINTS -#include "trace.h" - #include <asm/debugreg.h> #include <asm/msr.h> #include <asm/desc.h> @@ -68,9 +66,13 @@ #include <asm/div64.h> #include <asm/irq_remapping.h> +#define CREATE_TRACE_POINTS +#include "trace.h" + #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; +EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) @@ -89,6 +91,9 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ + KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); @@ -539,7 +544,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) goto out; } for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { - if (is_present_gpte(pdpte[i]) && + if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) { ret = 0; @@ -984,6 +989,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, }; @@ -1163,7 +1169,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) int version; int r; struct pvclock_wall_clock wc; - struct timespec boot; + struct timespec64 boot; if (!wall_clock) return; @@ -1186,13 +1192,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - getboottime(&boot); + getboottime64(&boot); if (kvm->arch.kvmclock_offset) { - struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); - boot = timespec_sub(boot, ts); + struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); + boot = timespec64_sub(boot, ts); } - wc.sec = boot.tv_sec; + wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ wc.nsec = boot.tv_nsec; wc.version = version; @@ -1246,12 +1252,6 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); static unsigned long max_tsc_khz; -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) -{ - return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, - vcpu->arch.virtual_tsc_shift); -} - static u32 adjust_tsc_khz(u32 khz, s32 ppm) { u64 v = (u64)khz * (1000000 + ppm); @@ -2623,6 +2623,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; + case KVM_CAP_X2APIC_API: + r = KVM_X2APIC_API_VALID_FLAGS; + break; default: r = 0; break; @@ -2685,11 +2688,9 @@ long kvm_arch_dev_ioctl(struct file *filp, break; } case KVM_X86_GET_MCE_CAP_SUPPORTED: { - u64 mce_cap; - - mce_cap = KVM_MCE_CAP_SUPPORTED; r = -EFAULT; - if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + if (copy_to_user(argp, &kvm_mce_cap_supported, + sizeof(kvm_mce_cap_supported))) goto out; r = 0; break; @@ -2779,15 +2780,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); - memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - - return 0; + return kvm_apic_get_state(vcpu, s); } static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - kvm_apic_post_state_restore(vcpu, s); + int r; + + r = kvm_apic_set_state(vcpu, s); + if (r) + return r; update_cr8_intercept(vcpu); return 0; @@ -2872,7 +2875,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; @@ -2882,6 +2885,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, /* Init IA32_MCi_CTL to all 1s */ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; + + if (kvm_x86_ops->setup_mce) + kvm_x86_ops->setup_mce(vcpu); out: return r; } @@ -3794,6 +3800,18 @@ split_irqchip_unlock: mutex_unlock(&kvm->lock); break; } + case KVM_CAP_X2APIC_API: + r = -EINVAL; + if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) + break; + + if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) + kvm->arch.x2apic_format = true; + if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) + kvm->arch.x2apic_broadcast_quirk_disabled = true; + + r = 0; + break; default: r = -EINVAL; break; @@ -5560,9 +5578,10 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); -static void tsc_bad(void *info) +static int kvmclock_cpu_down_prep(unsigned int cpu) { __this_cpu_write(cpu_tsc_khz, 0); + return 0; } static void tsc_khz_changed(void *data) @@ -5667,35 +5686,18 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = { .notifier_call = kvmclock_cpufreq_notifier }; -static int kvmclock_cpu_notifier(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int kvmclock_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, tsc_bad, NULL, 1); - break; - } - return NOTIFY_OK; + tsc_khz_changed(NULL); + return 0; } -static struct notifier_block kvmclock_cpu_notifier_block = { - .notifier_call = kvmclock_cpu_notifier, - .priority = -INT_MAX -}; - static void kvm_timer_init(void) { int cpu; max_tsc_khz = tsc_khz; - cpu_notifier_register_begin(); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ struct cpufreq_policy policy; @@ -5710,12 +5712,9 @@ static void kvm_timer_init(void) CPUFREQ_TRANSITION_NOTIFIER); } pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); - for_each_online_cpu(cpu) - smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); - - __register_hotcpu_notifier(&kvmclock_cpu_notifier_block); - cpu_notifier_register_done(); + cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "AP_X86_KVM_CLK_ONLINE", + kvmclock_cpu_online, kvmclock_cpu_down_prep); } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -5875,8 +5874,8 @@ int kvm_arch_init(void *opaque) kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); - + PT_DIRTY_MASK, PT64_NX_MASK, 0, + PT_PRESENT_MASK); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -5904,7 +5903,7 @@ void kvm_arch_exit(void) if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); + cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); #endif @@ -6655,7 +6654,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) trace_kvm_entry(vcpu->vcpu_id); wait_lapic_expire(vcpu); - __kvm_guest_enter(); + guest_enter_irqoff(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -6706,16 +6705,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; - /* - * We must have an instruction between local_irq_enable() and - * kvm_guest_exit(), so the timer interrupt isn't delayed by - * the interrupt shadow. The stat.exits increment will do nicely. - * But we need to prevent reordering, hence this barrier(): - */ - barrier(); - - kvm_guest_exit(); + guest_exit_irqoff(); + local_irq_enable(); preempt_enable(); vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); @@ -7911,7 +7903,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); - kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kvm_mmu_uninit_vm(kvm); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 7ce3634ab5fe..a82ca466b62e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -2,6 +2,7 @@ #define ARCH_X86_KVM_X86_H #include <linux/kvm_host.h> +#include <asm/pvclock.h> #include "kvm_cache_regs.h" #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL @@ -195,6 +196,12 @@ extern unsigned int lapic_timer_advance_ns; extern struct static_key kvm_no_apic_vcpu; +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) +{ + return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); +} + /* Same "calling convention" as do_div: * - divide (n << 32) by base * - put result in n |