summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c408
1 files changed, 232 insertions, 176 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c805cf494154..6c633de84dd7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -36,7 +36,8 @@
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
-#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/moduleparam.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
@@ -55,9 +56,6 @@
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
@@ -68,9 +66,13 @@
#include <asm/div64.h>
#include <asm/irq_remapping.h>
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
#define emul_to_vcpu(ctxt) \
container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
@@ -89,8 +91,12 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
+static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
struct kvm_x86_ops *kvm_x86_ops __read_mostly;
@@ -113,7 +119,8 @@ u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
u64 __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -537,7 +544,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if (is_present_gpte(pdpte[i]) &&
+ if ((pdpte[i] & PT_PRESENT_MASK) &&
(pdpte[i] &
vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
ret = 0;
@@ -982,6 +989,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
MSR_IA32_SMBASE,
};
@@ -1161,7 +1169,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
int version;
int r;
struct pvclock_wall_clock wc;
- struct timespec boot;
+ struct timespec64 boot;
if (!wall_clock)
return;
@@ -1184,13 +1192,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
- getboottime(&boot);
+ getboottime64(&boot);
if (kvm->arch.kvmclock_offset) {
- struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
- boot = timespec_sub(boot, ts);
+ struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+ boot = timespec64_sub(boot, ts);
}
- wc.sec = boot.tv_sec;
+ wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
wc.nsec = boot.tv_nsec;
wc.version = version;
@@ -1244,12 +1252,6 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
static unsigned long max_tsc_khz;
-static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
-{
- return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
- vcpu->arch.virtual_tsc_shift);
-}
-
static u32 adjust_tsc_khz(u32 khz, s32 ppm)
{
u64 v = (u64)khz * (1000000 + ppm);
@@ -1365,7 +1367,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
{
- u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+ u64 curr_offset = vcpu->arch.tsc_offset;
vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
@@ -1411,6 +1413,12 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+ kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ vcpu->arch.tsc_offset = offset;
+}
+
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
@@ -1423,7 +1431,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
- ns = get_kernel_ns();
+ ns = ktime_get_boot_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
@@ -1520,7 +1528,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
update_ia32_tsc_adjust_msr(vcpu, offset);
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1714,6 +1722,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
#endif
}
+static u64 __get_kvmclock_ns(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+ struct kvm_arch *ka = &kvm->arch;
+ s64 ns;
+
+ if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+ u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+ } else {
+ ns = ktime_get_boot_ns() + ka->kvmclock_offset;
+ }
+
+ return ns;
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+ unsigned long flags;
+ s64 ns;
+
+ local_irq_save(flags);
+ ns = __get_kvmclock_ns(kvm);
+ local_irq_restore(flags);
+
+ return ns;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct pvclock_vcpu_time_info guest_hv_clock;
+
+ if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ &guest_hv_clock, sizeof(guest_hv_clock))))
+ return;
+
+ /* This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ *
+ * Version field updates must be kept separate. This is because
+ * kvm_write_guest_cached might use a "rep movs" instruction, and
+ * writes within a string instruction are weakly ordered. So there
+ * are three writes overall.
+ *
+ * As a small optimization, only write the version field in the first
+ * and third write. The vcpu->pv_time cache is still valid, because the
+ * version field is the first in the struct.
+ */
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ vcpu->hv_clock.version = guest_hv_clock.version + 1;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+
+ smp_wmb();
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+
+ smp_wmb();
+
+ vcpu->hv_clock.version++;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+}
+
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
unsigned long flags, tgt_tsc_khz;
@@ -1721,7 +1811,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
struct kvm_arch *ka = &v->kvm->arch;
s64 kernel_ns;
u64 tsc_timestamp, host_tsc;
- struct pvclock_vcpu_time_info guest_hv_clock;
u8 pvclock_flags;
bool use_master_clock;
@@ -1750,7 +1839,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
}
if (!use_master_clock) {
host_tsc = rdtsc();
- kernel_ns = get_kernel_ns();
+ kernel_ns = ktime_get_boot_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -1775,8 +1864,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_restore(flags);
- if (!vcpu->pv_time_enabled)
- return 0;
+ /* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
@@ -1788,64 +1876,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hw_tsc_khz = tgt_tsc_khz;
}
- /* With all the info we got, fill in the values */
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
- &guest_hv_clock, sizeof(guest_hv_clock))))
- return 0;
-
- /* This VCPU is paused, but it's legal for a guest to read another
- * VCPU's kvmclock, so we really have to follow the specification where
- * it says that version is odd if data is being modified, and even after
- * it is consistent.
- *
- * Version field updates must be kept separate. This is because
- * kvm_write_guest_cached might use a "rep movs" instruction, and
- * writes within a string instruction are weakly ordered. So there
- * are three writes overall.
- *
- * As a small optimization, only write the version field in the first
- * and third write. The vcpu->pv_time cache is still valid, because the
- * version field is the first in the struct.
- */
- BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
- vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
-
- smp_wmb();
-
- /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
- pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
- if (vcpu->pvclock_set_guest_stopped_request) {
- pvclock_flags |= PVCLOCK_GUEST_STOPPED;
- vcpu->pvclock_set_guest_stopped_request = false;
- }
-
/* If the host uses TSC clocksource, then it is stable */
+ pvclock_flags = 0;
if (use_master_clock)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
vcpu->hv_clock.flags = pvclock_flags;
- trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
-
- smp_wmb();
-
- vcpu->hv_clock.version++;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ if (vcpu->pv_time_enabled)
+ kvm_setup_pvclock_page(v);
+ if (v == kvm_get_vcpu(v->kvm, 0))
+ kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -2314,6 +2359,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
case MSR_AMD64_BU_CFG2:
+ case MSR_IA32_PERF_CTL:
msr_info->data = 0;
break;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
@@ -2620,6 +2666,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
+ case KVM_CAP_X2APIC_API:
+ r = KVM_X2APIC_API_VALID_FLAGS;
+ break;
default:
r = 0;
break;
@@ -2682,11 +2731,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
break;
}
case KVM_X86_GET_MCE_CAP_SUPPORTED: {
- u64 mce_cap;
-
- mce_cap = KVM_MCE_CAP_SUPPORTED;
r = -EFAULT;
- if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+ if (copy_to_user(argp, &kvm_mce_cap_supported,
+ sizeof(kvm_mce_cap_supported)))
goto out;
r = 0;
break;
@@ -2738,12 +2785,17 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdtsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
+
if (check_tsc_unstable()) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
- kvm_x86_ops->write_tsc_offset(vcpu, offset);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
+ if (kvm_lapic_hv_timer_in_use(vcpu) &&
+ kvm_x86_ops->set_hv_timer(vcpu,
+ kvm_get_lapic_tscdeadline_msr(vcpu)))
+ kvm_lapic_switch_to_sw_timer(vcpu);
/*
* On a host with synchronized TSC, there is no need to update
* kvmclock on vcpu->cpu migration
@@ -2771,15 +2823,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
- memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-
- return 0;
+ return kvm_apic_get_state(vcpu, s);
}
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- kvm_apic_post_state_restore(vcpu, s);
+ int r;
+
+ r = kvm_apic_set_state(vcpu, s);
+ if (r)
+ return r;
update_cr8_intercept(vcpu);
return 0;
@@ -2864,7 +2918,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
- if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+ if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
goto out;
r = 0;
vcpu->arch.mcg_cap = mcg_cap;
@@ -2874,6 +2928,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
/* Init IA32_MCi_CTL to all 1s */
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+ if (kvm_x86_ops->setup_mce)
+ kvm_x86_ops->setup_mce(vcpu);
out:
return r;
}
@@ -2972,6 +3029,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
| KVM_VCPUEVENT_VALID_SMM))
return -EINVAL;
+ if (events->exception.injected &&
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
+ return -EINVAL;
+
process_nmi(vcpu);
vcpu->arch.exception.pending = events->exception.injected;
vcpu->arch.exception.nr = events->exception.nr;
@@ -3036,6 +3097,11 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (dbgregs->flags)
return -EINVAL;
+ if (dbgregs->dr6 & ~0xffffffffull)
+ return -EINVAL;
+ if (dbgregs->dr7 & ~0xffffffffull)
+ return -EINVAL;
+
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = dbgregs->dr6;
@@ -3763,7 +3829,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = -EEXIST;
if (irqchip_in_kernel(kvm))
goto split_irqchip_unlock;
- if (atomic_read(&kvm->online_vcpus))
+ if (kvm->created_vcpus)
goto split_irqchip_unlock;
r = kvm_setup_empty_irq_routing(kvm);
if (r)
@@ -3777,6 +3843,18 @@ split_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
}
+ case KVM_CAP_X2APIC_API:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+ break;
+
+ if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+ kvm->arch.x2apic_format = true;
+ if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+ kvm->arch.x2apic_broadcast_quirk_disabled = true;
+
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -3828,7 +3906,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpic)
goto create_irqchip_unlock;
r = -EINVAL;
- if (atomic_read(&kvm->online_vcpus))
+ if (kvm->created_vcpus)
goto create_irqchip_unlock;
r = -ENOMEM;
vpic = kvm_create_pic(kvm);
@@ -3868,7 +3946,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
sizeof(struct kvm_pit_config)))
goto out;
create_pit:
- mutex_lock(&kvm->slots_lock);
+ mutex_lock(&kvm->lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
@@ -3877,7 +3955,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
- mutex_unlock(&kvm->slots_lock);
+ mutex_unlock(&kvm->lock);
break;
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
@@ -3984,7 +4062,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) != 0)
+ if (kvm->created_vcpus)
r = -EBUSY;
else
kvm->arch.bsp_vcpu_id = arg;
@@ -4004,7 +4082,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_CLOCK: {
struct kvm_clock_data user_ns;
u64 now_ns;
- s64 delta;
r = -EFAULT;
if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
@@ -4016,10 +4093,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
local_irq_disable();
- now_ns = get_kernel_ns();
- delta = user_ns.clock - now_ns;
+ now_ns = __get_kvmclock_ns(kvm);
+ kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
local_irq_enable();
- kvm->arch.kvmclock_offset = delta;
kvm_gen_update_masterclock(kvm);
break;
}
@@ -4027,10 +4103,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
struct kvm_clock_data user_ns;
u64 now_ns;
- local_irq_disable();
- now_ns = get_kernel_ns();
- user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
- local_irq_enable();
+ now_ns = get_kvmclock_ns(kvm);
+ user_ns.clock = now_ns;
user_ns.flags = 0;
memset(&user_ns.pad, 0, sizeof(user_ns.pad));
@@ -5292,13 +5366,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu)
/* This is a good place to trace that we are exiting SMM. */
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
- if (unlikely(vcpu->arch.smi_pending)) {
- kvm_make_request(KVM_REQ_SMI, vcpu);
- vcpu->arch.smi_pending = 0;
- } else {
- /* Process a latched INIT, if any. */
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
kvm_mmu_reset_context(vcpu);
@@ -5548,9 +5617,10 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
}
EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
-static void tsc_bad(void *info)
+static int kvmclock_cpu_down_prep(unsigned int cpu)
{
__this_cpu_write(cpu_tsc_khz, 0);
+ return 0;
}
static void tsc_khz_changed(void *data)
@@ -5655,35 +5725,18 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
.notifier_call = kvmclock_cpufreq_notifier
};
-static int kvmclock_cpu_notifier(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
+static int kvmclock_cpu_online(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
-
- switch (action) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, tsc_bad, NULL, 1);
- break;
- }
- return NOTIFY_OK;
+ tsc_khz_changed(NULL);
+ return 0;
}
-static struct notifier_block kvmclock_cpu_notifier_block = {
- .notifier_call = kvmclock_cpu_notifier,
- .priority = -INT_MAX
-};
-
static void kvm_timer_init(void)
{
int cpu;
max_tsc_khz = tsc_khz;
- cpu_notifier_register_begin();
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy policy;
@@ -5698,12 +5751,9 @@ static void kvm_timer_init(void)
CPUFREQ_TRANSITION_NOTIFIER);
}
pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
- for_each_online_cpu(cpu)
- smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
-
- __register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
- cpu_notifier_register_done();
+ cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "AP_X86_KVM_CLK_ONLINE",
+ kvmclock_cpu_online, kvmclock_cpu_down_prep);
}
static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -5863,8 +5913,8 @@ int kvm_arch_init(void *opaque)
kvm_x86_ops = ops;
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0);
-
+ PT_DIRTY_MASK, PT64_NX_MASK, 0,
+ PT_PRESENT_MASK);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -5892,7 +5942,7 @@ void kvm_arch_exit(void)
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
- unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+ cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
#endif
@@ -6098,7 +6148,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
/* try to inject new event if pending */
- if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+ if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ vcpu->arch.smi_pending = false;
+ enter_smm(vcpu);
+ } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
@@ -6121,6 +6174,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_x86_ops->set_irq(vcpu);
}
}
+
return 0;
}
@@ -6144,7 +6198,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
#define put_smstate(type, buf, offset, val) \
*(type *)((buf) + (offset) - 0x7e00) = val
-static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
{
u32 flags = 0;
flags |= seg->g << 23;
@@ -6158,7 +6212,7 @@ static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
return flags;
}
-static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
@@ -6173,11 +6227,11 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
put_smstate(u32, buf, offset + 8, seg.base);
put_smstate(u32, buf, offset + 4, seg.limit);
- put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
}
#ifdef CONFIG_X86_64
-static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
@@ -6186,7 +6240,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
kvm_get_segment(vcpu, &seg, n);
offset = 0x7e00 + n * 16;
- flags = process_smi_get_segment_flags(&seg) >> 8;
+ flags = enter_smm_get_segment_flags(&seg) >> 8;
put_smstate(u16, buf, offset, seg.selector);
put_smstate(u16, buf, offset + 2, flags);
put_smstate(u32, buf, offset + 4, seg.limit);
@@ -6194,7 +6248,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
}
#endif
-static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
{
struct desc_ptr dt;
struct kvm_segment seg;
@@ -6218,13 +6272,13 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7fc4, seg.selector);
put_smstate(u32, buf, 0x7f64, seg.base);
put_smstate(u32, buf, 0x7f60, seg.limit);
- put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u32, buf, 0x7fc0, seg.selector);
put_smstate(u32, buf, 0x7f80, seg.base);
put_smstate(u32, buf, 0x7f7c, seg.limit);
- put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
kvm_x86_ops->get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
@@ -6235,7 +6289,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7f54, dt.size);
for (i = 0; i < 6; i++)
- process_smi_save_seg_32(vcpu, buf, i);
+ enter_smm_save_seg_32(vcpu, buf, i);
put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
@@ -6244,7 +6298,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
-static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
#ifdef CONFIG_X86_64
struct desc_ptr dt;
@@ -6276,7 +6330,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
put_smstate(u16, buf, 0x7e90, seg.selector);
- put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
@@ -6286,7 +6340,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u16, buf, 0x7e70, seg.selector);
- put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
@@ -6295,31 +6349,26 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u64, buf, 0x7e68, dt.address);
for (i = 0; i < 6; i++)
- process_smi_save_seg_64(vcpu, buf, i);
+ enter_smm_save_seg_64(vcpu, buf, i);
#else
WARN_ON_ONCE(1);
#endif
}
-static void process_smi(struct kvm_vcpu *vcpu)
+static void enter_smm(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ds;
struct desc_ptr dt;
char buf[512];
u32 cr0;
- if (is_smm(vcpu)) {
- vcpu->arch.smi_pending = true;
- return;
- }
-
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
vcpu->arch.hflags |= HF_SMM_MASK;
memset(buf, 0, 512);
if (guest_cpuid_has_longmode(vcpu))
- process_smi_save_state_64(vcpu, buf);
+ enter_smm_save_state_64(vcpu, buf);
else
- process_smi_save_state_32(vcpu, buf);
+ enter_smm_save_state_32(vcpu, buf);
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
@@ -6375,6 +6424,12 @@ static void process_smi(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
}
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@ -6569,8 +6624,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
- /* enable NMI/IRQ window open exits if needed */
else {
+ /* Enable NMI/IRQ window open exits if needed.
+ *
+ * SMIs have two cases: 1) they can be nested, and
+ * then there is nothing to do here because RSM will
+ * cause a vmexit anyway; 2) or the SMI can be pending
+ * because inject_pending_event has completed the
+ * injection of an IRQ or NMI from the previous vmexit,
+ * and then we request an immediate exit to inject the SMI.
+ */
+ if (vcpu->arch.smi_pending && !is_smm(vcpu))
+ req_immediate_exit = true;
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -6621,12 +6686,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_load_guest_xcr0(vcpu);
- if (req_immediate_exit)
+ if (req_immediate_exit) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
smp_send_reschedule(vcpu->cpu);
+ }
trace_kvm_entry(vcpu->vcpu_id);
wait_lapic_expire(vcpu);
- __kvm_guest_enter();
+ guest_enter_irqoff();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -6672,21 +6739,13 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_put_guest_xcr0(vcpu);
- /* Interrupt is enabled by handle_external_intr() */
kvm_x86_ops->handle_external_intr(vcpu);
++vcpu->stat.exits;
- /*
- * We must have an instruction between local_irq_enable() and
- * kvm_guest_exit(), so the timer interrupt isn't delayed by
- * the interrupt shadow. The stat.exits increment will do nicely.
- * But we need to prevent reordering, hence this barrier():
- */
- barrier();
-
- kvm_guest_exit();
+ guest_exit_irqoff();
+ local_irq_enable();
preempt_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -7423,6 +7482,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
vcpu->arch.hflags = 0;
+ vcpu->arch.smi_pending = 0;
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
@@ -7508,7 +7568,7 @@ int kvm_arch_hardware_enable(void)
* before any KVM threads can be running. Unfortunately, we can't
* bring the TSCs fully up to date with real time, as we aren't yet far
* enough into CPU bringup that we know how much real time has actually
- * elapsed; our helper function, get_kernel_ns() will be using boot
+ * elapsed; our helper function, ktime_get_boot_ns() will be using boot
* variables that haven't been updated yet.
*
* So we simply find the maximum observed TSC above, then record the
@@ -7615,11 +7675,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
-{
- return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
-}
-
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
@@ -7748,6 +7803,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
mutex_init(&kvm->arch.apic_map_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+ kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
pvclock_update_vm_gtod_copy(kvm);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
@@ -7815,7 +7871,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
slot = id_to_memslot(slots, id);
if (size) {
- if (WARN_ON(slot->npages))
+ if (slot->npages)
return -EEXIST;
/*
@@ -7886,7 +7942,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
- kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kvm_mmu_uninit_vm(kvm);
}
@@ -8394,7 +8450,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
/*
* When producer of consumer is unregistered, we change back to
* remapped mode, so we can re-use the current implementation
- * when the irq is masked/disabed or the consumer side (KVM
+ * when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
OpenPOWER on IntegriCloud