summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c49
-rw-r--r--arch/x86/kvm/cpuid.h9
-rw-r--r--arch/x86/kvm/emulate.c546
-rw-r--r--arch/x86/kvm/i8254.c31
-rw-r--r--arch/x86/kvm/i8254.h7
-rw-r--r--arch/x86/kvm/i8259.c17
-rw-r--r--arch/x86/kvm/lapic.c225
-rw-r--r--arch/x86/kvm/lapic.h11
-rw-r--r--arch/x86/kvm/mmu.c658
-rw-r--r--arch/x86/kvm/mmu_audit.c10
-rw-r--r--arch/x86/kvm/mmutrace.h45
-rw-r--r--arch/x86/kvm/paging_tmpl.h5
-rw-r--r--arch/x86/kvm/pmu.c22
-rw-r--r--arch/x86/kvm/svm.c21
-rw-r--r--arch/x86/kvm/trace.h46
-rw-r--r--arch/x86/kvm/vmx.c230
-rw-r--r--arch/x86/kvm/x86.c399
-rw-r--r--arch/x86/kvm/x86.h2
19 files changed, 1658 insertions, 676 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1a7fe868f375..a28f338843ea 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM
select TASKSTATS
select TASK_DELAY_ACCT
select PERF_EVENTS
+ select HAVE_KVM_MSI
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 9fed5bedaad6..0595f1397b7c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_lm = 0;
#endif
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
+ unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
/* cpuid 1.edx */
const u32 kvm_supported_word0_x86_features =
@@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
- 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
F(F16C) | F(RDRAND);
@@ -247,7 +248,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.ebx */
const u32 kvm_supported_word9_x86_features =
- F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
+ F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
+ F(BMI2) | F(ERMS) | f_invpcid | F(RTM);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -397,7 +399,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
case KVM_CPUID_SIGNATURE: {
char signature[12] = "KVMKVMKVM\0\0";
u32 *sigptr = (u32 *)signature;
- entry->eax = 0;
+ entry->eax = KVM_CPUID_FEATURES;
entry->ebx = sigptr[0];
entry->ecx = sigptr[1];
entry->edx = sigptr[2];
@@ -408,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
(1 << KVM_FEATURE_NOP_IO_DELAY) |
(1 << KVM_FEATURE_CLOCKSOURCE2) |
(1 << KVM_FEATURE_ASYNC_PF) |
+ (1 << KVM_FEATURE_PV_EOI) |
(1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
if (sched_info_on())
@@ -638,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
}
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
- u32 function, index;
+ u32 function = *eax, index = *ecx;
struct kvm_cpuid_entry2 *best;
- function = kvm_register_read(vcpu, VCPU_REGS_RAX);
- index = kvm_register_read(vcpu, VCPU_REGS_RCX);
- kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
best = kvm_find_cpuid_entry(vcpu, function, index);
if (!best)
best = check_cpuid_limit(vcpu, function, index);
if (best) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
- kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
- kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
- }
+ *eax = best->eax;
+ *ebx = best->ebx;
+ *ecx = best->ecx;
+ *edx = best->edx;
+ } else
+ *eax = *ebx = *ecx = *edx = 0;
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 function, eax, ebx, ecx, edx;
+
+ function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
kvm_x86_ops->skip_emulated_instruction(vcpu);
- trace_kvm_cpuid(function,
- kvm_register_read(vcpu, VCPU_REGS_RAX),
- kvm_register_read(vcpu, VCPU_REGS_RBX),
- kvm_register_read(vcpu, VCPU_REGS_RCX),
- kvm_register_read(vcpu, VCPU_REGS_RDX));
+ trace_kvm_cpuid(function, eax, ebx, ecx, edx);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 26d1fb437eb5..a10e46016851 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
+void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
@@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_OSVW));
}
+static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->ecx & bit(X86_FEATURE_PCID));
+}
+
#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 83756223f8aa..97d9a9914ba8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -142,6 +142,10 @@
#define Src2FS (OpFS << Src2Shift)
#define Src2GS (OpGS << Src2Shift)
#define Src2Mask (OpMask << Src2Shift)
+#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
+#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
+#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */
+#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
#define X2(x...) x, x
#define X3(x...) X2(x), x
@@ -429,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
return ctxt->ops->intercept(ctxt, &info, stage);
}
+static void assign_masked(ulong *dest, ulong src, ulong mask)
+{
+ *dest = (*dest & ~mask) | (src & mask);
+}
+
static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
{
return (1UL << (ctxt->ad_bytes << 3)) - 1;
}
+static ulong stack_mask(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel;
+ struct desc_struct ss;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return ~0UL;
+ ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS);
+ return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */
+}
+
+static int stack_size(struct x86_emulate_ctxt *ctxt)
+{
+ return (__fls(stack_mask(ctxt)) + 1) >> 3;
+}
+
/* Access/update address held in a register, based on addressing mode. */
static inline unsigned long
address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
@@ -557,6 +582,29 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
}
+/*
+ * x86 defines three classes of vector instructions: explicitly
+ * aligned, explicitly unaligned, and the rest, which change behaviour
+ * depending on whether they're AVX encoded or not.
+ *
+ * Also included is CMPXCHG16B which is not a vector instruction, yet it is
+ * subject to the same check.
+ */
+static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
+{
+ if (likely(size < 16))
+ return false;
+
+ if (ctxt->d & Aligned)
+ return true;
+ else if (ctxt->d & Unaligned)
+ return false;
+ else if (ctxt->d & Avx)
+ return false;
+ else
+ return true;
+}
+
static int __linearize(struct x86_emulate_ctxt *ctxt,
struct segmented_address addr,
unsigned size, bool write, bool fetch,
@@ -621,6 +669,8 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
}
if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
la &= (u32)-1;
+ if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
+ return emulate_gp(ctxt, 0);
*linear = la;
return X86EMUL_CONTINUE;
bad:
@@ -859,6 +909,40 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
ctxt->ops->put_fpu(ctxt);
}
+static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+ ctxt->ops->get_fpu(ctxt);
+ switch (reg) {
+ case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
+ case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
+ case 2: asm("movq %%mm2, %0" : "=m"(*data)); break;
+ case 3: asm("movq %%mm3, %0" : "=m"(*data)); break;
+ case 4: asm("movq %%mm4, %0" : "=m"(*data)); break;
+ case 5: asm("movq %%mm5, %0" : "=m"(*data)); break;
+ case 6: asm("movq %%mm6, %0" : "=m"(*data)); break;
+ case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
+ default: BUG();
+ }
+ ctxt->ops->put_fpu(ctxt);
+}
+
+static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+{
+ ctxt->ops->get_fpu(ctxt);
+ switch (reg) {
+ case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
+ case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
+ case 2: asm("movq %0, %%mm2" : : "m"(*data)); break;
+ case 3: asm("movq %0, %%mm3" : : "m"(*data)); break;
+ case 4: asm("movq %0, %%mm4" : : "m"(*data)); break;
+ case 5: asm("movq %0, %%mm5" : : "m"(*data)); break;
+ case 6: asm("movq %0, %%mm6" : : "m"(*data)); break;
+ case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
+ default: BUG();
+ }
+ ctxt->ops->put_fpu(ctxt);
+}
+
static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
@@ -875,6 +959,13 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
read_sse_reg(ctxt, &op->vec_val, reg);
return;
}
+ if (ctxt->d & Mmx) {
+ reg &= 7;
+ op->type = OP_MM;
+ op->bytes = 8;
+ op->addr.mm = reg;
+ return;
+ }
op->type = OP_REG;
if (ctxt->d & ByteOp) {
@@ -888,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
op->orig_val = op->val;
}
+static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg)
+{
+ if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP)
+ ctxt->modrm_seg = VCPU_SREG_SS;
+}
+
static int decode_modrm(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
@@ -902,7 +999,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
}
- ctxt->modrm = insn_fetch(u8, ctxt);
ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -920,6 +1016,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
return rc;
}
+ if (ctxt->d & Mmx) {
+ op->type = OP_MM;
+ op->bytes = 8;
+ op->addr.xmm = ctxt->modrm_rm & 7;
+ return rc;
+ }
fetch_register_operand(op);
return rc;
}
@@ -986,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
modrm_ea += insn_fetch(s32, ctxt);
- else
+ else {
modrm_ea += ctxt->regs[base_reg];
+ adjust_modrm_seg(ctxt, base_reg);
+ }
if (index_reg != 4)
modrm_ea += ctxt->regs[index_reg] << scale;
} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
if (ctxt->mode == X86EMUL_MODE_PROT64)
ctxt->rip_relative = 1;
- } else
- modrm_ea += ctxt->regs[ctxt->modrm_rm];
+ } else {
+ base_reg = ctxt->modrm_rm;
+ modrm_ea += ctxt->regs[base_reg];
+ adjust_modrm_seg(ctxt, base_reg);
+ }
switch (ctxt->modrm_mod) {
case 0:
if (ctxt->modrm_rm == 5)
@@ -1189,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
/* allowed just for 8 bytes segments */
static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 selector, struct desc_struct *desc)
+ u16 selector, struct desc_struct *desc,
+ ulong *desc_addr_p)
{
struct desc_ptr dt;
u16 index = selector >> 3;
@@ -1200,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (dt.size < index * 8 + 7)
return emulate_gp(ctxt, selector & 0xfffc);
- addr = dt.address + index * 8;
+ *desc_addr_p = addr = dt.address + index * 8;
return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
&ctxt->exception);
}
@@ -1227,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg)
{
- struct desc_struct seg_desc;
+ struct desc_struct seg_desc, old_desc;
u8 dpl, rpl, cpl;
unsigned err_vec = GP_VECTOR;
u32 err_code = 0;
bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+ ulong desc_addr;
int ret;
memset(&seg_desc, 0, sizeof seg_desc);
@@ -1249,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto load;
}
- /* NULL selector is not valid for TR, CS and SS */
- if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+ rpl = selector & 3;
+ cpl = ctxt->ops->cpl(ctxt);
+
+ /* NULL selector is not valid for TR, CS and SS (except for long mode) */
+ if ((seg == VCPU_SREG_CS
+ || (seg == VCPU_SREG_SS
+ && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
+ || seg == VCPU_SREG_TR)
&& null_selector)
goto exception;
@@ -1261,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (null_selector) /* for NULL selector skip all following checks */
goto load;
- ret = read_segment_descriptor(ctxt, selector, &seg_desc);
+ ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -1277,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
}
- rpl = selector & 3;
dpl = seg_desc.dpl;
- cpl = ctxt->ops->cpl(ctxt);
switch (seg) {
case VCPU_SREG_SS:
@@ -1309,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_TR:
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
goto exception;
+ old_desc = seg_desc;
+ seg_desc.type |= 2; /* busy */
+ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+ sizeof(seg_desc), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
break;
case VCPU_SREG_LDTR:
if (seg_desc.s || seg_desc.type != 2)
@@ -1387,6 +1506,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
case OP_XMM:
write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
break;
+ case OP_MM:
+ write_mmx_reg(ctxt, &ctxt->dst.mm_val, ctxt->dst.addr.mm);
+ break;
case OP_NONE:
/* no writeback */
break;
@@ -1396,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_push(struct x86_emulate_ctxt *ctxt)
+static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes)
{
struct segmented_address addr;
- register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
+ register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes);
addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
addr.seg = VCPU_SREG_SS;
+ return segmented_write(ctxt, addr, data, bytes);
+}
+
+static int em_push(struct x86_emulate_ctxt *ctxt)
+{
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
- return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
+ return push(ctxt, &ctxt->src.val, ctxt->op_bytes);
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1478,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
}
+static int em_enter(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ unsigned frame_size = ctxt->src.val;
+ unsigned nesting_level = ctxt->src2.val & 31;
+
+ if (nesting_level)
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt));
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP],
+ stack_mask(ctxt));
+ assign_masked(&ctxt->regs[VCPU_REGS_RSP],
+ ctxt->regs[VCPU_REGS_RSP] - frame_size,
+ stack_mask(ctxt));
+ return X86EMUL_CONTINUE;
+}
+
+static int em_leave(struct x86_emulate_ctxt *ctxt)
+{
+ assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP],
+ stack_mask(ctxt));
+ return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes);
+}
+
static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
{
int seg = ctxt->src2.val;
@@ -1915,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
u32 eax, ebx, ecx, edx;
eax = ecx = 0;
- return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)
- && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx
&& ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx
&& edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx;
}
@@ -1935,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt)
eax = 0x00000000;
ecx = 0x00000000;
- if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) {
- /*
- * Intel ("GenuineIntel")
- * remark: Intel CPUs only support "syscall" in 64bit
- * longmode. Also an 64bit guest with a
- * 32bit compat-app running will #UD !! While this
- * behaviour can be fixed (by emulating) into AMD
- * response - CPUs of AMD can't behave like Intel.
- */
- if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
- ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
- edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
- return false;
+ ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ /*
+ * Intel ("GenuineIntel")
+ * remark: Intel CPUs only support "syscall" in 64bit
+ * longmode. Also an 64bit guest with a
+ * 32bit compat-app running will #UD !! While this
+ * behaviour can be fixed (by emulating) into AMD
+ * response - CPUs of AMD can't behave like Intel.
+ */
+ if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx)
+ return false;
- /* AMD ("AuthenticAMD") */
- if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
- ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
- edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
- return true;
-
- /* AMD ("AMDisbetter!") */
- if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
- ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
- edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
- return true;
- }
+ /* AMD ("AuthenticAMD") */
+ if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
+ return true;
+
+ /* AMD ("AMDisbetter!") */
+ if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx &&
+ ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx &&
+ edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx)
+ return true;
/* default: (not Intel, not AMD), apply Intel's stricter rules... */
return false;
@@ -2469,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ulong old_tss_base =
ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
u32 desc_limit;
+ ulong desc_addr;
/* FIXME: old_tss_base == ~0 ? */
- ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
+ ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
+ ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2790,7 +2944,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
static int em_mov(struct x86_emulate_ctxt *ctxt)
{
- ctxt->dst.val = ctxt->src.val;
+ memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes);
return X86EMUL_CONTINUE;
}
@@ -2870,10 +3024,22 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
}
-static int em_movdqu(struct x86_emulate_ctxt *ctxt)
+static int em_lldt(struct x86_emulate_ctxt *ctxt)
{
- memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
- return X86EMUL_CONTINUE;
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR);
+}
+
+static int em_ltr(struct x86_emulate_ctxt *ctxt)
+{
+ u16 sel = ctxt->src.val;
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR);
}
static int em_invlpg(struct x86_emulate_ctxt *ctxt)
@@ -2917,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
+ void (*get)(struct x86_emulate_ctxt *ctxt,
+ struct desc_ptr *ptr))
+{
+ struct desc_ptr desc_ptr;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
+ get(ctxt, &desc_ptr);
+ if (ctxt->op_bytes == 2) {
+ ctxt->op_bytes = 4;
+ desc_ptr.address &= 0x00ffffff;
+ }
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return segmented_write(ctxt, ctxt->dst.addr.mem,
+ &desc_ptr, 2 + ctxt->op_bytes);
+}
+
+static int em_sgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt);
+}
+
+static int em_sidt(struct x86_emulate_ctxt *ctxt)
+{
+ return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
+}
+
static int em_lgdt(struct x86_emulate_ctxt *ctxt)
{
struct desc_ptr desc_ptr;
int rc;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
rc = read_descriptor(ctxt, ctxt->src.addr.mem,
&desc_ptr.size, &desc_ptr.address,
ctxt->op_bytes);
@@ -2949,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt)
struct desc_ptr desc_ptr;
int rc;
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ctxt->op_bytes = 8;
rc = read_descriptor(ctxt, ctxt->src.addr.mem,
&desc_ptr.size, &desc_ptr.address,
ctxt->op_bytes);
@@ -3061,34 +3260,48 @@ static int em_btc(struct x86_emulate_ctxt *ctxt)
static int em_bsf(struct x86_emulate_ctxt *ctxt)
{
- u8 zf;
-
- __asm__ ("bsf %2, %0; setz %1"
- : "=r"(ctxt->dst.val), "=q"(zf)
- : "r"(ctxt->src.val));
-
- ctxt->eflags &= ~X86_EFLAGS_ZF;
- if (zf) {
- ctxt->eflags |= X86_EFLAGS_ZF;
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- }
+ emulate_2op_SrcV_nobyte(ctxt, "bsf");
return X86EMUL_CONTINUE;
}
static int em_bsr(struct x86_emulate_ctxt *ctxt)
{
- u8 zf;
+ emulate_2op_SrcV_nobyte(ctxt, "bsr");
+ return X86EMUL_CONTINUE;
+}
- __asm__ ("bsr %2, %0; setz %1"
- : "=r"(ctxt->dst.val), "=q"(zf)
- : "r"(ctxt->src.val));
+static int em_cpuid(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+
+ eax = ctxt->regs[VCPU_REGS_RAX];
+ ecx = ctxt->regs[VCPU_REGS_RCX];
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ ctxt->regs[VCPU_REGS_RAX] = eax;
+ ctxt->regs[VCPU_REGS_RBX] = ebx;
+ ctxt->regs[VCPU_REGS_RCX] = ecx;
+ ctxt->regs[VCPU_REGS_RDX] = edx;
+ return X86EMUL_CONTINUE;
+}
- ctxt->eflags &= ~X86_EFLAGS_ZF;
- if (zf) {
- ctxt->eflags |= X86_EFLAGS_ZF;
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
+static int em_lahf(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL;
+ ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_bswap(struct x86_emulate_ctxt *ctxt)
+{
+ switch (ctxt->op_bytes) {
+#ifdef CONFIG_X86_64
+ case 8:
+ asm("bswap %0" : "+r"(ctxt->dst.val));
+ break;
+#endif
+ default:
+ asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val));
+ break;
}
return X86EMUL_CONTINUE;
}
@@ -3286,8 +3499,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
.check_perm = (_p) }
#define N D(0)
#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
-#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
-#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
+#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
+#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
#define II(_f, _e, _i) \
{ .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
@@ -3307,25 +3520,25 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
static struct opcode group7_rm1[] = {
- DI(SrcNone | ModRM | Priv, monitor),
- DI(SrcNone | ModRM | Priv, mwait),
+ DI(SrcNone | Priv, monitor),
+ DI(SrcNone | Priv, mwait),
N, N, N, N, N, N,
};
static struct opcode group7_rm3[] = {
- DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa),
- II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
- DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa),
- DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa),
- DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme),
- DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme),
- DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme),
- DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
+ DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
+ II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall),
+ DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
+ DIP(SrcNone | Prot | Priv, stgi, check_svme),
+ DIP(SrcNone | Prot | Priv, clgi, check_svme),
+ DIP(SrcNone | Prot | Priv, skinit, check_svme),
+ DIP(SrcNone | Prot | Priv, invlpga, check_svme),
};
static struct opcode group7_rm7[] = {
N,
- DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
+ DIP(SrcNone, rdtscp, check_rdtsc),
N, N, N, N, N, N,
};
@@ -3341,81 +3554,86 @@ static struct opcode group1[] = {
};
static struct opcode group1A[] = {
- I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N,
+ I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
};
static struct opcode group3[] = {
- I(DstMem | SrcImm | ModRM, em_test),
- I(DstMem | SrcImm | ModRM, em_test),
- I(DstMem | SrcNone | ModRM | Lock, em_not),
- I(DstMem | SrcNone | ModRM | Lock, em_neg),
- I(SrcMem | ModRM, em_mul_ex),
- I(SrcMem | ModRM, em_imul_ex),
- I(SrcMem | ModRM, em_div_ex),
- I(SrcMem | ModRM, em_idiv_ex),
+ I(DstMem | SrcImm, em_test),
+ I(DstMem | SrcImm, em_test),
+ I(DstMem | SrcNone | Lock, em_not),
+ I(DstMem | SrcNone | Lock, em_neg),
+ I(SrcMem, em_mul_ex),
+ I(SrcMem, em_imul_ex),
+ I(SrcMem, em_div_ex),
+ I(SrcMem, em_idiv_ex),
};
static struct opcode group4[] = {
- I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
- I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
+ I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
+ I(ByteOp | DstMem | SrcNone | Lock, em_grp45),
N, N, N, N, N, N,
};
static struct opcode group5[] = {
- I(DstMem | SrcNone | ModRM | Lock, em_grp45),
- I(DstMem | SrcNone | ModRM | Lock, em_grp45),
- I(SrcMem | ModRM | Stack, em_grp45),
- I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
- I(SrcMem | ModRM | Stack, em_grp45),
- I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45),
- I(SrcMem | ModRM | Stack, em_grp45), N,
+ I(DstMem | SrcNone | Lock, em_grp45),
+ I(DstMem | SrcNone | Lock, em_grp45),
+ I(SrcMem | Stack, em_grp45),
+ I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
+ I(SrcMem | Stack, em_grp45),
+ I(SrcMemFAddr | ImplicitOps, em_grp45),
+ I(SrcMem | Stack, em_grp45), N,
};
static struct opcode group6[] = {
- DI(ModRM | Prot, sldt),
- DI(ModRM | Prot, str),
- DI(ModRM | Prot | Priv, lldt),
- DI(ModRM | Prot | Priv, ltr),
+ DI(Prot, sldt),
+ DI(Prot, str),
+ II(Prot | Priv | SrcMem16, em_lldt, lldt),
+ II(Prot | Priv | SrcMem16, em_ltr, ltr),
N, N, N, N,
};
static struct group_dual group7 = { {
- DI(ModRM | Mov | DstMem | Priv, sgdt),
- DI(ModRM | Mov | DstMem | Priv, sidt),
- II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
- II(ModRM | SrcMem | Priv, em_lidt, lidt),
- II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
- II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
- II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
+ II(Mov | DstMem | Priv, em_sgdt, sgdt),
+ II(Mov | DstMem | Priv, em_sidt, sidt),
+ II(SrcMem | Priv, em_lgdt, lgdt),
+ II(SrcMem | Priv, em_lidt, lidt),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
}, {
- I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
+ I(SrcNone | Priv | VendorSpecific, em_vmcall),
EXT(0, group7_rm1),
N, EXT(0, group7_rm3),
- II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
- II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
+ II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
+ II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
+ EXT(0, group7_rm7),
} };
static struct opcode group8[] = {
N, N, N, N,
- I(DstMem | SrcImmByte | ModRM, em_bt),
- I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts),
- I(DstMem | SrcImmByte | ModRM | Lock, em_btr),
- I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc),
+ I(DstMem | SrcImmByte, em_bt),
+ I(DstMem | SrcImmByte | Lock | PageTable, em_bts),
+ I(DstMem | SrcImmByte | Lock, em_btr),
+ I(DstMem | SrcImmByte | Lock | PageTable, em_btc),
};
static struct group_dual group9 = { {
- N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+ N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
}, {
N, N, N, N, N, N, N, N,
} };
static struct opcode group11[] = {
- I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov),
+ I(DstMem | SrcImm | Mov | PageTable, em_mov),
X7(D(Undefined)),
};
static struct gprefix pfx_0f_6f_0f_7f = {
- N, N, N, I(Sse, em_movdqu),
+ I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
+};
+
+static struct gprefix pfx_vmovntpx = {
+ I(0, em_mov), N, N, N,
};
static struct opcode opcode_table[256] = {
@@ -3464,10 +3682,10 @@ static struct opcode opcode_table[256] = {
/* 0x70 - 0x7F */
X16(D(SrcImmByte)),
/* 0x80 - 0x87 */
- G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
- G(DstMem | SrcImm | ModRM | Group, group1),
- G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
- G(DstMem | SrcImmByte | ModRM | Group, group1),
+ G(ByteOp | DstMem | SrcImm, group1),
+ G(DstMem | SrcImm, group1),
+ G(ByteOp | DstMem | SrcImm | No64, group1),
+ G(DstMem | SrcImmByte, group1),
I2bv(DstMem | SrcReg | ModRM, em_test),
I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
/* 0x88 - 0x8F */
@@ -3483,7 +3701,7 @@ static struct opcode opcode_table[256] = {
D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
I(SrcImmFAddr | No64, em_call_far), N,
II(ImplicitOps | Stack, em_pushf, pushf),
- II(ImplicitOps | Stack, em_popf, popf), N, N,
+ II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf),
/* 0xA0 - 0xA7 */
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
@@ -3506,7 +3724,8 @@ static struct opcode opcode_table[256] = {
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
- N, N, N, I(ImplicitOps | Stack, em_ret_far),
+ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
+ N, I(ImplicitOps | Stack, em_ret_far),
D(ImplicitOps), DI(SrcImmByte, intn),
D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
/* 0xD0 - 0xD7 */
@@ -3549,7 +3768,8 @@ static struct opcode twobyte_table[256] = {
IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
N, N, N, N,
- N, N, N, N, N, N, N, N,
+ N, N, N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
+ N, N, N, N,
/* 0x30 - 0x3F */
II(ImplicitOps | Priv, em_wrmsr, wrmsr),
IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
@@ -3579,7 +3799,7 @@ static struct opcode twobyte_table[256] = {
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
- DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
+ II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM), N, N,
/* 0xA8 - 0xAF */
@@ -3602,11 +3822,12 @@ static struct opcode twobyte_table[256] = {
I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
- /* 0xC0 - 0xCF */
+ /* 0xC0 - 0xC7 */
D2bv(DstMem | SrcReg | ModRM | Lock),
N, D(DstMem | SrcReg | ModRM | Mov),
N, N, N, GD(0, &group9),
- N, N, N, N, N, N, N, N,
+ /* 0xC8 - 0xCF */
+ X8(I(DstReg, em_bswap)),
/* 0xD0 - 0xDF */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0xE0 - 0xEF */
@@ -3897,17 +4118,16 @@ done_prefixes:
}
ctxt->d = opcode.flags;
+ if (ctxt->d & ModRM)
+ ctxt->modrm = insn_fetch(u8, ctxt);
+
while (ctxt->d & GroupMask) {
switch (ctxt->d & GroupMask) {
case Group:
- ctxt->modrm = insn_fetch(u8, ctxt);
- --ctxt->_eip;
goffset = (ctxt->modrm >> 3) & 7;
opcode = opcode.u.group[goffset];
break;
case GroupDual:
- ctxt->modrm = insn_fetch(u8, ctxt);
- --ctxt->_eip;
goffset = (ctxt->modrm >> 3) & 7;
if ((ctxt->modrm >> 6) == 3)
opcode = opcode.u.gdual->mod3[goffset];
@@ -3960,6 +4180,8 @@ done_prefixes:
if (ctxt->d & Sse)
ctxt->op_bytes = 16;
+ else if (ctxt->d & Mmx)
+ ctxt->op_bytes = 8;
/* ModRM and SIB bytes. */
if (ctxt->d & ModRM) {
@@ -4030,6 +4252,35 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
return false;
}
+static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
+{
+ bool fault = false;
+
+ ctxt->ops->get_fpu(ctxt);
+ asm volatile("1: fwait \n\t"
+ "2: \n\t"
+ ".pushsection .fixup,\"ax\" \n\t"
+ "3: \n\t"
+ "movb $1, %[fault] \n\t"
+ "jmp 2b \n\t"
+ ".popsection \n\t"
+ _ASM_EXTABLE(1b, 3b)
+ : [fault]"+qm"(fault));
+ ctxt->ops->put_fpu(ctxt);
+
+ if (unlikely(fault))
+ return emulate_exception(ctxt, MF_VECTOR, 0, false);
+
+ return X86EMUL_CONTINUE;
+}
+
+static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
+ struct operand *op)
+{
+ if (op->type == OP_MM)
+ read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+}
+
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
{
struct x86_emulate_ops *ops = ctxt->ops;
@@ -4054,18 +4305,31 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if ((ctxt->d & Sse)
- && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
- || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
+ if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)))
+ || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
rc = emulate_ud(ctxt);
goto done;
}
- if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+ if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
rc = emulate_nm(ctxt);
goto done;
}
+ if (ctxt->d & Mmx) {
+ rc = flush_pending_x87_faults(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
+ /*
+ * Now that we know the fpu is exception safe, we can fetch
+ * operands from it.
+ */
+ fetch_possible_mmx_operand(ctxt, &ctxt->src);
+ fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+ if (!(ctxt->d & Mov))
+ fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+ }
+
if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_PRE_EXCEPT);
@@ -4327,12 +4591,12 @@ twobyte_insn:
break;
case 0xb6 ... 0xb7: /* movzx */
ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
: (u16) ctxt->src.val;
break;
case 0xbe ... 0xbf: /* movsx */
ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
+ ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
(s16) ctxt->src.val;
break;
case 0xc0 ... 0xc1: /* xadd */
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index d68f99df690c..adba28f88d1a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -34,7 +34,6 @@
#include <linux/kvm_host.h>
#include <linux/slab.h>
-#include <linux/workqueue.h>
#include "irq.h"
#include "i8254.h"
@@ -249,7 +248,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
/* in this case, we had multiple outstanding pit interrupts
* that we needed to inject. Reinject
*/
- queue_work(ps->pit->wq, &ps->pit->expired);
+ queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
ps->irq_ack = 1;
spin_unlock(&ps->inject_lock);
}
@@ -270,7 +269,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
static void destroy_pit_timer(struct kvm_pit *pit)
{
hrtimer_cancel(&pit->pit_state.pit_timer.timer);
- cancel_work_sync(&pit->expired);
+ flush_kthread_work(&pit->expired);
}
static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -284,7 +283,7 @@ static struct kvm_timer_ops kpit_ops = {
.is_periodic = kpit_is_periodic,
};
-static void pit_do_work(struct work_struct *work)
+static void pit_do_work(struct kthread_work *work)
{
struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
struct kvm *kvm = pit->kvm;
@@ -328,7 +327,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
atomic_inc(&ktimer->pending);
- queue_work(pt->wq, &pt->expired);
+ queue_kthread_work(&pt->worker, &pt->expired);
}
if (ktimer->t_ops->is_periodic(ktimer)) {
@@ -353,7 +352,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
/* TODO The new value only affected after the retriggered */
hrtimer_cancel(&pt->timer);
- cancel_work_sync(&ps->pit->expired);
+ flush_kthread_work(&ps->pit->expired);
pt->period = interval;
ps->is_periodic = is_period;
@@ -669,6 +668,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
struct kvm_kpit_state *pit_state;
+ struct pid *pid;
+ pid_t pid_nr;
int ret;
pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
@@ -685,14 +686,20 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
mutex_lock(&pit->pit_state.lock);
spin_lock_init(&pit->pit_state.inject_lock);
- pit->wq = create_singlethread_workqueue("kvm-pit-wq");
- if (!pit->wq) {
+ pid = get_pid(task_tgid(current));
+ pid_nr = pid_vnr(pid);
+ put_pid(pid);
+
+ init_kthread_worker(&pit->worker);
+ pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
+ "kvm-pit/%d", pid_nr);
+ if (IS_ERR(pit->worker_task)) {
mutex_unlock(&pit->pit_state.lock);
kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
return NULL;
}
- INIT_WORK(&pit->expired, pit_do_work);
+ init_kthread_work(&pit->expired, pit_do_work);
kvm->arch.vpit = pit;
pit->kvm = kvm;
@@ -736,7 +743,7 @@ fail:
kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
kvm_free_irq_source_id(kvm, pit->irq_source_id);
- destroy_workqueue(pit->wq);
+ kthread_stop(pit->worker_task);
kfree(pit);
return NULL;
}
@@ -756,10 +763,10 @@ void kvm_free_pit(struct kvm *kvm)
mutex_lock(&kvm->arch.vpit->pit_state.lock);
timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
hrtimer_cancel(timer);
- cancel_work_sync(&kvm->arch.vpit->expired);
+ flush_kthread_work(&kvm->arch.vpit->expired);
+ kthread_stop(kvm->arch.vpit->worker_task);
kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
- destroy_workqueue(kvm->arch.vpit->wq);
kfree(kvm->arch.vpit);
}
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 51a97426e791..fdf40425ea1d 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -1,6 +1,8 @@
#ifndef __I8254_H
#define __I8254_H
+#include <linux/kthread.h>
+
#include "iodev.h"
struct kvm_kpit_channel_state {
@@ -39,8 +41,9 @@ struct kvm_pit {
struct kvm_kpit_state pit_state;
int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
- struct workqueue_struct *wq;
- struct work_struct expired;
+ struct kthread_worker worker;
+ struct task_struct *worker_task;
+ struct kthread_work expired;
};
#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 81cf4fa4a2be..1df8fb9e1d5d 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s)
pic_unlock(s);
}
-int kvm_pic_set_irq(void *opaque, int irq, int level)
+int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level)
{
- struct kvm_pic *s = opaque;
int ret = -1;
pic_lock(s);
if (irq >= 0 && irq < PIC_NUM_PINS) {
- ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
+ int irq_level = __kvm_irq_line_state(&s->irq_states[irq],
+ irq_source_id, level);
+ ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level);
pic_update_irq(s);
trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
s->pics[irq >> 3].imr, ret == 0);
@@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
return ret;
}
+void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id)
+{
+ int i;
+
+ pic_lock(s);
+ for (i = 0; i < PIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &s->irq_states[i]);
+ pic_unlock(s);
+}
+
/*
* acknowledge interrupt 'irq'
*/
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 858432287ab6..ce878788a39f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -92,6 +92,11 @@ static inline int apic_test_and_clear_vector(int vec, void *bitmap)
return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
+static inline int apic_test_vector(int vec, void *bitmap)
+{
+ return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
static inline void apic_set_vector(int vec, void *bitmap)
{
set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -102,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap)
clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
}
+static inline int __apic_test_and_set_vector(int vec, void *bitmap)
+{
+ return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
+static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
+{
+ return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
+}
+
static inline int apic_hw_enabled(struct kvm_lapic *apic)
{
return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE;
@@ -205,6 +220,16 @@ static int find_highest_vector(void *bitmap)
return fls(word[word_offset << 2]) - 1 + (word_offset << 5);
}
+static u8 count_vectors(void *bitmap)
+{
+ u32 *word = bitmap;
+ int word_offset;
+ u8 count = 0;
+ for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset)
+ count += hweight32(word[word_offset << 2]);
+ return count;
+}
+
static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
{
apic->irr_pending = true;
@@ -237,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
apic->irr_pending = true;
}
+static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
+{
+ if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+ * ISR (in service register) bit is set when injecting an interrupt.
+ * The highest vector is injected. Thus the latest bit set matches
+ * the highest bit in ISR.
+ */
+ apic->highest_isr_cache = vec;
+}
+
+static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
+{
+ if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR))
+ --apic->isr_count;
+ BUG_ON(apic->isr_count < 0);
+ apic->highest_isr_cache = -1;
+}
+
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -265,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
irq->level, irq->trig_mode);
}
+static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
+ sizeof(val));
+}
+
+static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
+{
+
+ return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
+}
+
+static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED;
+}
+
+static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
+{
+ u8 val;
+ if (pv_eoi_get_user(vcpu, &val) < 0)
+ apic_debug("Can't read EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+ return val & 0x1;
+}
+
+static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
+ apic_debug("Can't set EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+ return;
+ }
+ __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
+static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
+{
+ if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) {
+ apic_debug("Can't clear EOI MSR value: 0x%llx\n",
+ (unsigned long long)vcpi->arch.pv_eoi.msr_val);
+ return;
+ }
+ __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
+}
+
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
{
int result;
+ if (!apic->isr_count)
+ return -1;
+ if (likely(apic->highest_isr_cache != -1))
+ return apic->highest_isr_cache;
result = find_highest_vector(apic->regs + APIC_ISR);
ASSERT(result == -1 || result >= 16);
@@ -477,27 +575,33 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
}
-static void apic_set_eoi(struct kvm_lapic *apic)
+static int apic_set_eoi(struct kvm_lapic *apic)
{
int vector = apic_find_highest_isr(apic);
- int trigger_mode;
+
+ trace_kvm_eoi(apic, vector);
+
/*
* Not every write EOI will has corresponding ISR,
* one example is when Kernel check timer on setup_IO_APIC
*/
if (vector == -1)
- return;
+ return vector;
- apic_clear_vector(vector, apic->regs + APIC_ISR);
+ apic_clear_isr(vector, apic);
apic_update_ppr(apic);
- if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
- trigger_mode = IOAPIC_LEVEL_TRIG;
- else
- trigger_mode = IOAPIC_EDGE_TRIG;
- if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
+ if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
+ kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
+ int trigger_mode;
+ if (apic_test_vector(vector, apic->regs + APIC_TMR))
+ trigger_mode = IOAPIC_LEVEL_TRIG;
+ else
+ trigger_mode = IOAPIC_EDGE_TRIG;
kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+ }
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ return vector;
}
static void apic_send_ipi(struct kvm_lapic *apic)
@@ -1074,13 +1178,17 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
apic->irr_pending = false;
+ apic->isr_count = 0;
+ apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
if (kvm_vcpu_is_bsp(vcpu))
vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
vcpu->arch.apic_arb_prio = 0;
+ vcpu->arch.apic_attention = 0;
apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
@@ -1240,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
if (vector == -1)
return -1;
- apic_set_vector(vector, apic->regs + APIC_ISR);
+ apic_set_isr(vector, apic);
apic_update_ppr(apic);
apic_clear_irr(vector, apic);
return vector;
@@ -1259,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
update_divide_count(apic);
start_apic_timer(apic);
apic->irr_pending = true;
+ apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+ apic->highest_isr_cache = -1;
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -1275,12 +1385,52 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
+/*
+ * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
+ *
+ * Detect whether guest triggered PV EOI since the
+ * last entry. If yes, set EOI on guests's behalf.
+ * Clear PV EOI in guest memory in any case.
+ */
+static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ bool pending;
+ int vector;
+ /*
+ * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
+ * and KVM_PV_EOI_ENABLED in guest memory as follows:
+ *
+ * KVM_APIC_PV_EOI_PENDING is unset:
+ * -> host disabled PV EOI.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
+ * -> host enabled PV EOI, guest did not execute EOI yet.
+ * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
+ * -> host enabled PV EOI, guest executed EOI.
+ */
+ BUG_ON(!pv_eoi_enabled(vcpu));
+ pending = pv_eoi_get_pending(vcpu);
+ /*
+ * Clear pending bit in any case: it will be set again on vmentry.
+ * While this might not be ideal from performance point of view,
+ * this makes sure pv eoi is only enabled when we know it's safe.
+ */
+ pv_eoi_clr_pending(vcpu);
+ if (pending)
+ return;
+ vector = apic_set_eoi(apic);
+ trace_kvm_pv_eoi(apic, vector);
+}
+
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
{
u32 data;
void *vapic;
- if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+ if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention))
+ apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
vapic = kmap_atomic(vcpu->arch.apic->vapic_page);
@@ -1290,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
apic_set_tpr(vcpu->arch.apic, data & 0xff);
}
+/*
+ * apic_sync_pv_eoi_to_guest - called before vmentry
+ *
+ * Detect whether it's safe to enable PV EOI and
+ * if yes do so.
+ */
+static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
+ struct kvm_lapic *apic)
+{
+ if (!pv_eoi_enabled(vcpu) ||
+ /* IRR set or many bits in ISR: could be nested. */
+ apic->irr_pending ||
+ /* Cache not set: could be safe but we don't bother. */
+ apic->highest_isr_cache == -1 ||
+ /* Need EOI to update ioapic. */
+ kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
+ /*
+ * PV EOI was disabled by apic_sync_pv_eoi_from_guest
+ * so we need not do anything here.
+ */
+ return;
+ }
+
+ pv_eoi_set_pending(apic->vcpu);
+}
+
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
{
u32 data, tpr;
int max_irr, max_isr;
- struct kvm_lapic *apic;
+ struct kvm_lapic *apic = vcpu->arch.apic;
void *vapic;
- if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr)
+ apic_sync_pv_eoi_to_guest(vcpu, apic);
+
+ if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- apic = vcpu->arch.apic;
tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff;
max_irr = apic_find_highest_irr(apic);
if (max_irr < 0)
@@ -1317,10 +1494,11 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
{
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
-
vcpu->arch.apic->vapic_addr = vapic_addr;
+ if (vapic_addr)
+ __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
+ else
+ __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention);
}
int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
@@ -1385,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
return 0;
}
+
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
+{
+ u64 addr = data & ~KVM_MSR_ENABLED;
+ if (!IS_ALIGNED(addr, 4))
+ return 1;
+
+ vcpu->arch.pv_eoi.msr_val = data;
+ if (!pv_eoi_enabled(vcpu))
+ return 0;
+ return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ addr);
+}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6f4ce2575d09..4af5405ae1e2 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,6 +13,15 @@ struct kvm_lapic {
u32 divide_count;
struct kvm_vcpu *vcpu;
bool irr_pending;
+ /* Number of bits set in ISR. */
+ s16 isr_count;
+ /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
+ int highest_isr_cache;
+ /**
+ * APIC register page. The layout matches the register layout seen by
+ * the guest 1:1, because it is accessed by the vmx microcode.
+ * Note: Only one register, the TPR, is used by the microcode.
+ */
void *regs;
gpa_t vapic_addr;
struct page *vapic_page;
@@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
{
return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
}
+
+int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4cb164268846..01ca00423938 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -90,7 +90,7 @@ module_param(dbg, bool, 0644);
#define PTE_PREFETCH_NUM 8
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
#define PT64_LEVEL_BITS 9
@@ -135,8 +135,6 @@ module_param(dbg, bool, 0644);
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
| PT64_NX_MASK)
-#define PTE_LIST_EXT 4
-
#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
#define ACC_USER_MASK PT_USER_MASK
@@ -147,10 +145,14 @@ module_param(dbg, bool, 0644);
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
+/* make pte_list_desc fit well in cache line */
+#define PTE_LIST_EXT 3
+
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
@@ -187,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static void mmu_spte_set(u64 *sptep, u64 spte);
+static void mmu_free_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
{
@@ -443,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
}
#endif
+static bool spte_is_locklessly_modifiable(u64 spte)
+{
+ return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
static bool spte_has_volatile_bits(u64 spte)
{
+ /*
+ * Always atomicly update spte if it can be updated
+ * out of mmu-lock, it can ensure dirty bit is not lost,
+ * also, it can help us to get a stable is_writable_pte()
+ * to ensure tlb flush is not missed.
+ */
+ if (spte_is_locklessly_modifiable(spte))
+ return true;
+
if (!shadow_accessed_mask)
return false;
@@ -477,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changged.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB, the return value indicates this
+ * case.
*/
-static void mmu_spte_update(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
- u64 mask, old_spte = *sptep;
+ u64 old_spte = *sptep;
+ bool ret = false;
WARN_ON(!is_rmap_spte(new_spte));
- if (!is_shadow_present_pte(old_spte))
- return mmu_spte_set(sptep, new_spte);
-
- new_spte |= old_spte & shadow_dirty_mask;
-
- mask = shadow_accessed_mask;
- if (is_writable_pte(old_spte))
- mask |= shadow_dirty_mask;
+ if (!is_shadow_present_pte(old_spte)) {
+ mmu_spte_set(sptep, new_spte);
+ return ret;
+ }
- if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+ if (!spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, new_spte);
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
+ /*
+ * For the spte updated out of mmu-lock is safe, since
+ * we always atomicly update it, see the comments in
+ * spte_has_volatile_bits().
+ */
+ if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+ ret = true;
+
if (!shadow_accessed_mask)
- return;
+ return ret;
if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+
+ return ret;
}
/*
@@ -550,19 +580,29 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
- rcu_read_lock();
- atomic_inc(&vcpu->kvm->arch.reader_counter);
-
- /* Increase the counter before walking shadow page table */
- smp_mb__after_atomic_inc();
+ /*
+ * Prevent page table teardown by making any free-er wait during
+ * kvm_flush_remote_tlbs() IPI to all active vcpus.
+ */
+ local_irq_disable();
+ vcpu->mode = READING_SHADOW_PAGE_TABLES;
+ /*
+ * Make sure a following spte read is not reordered ahead of the write
+ * to vcpu->mode.
+ */
+ smp_mb();
}
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
- /* Decrease the counter after walking shadow page table finished */
- smp_mb__before_atomic_dec();
- atomic_dec(&vcpu->kvm->arch.reader_counter);
- rcu_read_unlock();
+ /*
+ * Make sure the write to vcpu->mode is not reordered in front of
+ * reads to sptes. If it does, kvm_commit_zap_page() can see us
+ * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+ */
+ smp_mb();
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+ local_irq_enable();
}
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -641,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
mmu_page_header_cache);
}
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
- size_t size)
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
void *p;
@@ -653,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
- return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
- sizeof(struct pte_list_desc));
+ return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}
static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -841,32 +879,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
return count;
}
-static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
-{
- struct pte_list_desc *desc;
- u64 *prev_spte;
- int i;
-
- if (!*pte_list)
- return NULL;
- else if (!(*pte_list & 1)) {
- if (!spte)
- return (u64 *)*pte_list;
- return NULL;
- }
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
- prev_spte = NULL;
- while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
- if (prev_spte == spte)
- return desc->sptes[i];
- prev_spte = desc->sptes[i];
- }
- desc = desc->more;
- }
- return NULL;
-}
-
static void
pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
int i, struct pte_list_desc *prev_desc)
@@ -987,11 +999,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
return pte_list_add(vcpu, spte, rmapp);
}
-static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
-{
- return pte_list_next(rmapp, spte);
-}
-
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct kvm_mmu_page *sp;
@@ -1004,106 +1011,248 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
pte_list_remove(spte, rmapp);
}
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * rmap. All fields are private and not assumed to be used outside.
+ */
+struct rmap_iterator {
+ /* private fields */
+ struct pte_list_desc *desc; /* holds the sptep if not NULL */
+ int pos; /* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function. This should also be used after
+ * removing/dropping sptes from the rmap link because in such cases the
+ * information in the itererator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
+{
+ if (!rmap)
+ return NULL;
+
+ if (!(rmap & 1)) {
+ iter->desc = NULL;
+ return (u64 *)rmap;
+ }
+
+ iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
+ iter->pos = 0;
+ return iter->desc->sptes[iter->pos];
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_next(struct rmap_iterator *iter)
+{
+ if (iter->desc) {
+ if (iter->pos < PTE_LIST_EXT - 1) {
+ u64 *sptep;
+
+ ++iter->pos;
+ sptep = iter->desc->sptes[iter->pos];
+ if (sptep)
+ return sptep;
+ }
+
+ iter->desc = iter->desc->more;
+
+ if (iter->desc) {
+ iter->pos = 0;
+ /* desc->sptes[0] cannot be NULL */
+ return iter->desc->sptes[iter->pos];
+ }
+ }
+
+ return NULL;
+}
+
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
if (mmu_spte_clear_track_bits(sptep))
rmap_remove(kvm, sptep);
}
-int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
- struct kvm_memory_slot *slot)
+
+static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
- unsigned long *rmapp;
- u64 *spte;
- int i, write_protected = 0;
-
- rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot);
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- if (is_writable_pte(*spte)) {
- mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
- write_protected = 1;
- }
- spte = rmap_next(rmapp, spte);
+ if (is_large_pte(*sptep)) {
+ WARN_ON(page_header(__pa(sptep))->role.level ==
+ PT_PAGE_TABLE_LEVEL);
+ drop_spte(kvm, sptep);
+ --kvm->stat.lpages;
+ return true;
}
- /* check for huge page mappings */
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- rmapp = __gfn_to_rmap(gfn, i, slot);
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON(!is_large_pte(*spte));
- pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
- if (is_writable_pte(*spte)) {
- drop_spte(kvm, spte);
- --kvm->stat.lpages;
- spte = NULL;
- write_protected = 1;
- }
- spte = rmap_next(rmapp, spte);
+ return false;
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (__drop_large_spte(vcpu->kvm, sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+/*
+ * Write-protect on the specified @sptep, @pt_protect indicates whether
+ * spte writ-protection is caused by protecting shadow page table.
+ * @flush indicates whether tlb need be flushed.
+ *
+ * Note: write protection is difference between drity logging and spte
+ * protection:
+ * - for dirty logging, the spte can be set to writable at anytime if
+ * its dirty bitmap is properly set.
+ * - for spte protection, the spte can be writable only after unsync-ing
+ * shadow page.
+ *
+ * Return true if the spte is dropped.
+ */
+static bool
+spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
+{
+ u64 spte = *sptep;
+
+ if (!is_writable_pte(spte) &&
+ !(pt_protect && spte_is_locklessly_modifiable(spte)))
+ return false;
+
+ rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+ if (__drop_large_spte(kvm, sptep)) {
+ *flush |= true;
+ return true;
+ }
+
+ if (pt_protect)
+ spte &= ~SPTE_MMU_WRITEABLE;
+ spte = spte & ~PT_WRITABLE_MASK;
+
+ *flush |= mmu_spte_update(sptep, spte);
+ return false;
+}
+
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+ int level, bool pt_protect)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+ BUG_ON(!(*sptep & PT_PRESENT_MASK));
+ if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
+ sptep = rmap_get_first(*rmapp, &iter);
+ continue;
}
+
+ sptep = rmap_get_next(&iter);
}
- return write_protected;
+ return flush;
}
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+/**
+ * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to protect
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should protect
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ unsigned long *rmapp;
+
+ while (mask) {
+ rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
+ __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+
+static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
{
struct kvm_memory_slot *slot;
+ unsigned long *rmapp;
+ int i;
+ bool write_protected = false;
slot = gfn_to_memslot(kvm, gfn);
- return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
+
+ for (i = PT_PAGE_TABLE_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ rmapp = __gfn_to_rmap(gfn, i, slot);
+ write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
+ }
+
+ return write_protected;
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data)
{
- u64 *spte;
+ u64 *sptep;
+ struct rmap_iterator iter;
int need_tlb_flush = 0;
- while ((spte = rmap_next(rmapp, NULL))) {
- BUG_ON(!(*spte & PT_PRESENT_MASK));
- rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- drop_spte(kvm, spte);
+ while ((sptep = rmap_get_first(*rmapp, &iter))) {
+ BUG_ON(!(*sptep & PT_PRESENT_MASK));
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
+
+ drop_spte(kvm, sptep);
need_tlb_flush = 1;
}
+
return need_tlb_flush;
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data)
{
+ u64 *sptep;
+ struct rmap_iterator iter;
int need_flush = 0;
- u64 *spte, new_spte;
+ u64 new_spte;
pte_t *ptep = (pte_t *)data;
pfn_t new_pfn;
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- BUG_ON(!is_shadow_present_pte(*spte));
- rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+
+ for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
+
need_flush = 1;
+
if (pte_write(*ptep)) {
- drop_spte(kvm, spte);
- spte = rmap_next(rmapp, NULL);
+ drop_spte(kvm, sptep);
+ sptep = rmap_get_first(*rmapp, &iter);
} else {
- new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+ new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
new_spte |= (u64)new_pfn << PAGE_SHIFT;
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
new_spte &= ~shadow_accessed_mask;
- mmu_spte_clear_track_bits(spte);
- mmu_spte_set(spte, new_spte);
- spte = rmap_next(rmapp, spte);
+
+ mmu_spte_clear_track_bits(sptep);
+ mmu_spte_set(sptep, new_spte);
+ sptep = rmap_get_next(&iter);
}
}
+
if (need_flush)
kvm_flush_remote_tlbs(kvm);
@@ -1162,11 +1311,13 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data)
{
- u64 *spte;
+ u64 *sptep;
+ struct rmap_iterator uninitialized_var(iter);
int young = 0;
/*
- * Emulate the accessed bit for EPT, by checking if this page has
+ * In case of absence of EPT Access and Dirty Bits supports,
+ * emulate the accessed bit for EPT, by checking if this page has
* an EPT mapping, and clearing it if it does. On the next access,
* a new EPT mapping will be established.
* This has some overhead, but not as much as the cost of swapping
@@ -1175,25 +1326,25 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (!shadow_accessed_mask)
return kvm_unmap_rmapp(kvm, rmapp, data);
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- int _young;
- u64 _spte = *spte;
- BUG_ON(!(_spte & PT_PRESENT_MASK));
- _young = _spte & PT_ACCESSED_MASK;
- if (_young) {
+ for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+ sptep = rmap_get_next(&iter)) {
+ BUG_ON(!is_shadow_present_pte(*sptep));
+
+ if (*sptep & shadow_accessed_mask) {
young = 1;
- clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
}
- spte = rmap_next(rmapp, spte);
}
+
return young;
}
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data)
{
- u64 *spte;
+ u64 *sptep;
+ struct rmap_iterator iter;
int young = 0;
/*
@@ -1204,16 +1355,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (!shadow_accessed_mask)
goto out;
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- u64 _spte = *spte;
- BUG_ON(!(_spte & PT_PRESENT_MASK));
- young = _spte & PT_ACCESSED_MASK;
- if (young) {
+ for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+ sptep = rmap_get_next(&iter)) {
+ BUG_ON(!is_shadow_present_pte(*sptep));
+
+ if (*sptep & shadow_accessed_mask) {
young = 1;
break;
}
- spte = rmap_next(rmapp, spte);
}
out:
return young;
@@ -1328,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
u64 *parent_pte, int direct)
{
struct kvm_mmu_page *sp;
- sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
- sizeof *sp);
- sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+ sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
if (!direct)
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
- PAGE_SIZE);
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
@@ -1628,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
- int protected = 0;
+ bool protected = false;
for_each_sp(pages, sp, parents, i)
protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
@@ -1793,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
mmu_spte_set(sptep, spte);
}
-static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
-{
- if (is_large_pte(*sptep)) {
- drop_spte(vcpu->kvm, sptep);
- --vcpu->kvm->stat.lpages;
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-}
-
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned direct_access)
{
@@ -1865,10 +2003,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- u64 *parent_pte;
+ u64 *sptep;
+ struct rmap_iterator iter;
- while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
- drop_parent_pte(sp, parent_pte);
+ while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
+ drop_parent_pte(sp, sptep);
}
static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1925,30 +2064,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
return ret;
}
-static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
-{
- struct kvm_mmu_page *sp;
-
- list_for_each_entry(sp, invalid_list, link)
- kvm_mmu_isolate_page(sp);
-}
-
-static void free_pages_rcu(struct rcu_head *head)
-{
- struct kvm_mmu_page *next, *sp;
-
- sp = container_of(head, struct kvm_mmu_page, rcu);
- while (sp) {
- if (!list_empty(&sp->link))
- next = list_first_entry(&sp->link,
- struct kvm_mmu_page, link);
- else
- next = NULL;
- kvm_mmu_free_page(sp);
- sp = next;
- }
-}
-
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
{
@@ -1957,17 +2072,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
if (list_empty(invalid_list))
return;
- kvm_flush_remote_tlbs(kvm);
-
- if (atomic_read(&kvm->arch.reader_counter)) {
- kvm_mmu_isolate_pages(invalid_list);
- sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
- list_del_init(invalid_list);
+ /*
+ * wmb: make sure everyone sees our modifications to the page tables
+ * rmb: make sure we see changes to vcpu->mode
+ */
+ smp_mb();
- trace_kvm_mmu_delay_free_pages(sp);
- call_rcu(&sp->rcu, free_pages_rcu);
- return;
- }
+ /*
+ * Wait for all vcpus to exit guest mode and/or lockless shadow
+ * page table walks.
+ */
+ kvm_flush_remote_tlbs(kvm);
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
@@ -1975,7 +2090,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
kvm_mmu_isolate_page(sp);
kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));
-
}
/*
@@ -2194,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
- u64 spte, entry = *sptep;
+ u64 spte;
int ret = 0;
if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2208,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= shadow_x_mask;
else
spte |= shadow_nx_mask;
+
if (pte_access & ACC_USER_MASK)
spte |= shadow_user_mask;
+
if (level > PT_PAGE_TABLE_LEVEL)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
@@ -2234,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
goto done;
}
- spte |= PT_WRITABLE_MASK;
+ spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
if (!vcpu->arch.mmu.direct_map
&& !(pte_access & ACC_WRITE_MASK)) {
@@ -2263,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writable_pte(spte))
- spte &= ~PT_WRITABLE_MASK;
+ spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
}
}
@@ -2272,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- mmu_spte_update(sptep, spte);
- /*
- * If we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB.
- */
- if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+ if (mmu_spte_update(sptep, spte))
kvm_flush_remote_tlbs(vcpu->kvm);
done:
return ret;
@@ -2354,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
+ mmu_free_roots(vcpu);
}
static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2546,8 +2655,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
*gfnp = gfn;
kvm_release_pfn_clean(pfn);
pfn &= ~mask;
- if (!get_page_unless_zero(pfn_to_page(pfn)))
- BUG();
+ kvm_get_pfn(pfn);
*pfnp = pfn;
}
}
@@ -2577,18 +2685,116 @@ exit:
return ret;
}
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+{
+ /*
+ * #PF can be fast only if the shadow page table is present and it
+ * is caused by write-protect, that means we just need change the
+ * W bit of the spte which can be done out of mmu-lock.
+ */
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ !(error_code & PFERR_WRITE_MASK))
+ return false;
+
+ return true;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ gfn_t gfn;
+
+ WARN_ON(!sp->role.direct);
+
+ /*
+ * The gfn of direct spte is stable since it is calculated
+ * by sp->gfn.
+ */
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+ mark_page_dirty(vcpu->kvm, gfn);
+
+ return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu to access on the same address again.
+ * - false: let the real page fault path to fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+ u32 error_code)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ bool ret = false;
+ u64 spte = 0ull;
+
+ if (!page_fault_can_be_fast(vcpu, error_code))
+ return false;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+ if (!is_shadow_present_pte(spte) || iterator.level < level)
+ break;
+
+ /*
+ * If the mapping has been changed, let the vcpu fault on the
+ * same address again.
+ */
+ if (!is_rmap_spte(spte)) {
+ ret = true;
+ goto exit;
+ }
+
+ if (!is_last_spte(spte, level))
+ goto exit;
+
+ /*
+ * Check if it is a spurious fault caused by TLB lazily flushed.
+ *
+ * Need not check the access of upper level table entries since
+ * they are always ACC_ALL.
+ */
+ if (is_writable_pte(spte)) {
+ ret = true;
+ goto exit;
+ }
+
+ /*
+ * Currently, to simplify the code, only the spte write-protected
+ * by dirty-log can be fast fixed.
+ */
+ if (!spte_is_locklessly_modifiable(spte))
+ goto exit;
+
+ /*
+ * Currently, fast page fault only works for direct mapping since
+ * the gfn is not stable for indirect shadow page.
+ * See Documentation/virtual/kvm/locking.txt to get more detail.
+ */
+ ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+exit:
+ trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+ spte, ret);
+ walk_shadow_page_lockless_end(vcpu);
+
+ return ret;
+}
+
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
- bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+ gfn_t gfn, bool prefault)
{
int r;
int level;
int force_pt_level;
pfn_t pfn;
unsigned long mmu_seq;
- bool map_writable;
+ bool map_writable, write = error_code & PFERR_WRITE_MASK;
force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
if (likely(!force_pt_level)) {
@@ -2605,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
} else
level = PT_PAGE_TABLE_LEVEL;
+ if (fast_page_fault(vcpu, v, level, error_code))
+ return 0;
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -2993,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn, prefault);
+ error_code, gfn, prefault);
}
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3073,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
} else
level = PT_PAGE_TABLE_LEVEL;
+ if (fast_page_fault(vcpu, gpa, level, error_code))
+ return 0;
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -3554,7 +3766,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp)
* Skip write-flooding detected for the sp whose level is 1, because
* it can become unsync, then the guest page is not write-protected.
*/
- if (sp->role.level == 1)
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL)
return false;
return ++sp->write_flooding_count >= 3;
@@ -3837,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
+ bool flush = false;
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
@@ -3851,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
!is_last_spte(pt[i], sp->role.level))
continue;
- if (is_large_pte(pt[i])) {
- drop_spte(kvm, &pt[i]);
- --kvm->stat.lpages;
- continue;
- }
-
- /* avoid RMW */
- if (is_writable_pte(pt[i]))
- mmu_spte_update(&pt[i],
- pt[i] & ~PT_WRITABLE_MASK);
+ spte_write_protect(kvm, &pt[i], &flush, false);
}
}
kvm_flush_remote_tlbs(kvm);
@@ -3886,6 +4090,9 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
{
struct kvm_mmu_page *page;
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ return;
+
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
@@ -3894,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
struct kvm *kvm;
- struct kvm *kvm_freed = NULL;
int nr_to_scan = sc->nr_to_scan;
if (nr_to_scan == 0)
@@ -3906,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
int idx;
LIST_HEAD(invalid_list);
+ /*
+ * n_used_mmu_pages is accessed without holding kvm->mmu_lock
+ * here. We may skip a VM instance errorneosly, but we do not
+ * want to shrink a VM that only started to populate its MMU
+ * anyway.
+ */
+ if (kvm->arch.n_used_mmu_pages > 0) {
+ if (!nr_to_scan--)
+ break;
+ continue;
+ }
+
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
- if (!kvm_freed && nr_to_scan > 0 &&
- kvm->arch.n_used_mmu_pages > 0) {
- kvm_mmu_remove_some_alloc_mmu_pages(kvm,
- &invalid_list);
- kvm_freed = kvm;
- }
- nr_to_scan--;
+ kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
+
+ list_move_tail(&kvm->vm_list, &vm_list);
+ break;
}
- if (kvm_freed)
- list_move_tail(&kvm_freed->vm_list, &vm_list);
raw_spin_unlock(&kvm_lock);
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 715da5a19a5b..7d7d0b9e23eb 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct kvm_memory_slot *slot;
unsigned long *rmapp;
- u64 *spte;
+ u64 *sptep;
+ struct rmap_iterator iter;
if (sp->role.direct || sp->unsync || sp->role.invalid)
return;
@@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
slot = gfn_to_memslot(kvm, sp->gfn);
rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
- spte = rmap_next(rmapp, NULL);
- while (spte) {
- if (is_writable_pte(*spte))
+ for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+ sptep = rmap_get_next(&iter)) {
+ if (is_writable_pte(*sptep))
audit_printk(kvm, "shadow page has writable "
"mappings: gfn %llx role %x\n",
sp->gfn, sp->role.word);
- spte = rmap_next(rmapp, spte);
}
}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e81322a..cd6e98333ba3 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
*/
TRACE_EVENT(
kvm_mmu_pagetable_walk,
- TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
- TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+ TP_PROTO(u64 addr, u32 pferr),
+ TP_ARGS(addr, pferr),
TP_STRUCT__entry(
__field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(
TP_fast_assign(
__entry->addr = addr;
- __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
- | (!!fetch_fault << 4);
+ __entry->pferr = pferr;
),
TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
@@ -243,6 +242,44 @@ TRACE_EVENT(
TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
__entry->access)
);
+
+#define __spte_satisfied(__spte) \
+ (__entry->retry && is_writable_pte(__entry->__spte))
+
+TRACE_EVENT(
+ fast_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ u64 *sptep, u64 old_spte, bool retry),
+ TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(gva_t, gva)
+ __field(u32, error_code)
+ __field(u64 *, sptep)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ __field(bool, retry)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->gva = gva;
+ __entry->error_code = error_code;
+ __entry->sptep = sptep;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = *sptep;
+ __entry->retry = retry;
+ ),
+
+ TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
+ " new %llx spurious %d fixed %d", __entry->vcpu_id,
+ __entry->gva, __print_flags(__entry->error_code, "|",
+ kvm_mmu_trace_pferr_flags), __entry->sptep,
+ __entry->old_spte, __entry->new_spte,
+ __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+ )
+);
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index df5a70311be8..bb7cf01cae76 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
const int fetch_fault = access & PFERR_FETCH_MASK;
u16 errcode = 0;
- trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
- fetch_fault);
+ trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
eperm = false;
walker->level = mmu->root_level;
@@ -658,7 +657,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
{
int offset = 0;
- WARN_ON(sp->role.level != 1);
+ WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 2e88438ffd83..9b7ec1150ab0 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -80,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
{
- if (idx < X86_PMC_IDX_FIXED)
+ if (idx < INTEL_PMC_IDX_FIXED)
return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
else
- return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED);
+ return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED);
}
void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
@@ -291,7 +291,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx)
if (pmc_is_gp(pmc))
reprogram_gp_counter(pmc, pmc->eventsel);
else {
- int fidx = idx - X86_PMC_IDX_FIXED;
+ int fidx = idx - INTEL_PMC_IDX_FIXED;
reprogram_fixed_counter(pmc,
fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
}
@@ -452,7 +452,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
return;
pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
- X86_PMC_MAX_GENERIC);
+ INTEL_PMC_MAX_GENERIC);
pmu->counter_bitmask[KVM_PMC_GP] =
((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
bitmap_len = (entry->eax >> 24) & 0xff;
@@ -462,13 +462,13 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
pmu->nr_arch_fixed_counters = 0;
} else {
pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
- X86_PMC_MAX_FIXED);
+ INTEL_PMC_MAX_FIXED);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
}
pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
- (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED);
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
pmu->global_ctrl_mask = ~pmu->global_ctrl;
}
@@ -478,15 +478,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = &vcpu->arch.pmu;
memset(pmu, 0, sizeof(*pmu));
- for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
+ for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
}
- for (i = 0; i < X86_PMC_MAX_FIXED; i++) {
+ for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
- pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED;
+ pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
}
init_irq_work(&pmu->irq_work, trigger_pmi);
kvm_pmu_cpuid_update(vcpu);
@@ -498,13 +498,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
int i;
irq_work_sync(&pmu->irq_work);
- for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
+ for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
struct kvm_pmc *pmc = &pmu->gp_counters[i];
stop_counter(pmc);
pmc->counter = pmc->eventsel = 0;
}
- for (i = 0; i < X86_PMC_MAX_FIXED; i++)
+ for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
stop_counter(&pmu->fixed_counters[i]);
pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e334389e1c75..baead950d6c8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -22,6 +22,7 @@
#include "x86.h"
#include <linux/module.h>
+#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
@@ -42,6 +43,12 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
+static const struct x86_cpu_id svm_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_SVM),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+
#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1
@@ -3178,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
break;
case MSR_IA32_DEBUGCTLMSR:
if (!boot_cpu_has(X86_FEATURE_LBRV)) {
- pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
- __func__, data);
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+ __func__, data);
break;
}
if (data & DEBUGCTL_RESERVED_BITS)
@@ -3198,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
case MSR_VM_CR:
return svm_set_vm_cr(vcpu, data);
case MSR_VM_IGNNE:
- pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
default:
return kvm_set_msr_common(vcpu, ecx, data);
@@ -3240,6 +3247,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
mark_dirty(svm->vmcb, VMCB_INTR);
+ ++svm->vcpu.stat.irq_window_exits;
/*
* If the user space waits to inject interrupts, exit as soon as
* possible
@@ -3247,7 +3255,6 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
if (!irqchip_in_kernel(svm->vcpu.kvm) &&
kvm_run->request_interrupt_window &&
!kvm_cpu_has_interrupt(&svm->vcpu)) {
- ++svm->vcpu.stat.irq_window_exits;
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
return 0;
}
@@ -4037,6 +4044,11 @@ static bool svm_rdtscp_supported(void)
return false;
}
+static bool svm_invpcid_supported(void)
+{
+ return false;
+}
+
static bool svm_has_wbinvd_exit(void)
{
return true;
@@ -4305,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.cpuid_update = svm_cpuid_update,
.rdtscp_supported = svm_rdtscp_supported,
+ .invpcid_supported = svm_invpcid_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 911d2641f14c..a71faf727ff3 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq,
__entry->coalesced ? " (coalesced)" : "")
);
+TRACE_EVENT(kvm_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
+TRACE_EVENT(kvm_pv_eoi,
+ TP_PROTO(struct kvm_lapic *apic, int vector),
+ TP_ARGS(apic, vector),
+
+ TP_STRUCT__entry(
+ __field( __u32, apicid )
+ __field( int, vector )
+ ),
+
+ TP_fast_assign(
+ __entry->apicid = apic->vcpu->vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector)
+);
+
/*
* Tracepoint for nested VMRUN
*/
@@ -710,16 +744,6 @@ TRACE_EVENT(kvm_skinit,
__entry->rip, __entry->slb)
);
-#define __print_insn(insn, ilen) ({ \
- int i; \
- const char *ret = p->buffer + p->len; \
- \
- for (i = 0; i < ilen; ++i) \
- trace_seq_printf(p, " %02x", insn[i]); \
- trace_seq_printf(p, "%c", 0); \
- ret; \
- })
-
#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
#define KVM_EMUL_INSN_F_CS_D (1 << 2)
@@ -786,7 +810,7 @@ TRACE_EVENT(kvm_emulate_insn,
TP_printk("%x:%llx:%s (%s)%s",
__entry->csbase, __entry->rip,
- __print_insn(__entry->insn, __entry->len),
+ __print_hex(__entry->insn, __entry->len),
__print_symbolic(__entry->flags,
kvm_trace_symbol_emul_flags),
__entry->failed ? " failed" : ""
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4ff0ab9bc3c8..c39b60707e02 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
+#include <linux/mod_devicetable.h>
#include <linux/ftrace_event.h>
#include <linux/slab.h>
#include <linux/tboot.h>
@@ -51,6 +52,12 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
+static const struct x86_cpu_id vmx_cpu_id[] = {
+ X86_FEATURE_MATCH(X86_FEATURE_VMX),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+
static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
@@ -64,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
enable_unrestricted_guest, bool, S_IRUGO);
-static bool __read_mostly emulate_invalid_guest_state = 0;
+static bool __read_mostly enable_ept_ad_bits = 1;
+module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
+
+static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static bool __read_mostly vmm_exclusive = 1;
@@ -386,6 +396,9 @@ struct vcpu_vmx {
struct {
int loaded;
u16 fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+ u16 ds_sel, es_sel;
+#endif
int gs_ldt_reload_needed;
int fs_reload_needed;
} host_state;
@@ -605,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
+static void vmx_set_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
+static void vmx_get_segment(struct kvm_vcpu *vcpu,
+ struct kvm_segment *var, int seg);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -779,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void)
return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}
+static inline bool cpu_has_vmx_ept_ad_bits(void)
+{
+ return vmx_capability.ept & VMX_EPT_AD_BIT;
+}
+
static inline bool cpu_has_vmx_invept_individual_addr(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -839,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void)
SECONDARY_EXEC_RDTSCP;
}
+static inline bool cpu_has_vmx_invpcid(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_INVPCID;
+}
+
static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -1411,6 +1439,11 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
}
#ifdef CONFIG_X86_64
+ savesegment(ds, vmx->host_state.ds_sel);
+ savesegment(es, vmx->host_state.es_sel);
+#endif
+
+#ifdef CONFIG_X86_64
vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
@@ -1450,6 +1483,19 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
}
if (vmx->host_state.fs_reload_needed)
loadsegment(fs, vmx->host_state.fs_sel);
+#ifdef CONFIG_X86_64
+ if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
+ loadsegment(ds, vmx->host_state.ds_sel);
+ loadsegment(es, vmx->host_state.es_sel);
+ }
+#else
+ /*
+ * The sysexit path does not restore ds/es, so we must set them to
+ * a reasonable value ourselves.
+ */
+ loadsegment(ds, __USER_DS);
+ loadsegment(es, __USER_DS);
+#endif
reload_tss();
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
@@ -1711,6 +1757,11 @@ static bool vmx_rdtscp_supported(void)
return cpu_has_vmx_rdtscp();
}
+static bool vmx_invpcid_supported(void)
+{
+ return cpu_has_vmx_invpcid() && enable_ept;
+}
+
/*
* Swap MSR entry in host/guest MSR entry array.
*/
@@ -2430,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
- SECONDARY_EXEC_RDTSCP;
+ SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_INVPCID;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -2617,8 +2669,12 @@ static __init int hardware_setup(void)
!cpu_has_vmx_ept_4levels()) {
enable_ept = 0;
enable_unrestricted_guest = 0;
+ enable_ept_ad_bits = 0;
}
+ if (!cpu_has_vmx_ept_ad_bits())
+ enable_ept_ad_bits = 0;
+
if (!cpu_has_vmx_unrestricted_guest())
enable_unrestricted_guest = 0;
@@ -2742,6 +2798,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_segment var;
if (enable_unrestricted_guest)
return;
@@ -2785,20 +2842,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
if (emulate_invalid_guest_state)
goto continue_rmode;
- vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
- vmcs_write32(GUEST_SS_LIMIT, 0xffff);
- vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+ vmx_get_segment(vcpu, &var, VCPU_SREG_SS);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_SS);
- vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
- vmcs_write32(GUEST_CS_LIMIT, 0xffff);
- if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
- vmcs_writel(GUEST_CS_BASE, 0xf0000);
- vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
+ vmx_get_segment(vcpu, &var, VCPU_SREG_CS);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_CS);
- fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
- fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
- fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
- fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+ vmx_get_segment(vcpu, &var, VCPU_SREG_ES);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_ES);
+
+ vmx_get_segment(vcpu, &var, VCPU_SREG_DS);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_DS);
+
+ vmx_get_segment(vcpu, &var, VCPU_SREG_GS);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_GS);
+
+ vmx_get_segment(vcpu, &var, VCPU_SREG_FS);
+ vmx_set_segment(vcpu, &var, VCPU_SREG_FS);
continue_rmode:
kvm_mmu_reset_context(vcpu);
@@ -2999,6 +3059,8 @@ static u64 construct_eptp(unsigned long root_hpa)
/* TODO write the value reading from MSR */
eptp = VMX_EPT_DEFAULT_MT |
VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
+ if (enable_ept_ad_bits)
+ eptp |= VMX_EPT_AD_ENABLE_BIT;
eptp |= (root_hpa & PAGE_MASK);
return eptp;
@@ -3125,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations
+ * fail; use the cache instead.
+ */
+ if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) {
+ return vmx->cpl;
+ }
+
if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
- to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
+ vmx->cpl = __vmx_get_cpl(vcpu);
}
- return to_vmx(vcpu)->cpl;
+
+ return vmx->cpl;
}
@@ -3137,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
- if (var->unusable)
+ if (var->unusable || !var->present)
ar = 1 << 16;
else {
ar = var->type & 15;
@@ -3149,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
ar |= (var->db & 1) << 14;
ar |= (var->g & 1) << 15;
}
- if (ar == 0) /* a 0 value means unusable */
- ar = AR_UNUSABLE_MASK;
return ar;
}
@@ -3201,6 +3272,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
vmcs_write32(sf->ar_bytes, ar);
__clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
+
+ /*
+ * Fix segments for real mode guest in hosts that don't have
+ * "unrestricted_mode" or it was disabled.
+ * This is done to allow migration of the guests from hosts with
+ * unrestricted guest like Westmere to older host that don't have
+ * unrestricted guest like Nehelem.
+ */
+ if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
+ switch (seg) {
+ case VCPU_SREG_CS:
+ vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
+ vmcs_write32(GUEST_CS_LIMIT, 0xffff);
+ if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
+ vmcs_writel(GUEST_CS_BASE, 0xf0000);
+ vmcs_write16(GUEST_CS_SELECTOR,
+ vmcs_readl(GUEST_CS_BASE) >> 4);
+ break;
+ case VCPU_SREG_ES:
+ fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
+ break;
+ case VCPU_SREG_DS:
+ fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
+ break;
+ case VCPU_SREG_GS:
+ fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
+ break;
+ case VCPU_SREG_FS:
+ fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
+ break;
+ case VCPU_SREG_SS:
+ vmcs_write16(GUEST_SS_SELECTOR,
+ vmcs_readl(GUEST_SS_BASE) >> 4);
+ vmcs_write32(GUEST_SS_LIMIT, 0xffff);
+ vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
+ break;
+ }
+ }
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3633,8 +3742,18 @@ static void vmx_set_constant_host_state(void)
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
+#ifdef CONFIG_X86_64
+ /*
+ * Load null selectors, so we can avoid reloading them in
+ * __vmx_load_host_state(), in case userspace uses the null selectors
+ * too (the expected case).
+ */
+ vmcs_write16(HOST_DS_SELECTOR, 0);
+ vmcs_write16(HOST_ES_SELECTOR, 0);
+#else
vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
+#endif
vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
@@ -3693,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
if (!enable_ept) {
exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
enable_unrestricted_guest = 0;
+ /* Enable INVPCID for non-ept guests may cause performance regression. */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
}
if (!enable_unrestricted_guest)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -4451,7 +4572,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
break;
}
vcpu->run->exit_reason = 0;
- pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
+ vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
(int)(exit_qualification >> 4) & 3, cr);
return 0;
}
@@ -4731,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
gpa_t gpa;
+ u32 error_code;
int gla_validity;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4755,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+
+ /* It is a write fault? */
+ error_code = exit_qualification & (1U << 1);
+ /* ept page table is present? */
+ error_code |= (exit_qualification >> 3) & 0x1;
+
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
static u64 ept_rsvd_mask(u64 spte, int level)
@@ -4870,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
int ret = 1;
u32 cpu_exec_ctrl;
bool intr_window_requested;
+ unsigned count = 130;
cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
- while (!guest_state_valid(vcpu)) {
- if (intr_window_requested
- && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
+ while (!guest_state_valid(vcpu) && count-- != 0) {
+ if (intr_window_requested && vmx_interrupt_allowed(vcpu))
return handle_interrupt_window(&vmx->vcpu);
+ if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
+ return 1;
+
err = emulate_instruction(vcpu, 0);
if (err == EMULATE_DO_MMIO) {
@@ -4886,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
goto out;
}
- if (err != EMULATE_DONE)
+ if (err != EMULATE_DONE) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
return 0;
+ }
if (signal_pending(current))
goto out;
@@ -4895,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
schedule();
}
- vmx->emulation_required = 0;
+ vmx->emulation_required = !guest_state_valid(vcpu);
out:
return ret;
}
@@ -6256,7 +6391,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
}
}
- asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -6343,7 +6477,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
return &vmx->vcpu;
free_vmcs:
- free_vmcs(vmx->loaded_vmcs->vmcs);
+ free_loaded_vmcs(vmx->loaded_vmcs);
free_msrs:
kfree(vmx->guest_msrs);
uninit_vcpu:
@@ -6430,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
}
}
}
+
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ /* Exposing INVPCID only when PCID is exposed */
+ best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+ if (vmx_invpcid_supported() &&
+ best && (best->ecx & bit(X86_FEATURE_INVPCID)) &&
+ guest_cpuid_has_pcid(vcpu)) {
+ exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ exec_control);
+ } else {
+ exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+ exec_control);
+ if (best)
+ best->ecx &= ~bit(X86_FEATURE_INVPCID);
+ }
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -7164,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cpuid_update = vmx_cpuid_update,
.rdtscp_supported = vmx_rdtscp_supported,
+ .invpcid_supported = vmx_invpcid_supported,
.set_supported_cpuid = vmx_set_supported_cpuid,
@@ -7193,23 +7345,21 @@ static int __init vmx_init(void)
if (!vmx_io_bitmap_a)
return -ENOMEM;
+ r = -ENOMEM;
+
vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_b) {
- r = -ENOMEM;
+ if (!vmx_io_bitmap_b)
goto out;
- }
vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy) {
- r = -ENOMEM;
+ if (!vmx_msr_bitmap_legacy)
goto out1;
- }
+
vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode) {
- r = -ENOMEM;
+ if (!vmx_msr_bitmap_longmode)
goto out2;
- }
+
/*
* Allow direct access to the PC debug port (it is often used for I/O
@@ -7238,8 +7388,10 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
if (enable_ept) {
- kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
- VMX_EPT_EXECUTABLE_MASK);
+ kvm_mmu_set_mask_ptes(0ull,
+ (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+ (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
+ 0ull, VMX_EPT_EXECUTABLE_MASK);
ept_set_mmio_spte_mask();
kvm_enable_tdp();
} else
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 185a2b823a2d..59b59508ff07 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
return 1;
}
+ if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
+ return 1;
+
kvm_x86_ops->set_cr0(vcpu, cr0);
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
@@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
kvm_read_cr3(vcpu)))
return 1;
+ if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
+ if (!guest_cpuid_has_pcid(vcpu))
+ return 1;
+
+ /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ return 1;
+ }
+
if (kvm_x86_ops->set_cr4(vcpu, cr4))
return 1;
- if ((cr4 ^ old_cr4) & pdptr_bits)
+ if (((cr4 ^ old_cr4) & pdptr_bits) ||
+ (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
@@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS)
- return 1;
+ if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
+ if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
+ return 1;
+ } else
+ if (cr3 & CR3_L_MODE_RESERVED_BITS)
+ return 1;
} else {
if (is_pae(vcpu)) {
if (cr3 & CR3_PAE_RESERVED_BITS)
@@ -795,6 +812,7 @@ static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -1437,8 +1455,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break;
}
default:
- pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
- "data 0x%llx\n", msr, data);
+ vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
return 1;
}
return 0;
@@ -1470,8 +1488,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case HV_X64_MSR_TPR:
return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
default:
- pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
- "data 0x%llx\n", msr, data);
+ vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
+ "data 0x%llx\n", msr, data);
return 1;
}
@@ -1551,15 +1569,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
data &= ~(u64)0x100; /* ignore ignne emulation enable */
data &= ~(u64)0x8; /* ignore TLB cache disable */
if (data != 0) {
- pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
- data);
+ vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
+ data);
return 1;
}
break;
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
- pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
- "0x%llx\n", data);
+ vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
+ "0x%llx\n", data);
return 1;
}
break;
@@ -1574,8 +1592,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
thus reserved and should throw a #GP */
return 1;
}
- pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
break;
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
@@ -1653,6 +1671,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
break;
+ case MSR_KVM_PV_EOI_EN:
+ if (kvm_lapic_enable_pv_eoi(vcpu, data))
+ return 1;
+ break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
@@ -1671,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_K7_EVNTSEL2:
case MSR_K7_EVNTSEL3:
if (data != 0)
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
+ vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
break;
/* at least RHEL 4 unconditionally writes to the perfctr registers,
* so we ignore writes to make it happy.
@@ -1681,8 +1703,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_K7_PERFCTR1:
case MSR_K7_PERFCTR2:
case MSR_K7_PERFCTR3:
- pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
+ vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
break;
case MSR_P6_PERFCTR0:
case MSR_P6_PERFCTR1:
@@ -1693,8 +1715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return kvm_pmu_set_msr(vcpu, msr, data);
if (pr || data != 0)
- pr_unimpl(vcpu, "disabled perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
+ vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
+ "0x%x data 0x%llx\n", msr, data);
break;
case MSR_K7_CLK_CTL:
/*
@@ -1720,7 +1742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* Drop writes to this legacy MSR -- see rdmsr
* counterpart for further detail.
*/
- pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
+ vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has_osvw(vcpu))
@@ -1738,12 +1760,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (kvm_pmu_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr, data);
if (!ignore_msrs) {
- pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
- msr, data);
+ vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
+ msr, data);
return 1;
} else {
- pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
- msr, data);
+ vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
+ msr, data);
break;
}
}
@@ -1846,7 +1868,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data = kvm->arch.hv_hypercall;
break;
default:
- pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
}
@@ -1877,7 +1899,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data = vcpu->arch.hv_vapic;
break;
default:
- pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
}
*pdata = data;
@@ -2030,10 +2052,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
if (kvm_pmu_msr(vcpu, msr))
return kvm_pmu_get_msr(vcpu, msr, pdata);
if (!ignore_msrs) {
- pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+ vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
return 1;
} else {
- pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
+ vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
data = 0;
}
break;
@@ -2147,6 +2169,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_ASYNC_PF:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_PCI_2_3:
+ case KVM_CAP_KVMCLOCK_CTRL:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2597,6 +2620,23 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
return r;
}
+/*
+ * kvm_set_guest_paused() indicates to the guest kernel that it has been
+ * stopped by the hypervisor. This function will be called from the host only.
+ * EINVAL is returned when the host attempts to set the flag for a guest that
+ * does not support pv clocks.
+ */
+static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
+{
+ struct pvclock_vcpu_time_info *src = &vcpu->arch.hv_clock;
+ if (!vcpu->arch.time_page)
+ return -EINVAL;
+ src->flags |= PVCLOCK_GUEST_STOPPED;
+ mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ return 0;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -2873,6 +2913,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = vcpu->arch.virtual_tsc_khz;
goto out;
}
+ case KVM_KVMCLOCK_CTRL: {
+ r = kvm_set_guest_paused(vcpu);
+ goto out;
+ }
default:
r = -EINVAL;
}
@@ -3045,57 +3089,32 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
}
/**
- * write_protect_slot - write protect a slot for dirty logging
- * @kvm: the kvm instance
- * @memslot: the slot we protect
- * @dirty_bitmap: the bitmap indicating which pages are dirty
- * @nr_dirty_pages: the number of dirty pages
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
*
- * We have two ways to find all sptes to protect:
- * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
- * checks ones that have a spte mapping a page in the slot.
- * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
+ * We need to keep it in mind that VCPU threads can write to the bitmap
+ * concurrently. So, to avoid losing data, we keep the following order for
+ * each bit:
*
- * Generally speaking, if there are not so many dirty pages compared to the
- * number of shadow pages, we should use the latter.
+ * 1. Take a snapshot of the bit and clear it if needed.
+ * 2. Write protect the corresponding page.
+ * 3. Flush TLB's if needed.
+ * 4. Copy the snapshot to the userspace.
*
- * Note that letting others write into a page marked dirty in the old bitmap
- * by using the remaining tlb entry is not a problem. That page will become
- * write protected again when we flush the tlb and then be reported dirty to
- * the user space by copying the old bitmap.
- */
-static void write_protect_slot(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
- unsigned long *dirty_bitmap,
- unsigned long nr_dirty_pages)
-{
- spin_lock(&kvm->mmu_lock);
-
- /* Not many dirty pages compared to # of shadow pages. */
- if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
- unsigned long gfn_offset;
-
- for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
- unsigned long gfn = memslot->base_gfn + gfn_offset;
-
- kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
- }
- kvm_flush_remote_tlbs(kvm);
- } else
- kvm_mmu_slot_remove_write_access(kvm, memslot->id);
-
- spin_unlock(&kvm->mmu_lock);
-}
-
-/*
- * Get (and clear) the dirty memory log for a memory slot.
+ * Between 2 and 3, the guest may write to the page using the remaining TLB
+ * entry. This is not a problem because the page will be reported dirty at
+ * step 4 using the snapshot taken before and step 3 ensures that successive
+ * writes will be logged for the next call.
*/
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
- struct kvm_dirty_log *log)
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
int r;
struct kvm_memory_slot *memslot;
- unsigned long n, nr_dirty_pages;
+ unsigned long n, i;
+ unsigned long *dirty_bitmap;
+ unsigned long *dirty_bitmap_buffer;
+ bool is_dirty = false;
mutex_lock(&kvm->slots_lock);
@@ -3104,49 +3123,42 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
goto out;
memslot = id_to_memslot(kvm->memslots, log->slot);
+
+ dirty_bitmap = memslot->dirty_bitmap;
r = -ENOENT;
- if (!memslot->dirty_bitmap)
+ if (!dirty_bitmap)
goto out;
n = kvm_dirty_bitmap_bytes(memslot);
- nr_dirty_pages = memslot->nr_dirty_pages;
- /* If nothing is dirty, don't bother messing with page tables. */
- if (nr_dirty_pages) {
- struct kvm_memslots *slots, *old_slots;
- unsigned long *dirty_bitmap, *dirty_bitmap_head;
+ dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+ memset(dirty_bitmap_buffer, 0, n);
- dirty_bitmap = memslot->dirty_bitmap;
- dirty_bitmap_head = memslot->dirty_bitmap_head;
- if (dirty_bitmap == dirty_bitmap_head)
- dirty_bitmap_head += n / sizeof(long);
- memset(dirty_bitmap_head, 0, n);
+ spin_lock(&kvm->mmu_lock);
- r = -ENOMEM;
- slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
- if (!slots)
- goto out;
+ for (i = 0; i < n / sizeof(long); i++) {
+ unsigned long mask;
+ gfn_t offset;
- memslot = id_to_memslot(slots, log->slot);
- memslot->nr_dirty_pages = 0;
- memslot->dirty_bitmap = dirty_bitmap_head;
- update_memslots(slots, NULL);
+ if (!dirty_bitmap[i])
+ continue;
- old_slots = kvm->memslots;
- rcu_assign_pointer(kvm->memslots, slots);
- synchronize_srcu_expedited(&kvm->srcu);
- kfree(old_slots);
+ is_dirty = true;
- write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
+ mask = xchg(&dirty_bitmap[i], 0);
+ dirty_bitmap_buffer[i] = mask;
- r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
- goto out;
- } else {
- r = -EFAULT;
- if (clear_user(log->dirty_bitmap, n))
- goto out;
+ offset = i * BITS_PER_LONG;
+ kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
}
+ if (is_dirty)
+ kvm_flush_remote_tlbs(kvm);
+
+ spin_unlock(&kvm->mmu_lock);
+
+ r = -EFAULT;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+ goto out;
r = 0;
out:
@@ -3728,9 +3740,8 @@ struct read_write_emulator_ops {
static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
{
if (vcpu->mmio_read_completed) {
- memcpy(val, vcpu->mmio_data, bytes);
trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
- vcpu->mmio_phys_addr, *(u64 *)val);
+ vcpu->mmio_fragments[0].gpa, *(u64 *)val);
vcpu->mmio_read_completed = 0;
return 1;
}
@@ -3766,8 +3777,9 @@ static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
- memcpy(vcpu->mmio_data, val, bytes);
- memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+ struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
+
+ memcpy(vcpu->run->mmio.data, frag->data, frag->len);
return X86EMUL_CONTINUE;
}
@@ -3794,10 +3806,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
gpa_t gpa;
int handled, ret;
bool write = ops->write;
-
- if (ops->read_write_prepare &&
- ops->read_write_prepare(vcpu, val, bytes))
- return X86EMUL_CONTINUE;
+ struct kvm_mmio_fragment *frag;
ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@ -3823,15 +3832,19 @@ mmio:
bytes -= handled;
val += handled;
- vcpu->mmio_needed = 1;
- vcpu->run->exit_reason = KVM_EXIT_MMIO;
- vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
- vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
- vcpu->mmio_index = 0;
+ while (bytes) {
+ unsigned now = min(bytes, 8U);
- return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
+ frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
+ frag->gpa = gpa;
+ frag->data = val;
+ frag->len = now;
+
+ gpa += now;
+ val += now;
+ bytes -= now;
+ }
+ return X86EMUL_CONTINUE;
}
int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
@@ -3840,10 +3853,18 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
struct read_write_emulator_ops *ops)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ gpa_t gpa;
+ int rc;
+
+ if (ops->read_write_prepare &&
+ ops->read_write_prepare(vcpu, val, bytes))
+ return X86EMUL_CONTINUE;
+
+ vcpu->mmio_nr_fragments = 0;
/* Crossing a page boundary? */
if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
- int rc, now;
+ int now;
now = -addr & ~PAGE_MASK;
rc = emulator_read_write_onepage(addr, val, now, exception,
@@ -3856,8 +3877,25 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
bytes -= now;
}
- return emulator_read_write_onepage(addr, val, bytes, exception,
- vcpu, ops);
+ rc = emulator_read_write_onepage(addr, val, bytes, exception,
+ vcpu, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (!vcpu->mmio_nr_fragments)
+ return rc;
+
+ gpa = vcpu->mmio_fragments[0].gpa;
+
+ vcpu->mmio_needed = 1;
+ vcpu->mmio_cur_fragment = 0;
+
+ vcpu->run->mmio.len = vcpu->mmio_fragments[0].len;
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = gpa;
+
+ return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
}
static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -4100,7 +4138,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
value = kvm_get_cr8(vcpu);
break;
default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ kvm_err("%s: unexpected cr %u\n", __func__, cr);
return 0;
}
@@ -4129,7 +4167,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
res = kvm_set_cr8(vcpu, val);
break;
default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ kvm_err("%s: unexpected cr %u\n", __func__, cr);
res = -1;
}
@@ -4281,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
}
-static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
+static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
- struct kvm_cpuid_entry2 *cpuid = NULL;
-
- if (eax && ecx)
- cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt),
- *eax, *ecx);
-
- if (cpuid) {
- *eax = cpuid->eax;
- *ecx = cpuid->ecx;
- if (ebx)
- *ebx = cpuid->ebx;
- if (edx)
- *edx = cpuid->edx;
- return true;
- }
-
- return false;
+ kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
}
static struct x86_emulate_ops emulate_ops = {
@@ -5263,10 +5285,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_deliver_pmi(vcpu);
}
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- goto out;
-
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
inject_pending_event(vcpu);
@@ -5282,6 +5300,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
}
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r)) {
+ goto cancel_injection;
+ }
+
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -5304,9 +5327,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_wmb();
local_irq_enable();
preempt_enable();
- kvm_x86_ops->cancel_injection(vcpu);
r = 1;
- goto out;
+ goto cancel_injection;
}
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -5370,9 +5392,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (unlikely(vcpu->arch.tsc_always_catchup))
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- kvm_lapic_sync_from_vapic(vcpu);
+ if (vcpu->arch.apic_attention)
+ kvm_lapic_sync_from_vapic(vcpu);
r = kvm_x86_ops->handle_exit(vcpu);
+ return r;
+
+cancel_injection:
+ kvm_x86_ops->cancel_injection(vcpu);
+ if (unlikely(vcpu->arch.apic_attention))
+ kvm_lapic_sync_from_vapic(vcpu);
out:
return r;
}
@@ -5456,33 +5485,55 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
return r;
}
+/*
+ * Implements the following, as a state machine:
+ *
+ * read:
+ * for each fragment
+ * write gpa, len
+ * exit
+ * copy data
+ * execute insn
+ *
+ * write:
+ * for each fragment
+ * write gpa, len
+ * copy data
+ * exit
+ */
static int complete_mmio(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
+ struct kvm_mmio_fragment *frag;
int r;
if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
return 1;
if (vcpu->mmio_needed) {
- vcpu->mmio_needed = 0;
+ /* Complete previous fragment */
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
if (!vcpu->mmio_is_write)
- memcpy(vcpu->mmio_data + vcpu->mmio_index,
- run->mmio.data, 8);
- vcpu->mmio_index += 8;
- if (vcpu->mmio_index < vcpu->mmio_size) {
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
- memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
- run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
- run->mmio.is_write = vcpu->mmio_is_write;
- vcpu->mmio_needed = 1;
- return 0;
+ memcpy(frag->data, run->mmio.data, frag->len);
+ if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
+ vcpu->mmio_needed = 0;
+ if (vcpu->mmio_is_write)
+ return 1;
+ vcpu->mmio_read_completed = 1;
+ goto done;
}
+ /* Initiate next fragment */
+ ++frag;
+ run->exit_reason = KVM_EXIT_MMIO;
+ run->mmio.phys_addr = frag->gpa;
if (vcpu->mmio_is_write)
- return 1;
- vcpu->mmio_read_completed = 1;
+ memcpy(run->mmio.data, frag->data, frag->len);
+ run->mmio.len = frag->len;
+ run->mmio.is_write = vcpu->mmio_is_write;
+ return 0;
+
}
+done:
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
@@ -6264,7 +6315,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
- vfree(free->arch.lpage_info[i]);
+ kvm_kvfree(free->arch.lpage_info[i]);
free->arch.lpage_info[i] = NULL;
}
}
@@ -6283,7 +6334,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
slot->base_gfn, level) + 1;
slot->arch.lpage_info[i] =
- vzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
+ kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
if (!slot->arch.lpage_info[i])
goto out_free;
@@ -6310,7 +6361,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
out_free:
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
- vfree(slot->arch.lpage_info[i]);
+ kvm_kvfree(slot->arch.lpage_info[i]);
slot->arch.lpage_info[i] = NULL;
}
return -ENOMEM;
@@ -6399,21 +6450,9 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
kvm_cpu_has_interrupt(vcpu));
}
-void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
- int me;
- int cpu = vcpu->cpu;
-
- if (waitqueue_active(&vcpu->wq)) {
- wake_up_interruptible(&vcpu->wq);
- ++vcpu->stat.halt_wakeup;
- }
-
- me = get_cpu();
- if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
- if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
- smp_send_reschedule(cpu);
- put_cpu();
+ return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cb80c293cdd8..3d1134ddb885 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -64,7 +64,7 @@ static inline int is_pse(struct kvm_vcpu *vcpu)
static inline int is_paging(struct kvm_vcpu *vcpu)
{
- return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
+ return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
}
static inline u32 bit(int bitno)
OpenPOWER on IntegriCloud