summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/emulate.c749
-rw-r--r--arch/x86/kvm/i8254.c146
-rw-r--r--arch/x86/kvm/i8254.h4
-rw-r--r--arch/x86/kvm/i8259.c48
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/irq.h4
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h8
-rw-r--r--arch/x86/kvm/lapic.c17
-rw-r--r--arch/x86/kvm/mmu.c807
-rw-r--r--arch/x86/kvm/mmutrace.h2
-rw-r--r--arch/x86/kvm/paging_tmpl.h252
-rw-r--r--arch/x86/kvm/svm.c147
-rw-r--r--arch/x86/kvm/timer.c16
-rw-r--r--arch/x86/kvm/vmx.c261
-rw-r--r--arch/x86/kvm/x86.c1176
-rw-r--r--arch/x86/kvm/x86.h7
16 files changed, 2173 insertions, 1473 deletions
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5ac0bb465ed6..b38bd8b92aa6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,6 +9,7 @@
* privileged instructions:
*
* Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +68,9 @@
#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */
#define SrcImmU (9<<4) /* Immediate operand, unsigned */
#define SrcSI (0xa<<4) /* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */
+#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */
+#define SrcAcc (0xd<<4) /* Source Accumulator */
#define SrcMask (0xf<<4)
/* Generic ModRM decode. */
#define ModRM (1<<8)
@@ -88,10 +92,6 @@
#define Src2CL (1<<29)
#define Src2ImmByte (2<<29)
#define Src2One (3<<29)
-#define Src2Imm16 (4<<29)
-#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be
- in memory and second argument is located
- immediately after the first one in memory. */
#define Src2Mask (7<<29)
enum {
@@ -124,15 +124,15 @@ static u32 opcode_table[256] = {
/* 0x20 - 0x27 */
ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
+ ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x28 - 0x2F */
ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x30 - 0x37 */
ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
/* 0x38 - 0x3F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -170,20 +170,20 @@ static u32 opcode_table[256] = {
/* 0x88 - 0x8F */
ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
- DstReg | SrcMem | ModRM | Mov, Group | Group1A,
+ DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
+ ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
/* 0x90 - 0x97 */
DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
/* 0x98 - 0x9F */
- 0, 0, SrcImm | Src2Imm16 | No64, 0,
+ 0, 0, SrcImmFAddr | No64, 0,
ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
/* 0xA0 - 0xA7 */
- ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
- ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+ ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
+ ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
/* 0xA8 - 0xAF */
- 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
+ DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
ByteOp | DstDI | String, DstDI | String,
/* 0xB0 - 0xB7 */
@@ -215,7 +215,7 @@ static u32 opcode_table[256] = {
ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
/* 0xE8 - 0xEF */
SrcImm | Stack, SrcImm | ImplicitOps,
- SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
+ SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
/* 0xF0 - 0xF7 */
@@ -337,20 +337,20 @@ static u32 group_table[] = {
[Group1A*8] =
DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
[Group3_Byte*8] =
- ByteOp | SrcImm | DstMem | ModRM, 0,
+ ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
0, 0, 0, 0,
[Group3*8] =
- DstMem | SrcImm | ModRM, 0,
+ DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
0, 0, 0, 0,
[Group4*8] =
- ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
0, 0, 0, 0, 0, 0,
[Group5*8] =
- DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
SrcMem | ModRM | Stack, 0,
- SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
+ SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
SrcMem | ModRM | Stack, 0,
[Group7*8] =
0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
@@ -576,6 +576,13 @@ static u32 group2_table[] = {
(_type)_x; \
})
+#define insn_fetch_arr(_arr, _size, _eip) \
+({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
+ if (rc != X86EMUL_CONTINUE) \
+ goto done; \
+ (_eip) += (_size); \
+})
+
static inline unsigned long ad_mask(struct decode_cache *c)
{
return (1UL << (c->ad_bytes << 3)) - 1;
@@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg)
c->seg_override = seg;
}
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
{
if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
return 0;
- return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
+ return ops->get_cached_segment_base(seg, ctxt->vcpu);
}
static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
struct decode_cache *c)
{
if (!c->has_seg_override)
return 0;
- return seg_base(ctxt, c->seg_override);
+ return seg_base(ctxt, ops, c->seg_override);
+}
+
+static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ return seg_base(ctxt, ops, VCPU_SREG_ES);
+}
+
+static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ return seg_base(ctxt, ops, VCPU_SREG_SS);
+}
+
+static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
+{
+ ctxt->exception = vec;
+ ctxt->error_code = error;
+ ctxt->error_code_valid = valid;
+ ctxt->restart = false;
+}
+
+static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+ emulate_exception(ctxt, GP_VECTOR, err, true);
}
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
+static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+ int err)
{
- return seg_base(ctxt, VCPU_SREG_ES);
+ ctxt->cr2 = addr;
+ emulate_exception(ctxt, PF_VECTOR, err, true);
}
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
+static void emulate_ud(struct x86_emulate_ctxt *ctxt)
{
- return seg_base(ctxt, VCPU_SREG_SS);
+ emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+ emulate_exception(ctxt, TS_VECTOR, err, true);
}
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* we cannot decode insn before we complete previous rep insn */
WARN_ON(ctxt->restart);
- /* Shadow copy of register state. Committed on successful emulation. */
- memset(c, 0, sizeof(struct decode_cache));
c->eip = ctxt->eip;
c->fetch.start = c->fetch.end = c->eip;
- ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
switch (mode) {
case X86EMUL_MODE_REAL:
@@ -1060,7 +1099,7 @@ done_prefixes:
set_seg_override(c, VCPU_SREG_DS);
if (!(!c->twobyte && c->b == 0x8d))
- c->modrm_ea += seg_override_base(ctxt, c);
+ c->modrm_ea += seg_override_base(ctxt, ops, c);
if (c->ad_bytes != 8)
c->modrm_ea = (u32)c->modrm_ea;
@@ -1148,6 +1187,25 @@ done_prefixes:
else
c->src.val = insn_fetch(u8, 1, c->eip);
break;
+ case SrcAcc:
+ c->src.type = OP_REG;
+ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+ c->src.ptr = &c->regs[VCPU_REGS_RAX];
+ switch (c->src.bytes) {
+ case 1:
+ c->src.val = *(u8 *)c->src.ptr;
+ break;
+ case 2:
+ c->src.val = *(u16 *)c->src.ptr;
+ break;
+ case 4:
+ c->src.val = *(u32 *)c->src.ptr;
+ break;
+ case 8:
+ c->src.val = *(u64 *)c->src.ptr;
+ break;
+ }
+ break;
case SrcOne:
c->src.bytes = 1;
c->src.val = 1;
@@ -1156,10 +1214,21 @@ done_prefixes:
c->src.type = OP_MEM;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->src.ptr = (unsigned long *)
- register_address(c, seg_override_base(ctxt, c),
+ register_address(c, seg_override_base(ctxt, ops, c),
c->regs[VCPU_REGS_RSI]);
c->src.val = 0;
break;
+ case SrcImmFAddr:
+ c->src.type = OP_IMM;
+ c->src.ptr = (unsigned long *)c->eip;
+ c->src.bytes = c->op_bytes + 2;
+ insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+ break;
+ case SrcMemFAddr:
+ c->src.type = OP_MEM;
+ c->src.ptr = (unsigned long *)c->modrm_ea;
+ c->src.bytes = c->op_bytes + 2;
+ break;
}
/*
@@ -1179,22 +1248,10 @@ done_prefixes:
c->src2.bytes = 1;
c->src2.val = insn_fetch(u8, 1, c->eip);
break;
- case Src2Imm16:
- c->src2.type = OP_IMM;
- c->src2.ptr = (unsigned long *)c->eip;
- c->src2.bytes = 2;
- c->src2.val = insn_fetch(u16, 2, c->eip);
- break;
case Src2One:
c->src2.bytes = 1;
c->src2.val = 1;
break;
- case Src2Mem16:
- c->src2.type = OP_MEM;
- c->src2.bytes = 2;
- c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
- c->src2.val = 0;
- break;
}
/* Decode and fetch the destination operand: register or memory. */
@@ -1253,7 +1310,7 @@ done_prefixes:
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)
- register_address(c, es_base(ctxt),
+ register_address(c, es_base(ctxt, ops),
c->regs[VCPU_REGS_RDI]);
c->dst.val = 0;
break;
@@ -1263,6 +1320,37 @@ done:
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ unsigned long addr, void *dest, unsigned size)
+{
+ int rc;
+ struct read_cache *mc = &ctxt->decode.mem_read;
+ u32 err;
+
+ while (size) {
+ int n = min(size, 8u);
+ size -= n;
+ if (mc->pos < mc->end)
+ goto read_cached;
+
+ rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
+ ctxt->vcpu);
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ emulate_pf(ctxt, addr, err);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ mc->end += n;
+
+ read_cached:
+ memcpy(dest, mc->data + mc->pos, n);
+ mc->pos += n;
+ dest += n;
+ addr += n;
+ }
+ return X86EMUL_CONTINUE;
+}
+
static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
unsigned int size, unsigned short port,
@@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
if (dt.size < index * 8 + 7) {
- kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+ emulate_gp(ctxt, selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
addr = dt.address + index * 8;
ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(ctxt->vcpu, addr, err);
+ emulate_pf(ctxt, addr, err);
return ret;
}
@@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
if (dt.size < index * 8 + 7) {
- kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+ emulate_gp(ctxt, selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
addr = dt.address + index * 8;
ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT)
- kvm_inject_page_fault(ctxt->vcpu, addr, err);
+ emulate_pf(ctxt, addr, err);
return ret;
}
@@ -1481,11 +1569,70 @@ load:
ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
return X86EMUL_CONTINUE;
exception:
- kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+ emulate_exception(ctxt, err_vec, err_code, true);
return X86EMUL_PROPAGATE_FAULT;
}
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ int rc;
+ struct decode_cache *c = &ctxt->decode;
+ u32 err;
+
+ switch (c->dst.type) {
+ case OP_REG:
+ /* The 4-byte case *is* correct:
+ * in 64-bit mode we zero-extend.
+ */
+ switch (c->dst.bytes) {
+ case 1:
+ *(u8 *)c->dst.ptr = (u8)c->dst.val;
+ break;
+ case 2:
+ *(u16 *)c->dst.ptr = (u16)c->dst.val;
+ break;
+ case 4:
+ *c->dst.ptr = (u32)c->dst.val;
+ break; /* 64b: zero-ext */
+ case 8:
+ *c->dst.ptr = c->dst.val;
+ break;
+ }
+ break;
+ case OP_MEM:
+ if (c->lock_prefix)
+ rc = ops->cmpxchg_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.orig_val,
+ &c->dst.val,
+ c->dst.bytes,
+ &err,
+ ctxt->vcpu);
+ else
+ rc = ops->write_emulated(
+ (unsigned long)c->dst.ptr,
+ &c->dst.val,
+ c->dst.bytes,
+ &err,
+ ctxt->vcpu);
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ emulate_pf(ctxt,
+ (unsigned long)c->dst.ptr, err);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ break;
+ case OP_NONE:
+ /* no writeback */
+ break;
+ default:
+ break;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
@@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
+ c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
c->regs[VCPU_REGS_RSP]);
}
@@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int rc;
- rc = ops->read_emulated(register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]),
- dest, len, ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
+ c->regs[VCPU_REGS_RSP]),
+ dest, len);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
break;
case X86EMUL_MODE_VM86:
if (iopl < 3) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
change_mask |= EFLG_IF;
@@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops, int seg)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment segment;
- kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
+ c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
- c->src.val = segment.selector;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
}
static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
+static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+ int rc = X86EMUL_CONTINUE;
int reg = VCPU_REGS_RAX;
while (reg <= VCPU_REGS_RDI) {
(reg == VCPU_REGS_RSP) ?
(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
+
+ rc = writeback(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
++reg;
}
+
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
+
+ return rc;
}
static int emulate_popa(struct x86_emulate_ctxt *ctxt,
@@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
old_eip = c->eip;
c->eip = c->src.val;
c->src.val = old_eip;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
}
case 4: /* jmp abs */
c->eip = c->src.val;
break;
case 6: /* push */
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
}
return X86EMUL_CONTINUE;
@@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
return rc;
}
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- int rc;
- struct decode_cache *c = &ctxt->decode;
-
- switch (c->dst.type) {
- case OP_REG:
- /* The 4-byte case *is* correct:
- * in 64-bit mode we zero-extend.
- */
- switch (c->dst.bytes) {
- case 1:
- *(u8 *)c->dst.ptr = (u8)c->dst.val;
- break;
- case 2:
- *(u16 *)c->dst.ptr = (u16)c->dst.val;
- break;
- case 4:
- *c->dst.ptr = (u32)c->dst.val;
- break; /* 64b: zero-ext */
- case 8:
- *c->dst.ptr = c->dst.val;
- break;
- }
- break;
- case OP_MEM:
- if (c->lock_prefix)
- rc = ops->cmpxchg_emulated(
- (unsigned long)c->dst.ptr,
- &c->dst.orig_val,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- else
- rc = ops->write_emulated(
- (unsigned long)c->dst.ptr,
- &c->dst.val,
- c->dst.bytes,
- ctxt->vcpu);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- break;
- case OP_NONE:
- /* no writeback */
- break;
- default:
- break;
- }
- return X86EMUL_CONTINUE;
-}
-
-static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
-{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
- /*
- * an sti; sti; sequence only disable interrupts for the first
- * instruction. So, if the last instruction, be it emulated or
- * not, left the system with the INT_STI flag enabled, it
- * means that the last instruction is an sti. We should not
- * leave the flag on in this case. The same goes for mov ss
- */
- if (!(int_shadow & mask))
- ctxt->interruptibility = mask;
-}
-
static inline void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
- struct kvm_segment *cs, struct kvm_segment *ss)
+ struct x86_emulate_ops *ops, struct desc_struct *cs,
+ struct desc_struct *ss)
{
- memset(cs, 0, sizeof(struct kvm_segment));
- kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
- memset(ss, 0, sizeof(struct kvm_segment));
+ memset(cs, 0, sizeof(struct desc_struct));
+ ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
+ memset(ss, 0, sizeof(struct desc_struct));
cs->l = 0; /* will be adjusted later */
- cs->base = 0; /* flat segment */
+ set_desc_base(cs, 0); /* flat segment */
cs->g = 1; /* 4kb granularity */
- cs->limit = 0xffffffff; /* 4GB limit */
+ set_desc_limit(cs, 0xfffff); /* 4GB limit */
cs->type = 0x0b; /* Read, Execute, Accessed */
cs->s = 1;
cs->dpl = 0; /* will be adjusted later */
- cs->present = 1;
- cs->db = 1;
+ cs->p = 1;
+ cs->d = 1;
- ss->unusable = 0;
- ss->base = 0; /* flat segment */
- ss->limit = 0xffffffff; /* 4GB limit */
+ set_desc_base(ss, 0); /* flat segment */
+ set_desc_limit(ss, 0xfffff); /* 4GB limit */
ss->g = 1; /* 4kb granularity */
ss->s = 1;
ss->type = 0x03; /* Read/Write, Accessed */
- ss->db = 1; /* 32bit stack segment */
+ ss->d = 1; /* 32bit stack segment */
ss->dpl = 0;
- ss->present = 1;
+ ss->p = 1;
}
static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt)
+emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ struct desc_struct cs, ss;
u64 msr_data;
+ u16 cs_sel, ss_sel;
/* syscall is not available in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
return X86EMUL_PROPAGATE_FAULT;
}
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
msr_data >>= 32;
- cs.selector = (u16)(msr_data & 0xfffc);
- ss.selector = (u16)(msr_data + 8);
+ cs_sel = (u16)(msr_data & 0xfffc);
+ ss_sel = (u16)(msr_data + 8);
if (is_long_mode(ctxt->vcpu)) {
- cs.db = 0;
+ cs.d = 0;
cs.l = 1;
}
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
c->regs[VCPU_REGS_RCX] = c->eip;
if (is_long_mode(ctxt->vcpu)) {
#ifdef CONFIG_X86_64
c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
- kvm_x86_ops->get_msr(ctxt->vcpu,
- ctxt->mode == X86EMUL_MODE_PROT64 ?
- MSR_LSTAR : MSR_CSTAR, &msr_data);
+ ops->get_msr(ctxt->vcpu,
+ ctxt->mode == X86EMUL_MODE_PROT64 ?
+ MSR_LSTAR : MSR_CSTAR, &msr_data);
c->eip = msr_data;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
ctxt->eflags &= ~(msr_data | EFLG_RF);
#endif
} else {
/* legacy mode */
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
c->eip = (u32)msr_data;
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
}
static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ struct desc_struct cs, ss;
u64 msr_data;
+ u16 cs_sel, ss_sel;
/* inject #GP if in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
@@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
* Therefore, we inject an #UD.
*/
if (ctxt->mode == X86EMUL_MODE_PROT64) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
return X86EMUL_PROPAGATE_FAULT;
}
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (ctxt->mode) {
case X86EMUL_MODE_PROT32:
if ((msr_data & 0xfffc) == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
break;
case X86EMUL_MODE_PROT64:
if (msr_data == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
break;
}
ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
- cs.selector = (u16)msr_data;
- cs.selector &= ~SELECTOR_RPL_MASK;
- ss.selector = cs.selector + 8;
- ss.selector &= ~SELECTOR_RPL_MASK;
+ cs_sel = (u16)msr_data;
+ cs_sel &= ~SELECTOR_RPL_MASK;
+ ss_sel = cs_sel + 8;
+ ss_sel &= ~SELECTOR_RPL_MASK;
if (ctxt->mode == X86EMUL_MODE_PROT64
|| is_long_mode(ctxt->vcpu)) {
- cs.db = 0;
+ cs.d = 0;
cs.l = 1;
}
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
c->eip = msr_data;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
c->regs[VCPU_REGS_RSP] = msr_data;
return X86EMUL_CONTINUE;
}
static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- struct kvm_segment cs, ss;
+ struct desc_struct cs, ss;
u64 msr_data;
int usermode;
+ u16 cs_sel, ss_sel;
/* inject #GP if in real mode or Virtual 8086 mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
ctxt->mode == X86EMUL_MODE_VM86) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
- setup_syscalls_segments(ctxt, &cs, &ss);
+ setup_syscalls_segments(ctxt, ops, &cs, &ss);
if ((c->rex_prefix & 0x8) != 0x0)
usermode = X86EMUL_MODE_PROT64;
@@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
cs.dpl = 3;
ss.dpl = 3;
- kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+ ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (usermode) {
case X86EMUL_MODE_PROT32:
- cs.selector = (u16)(msr_data + 16);
+ cs_sel = (u16)(msr_data + 16);
if ((msr_data & 0xfffc) == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
- ss.selector = (u16)(msr_data + 24);
+ ss_sel = (u16)(msr_data + 24);
break;
case X86EMUL_MODE_PROT64:
- cs.selector = (u16)(msr_data + 32);
+ cs_sel = (u16)(msr_data + 32);
if (msr_data == 0x0) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
- ss.selector = cs.selector + 8;
- cs.db = 0;
+ ss_sel = cs_sel + 8;
+ cs.d = 0;
cs.l = 1;
break;
}
- cs.selector |= SELECTOR_RPL_MASK;
- ss.selector |= SELECTOR_RPL_MASK;
+ cs_sel |= SELECTOR_RPL_MASK;
+ ss_sel |= SELECTOR_RPL_MASK;
- kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
- kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+ ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+ ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+ ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
- c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
- c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+ c->eip = c->regs[VCPU_REGS_RDX];
+ c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
return X86EMUL_CONTINUE;
}
@@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
u16 port, u16 len)
{
- struct kvm_segment tr_seg;
+ struct desc_struct tr_seg;
int r;
u16 io_bitmap_ptr;
u8 perm, bit_idx = port & 0x7;
unsigned mask = (1 << len) - 1;
- kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
- if (tr_seg.unusable)
+ ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
+ if (!tr_seg.p)
return false;
- if (tr_seg.limit < 103)
+ if (desc_limit_scaled(&tr_seg) < 103)
return false;
- r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
- NULL);
+ r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
+ ctxt->vcpu, NULL);
if (r != X86EMUL_CONTINUE)
return false;
- if (io_bitmap_ptr + port/8 > tr_seg.limit)
+ if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
return false;
- r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
- ctxt->vcpu, NULL);
+ r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
+ &perm, 1, ctxt->vcpu, NULL);
if (r != X86EMUL_CONTINUE)
return false;
if ((perm >> bit_idx) & mask)
@@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
return true;
}
-static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- int seg)
-{
- struct desc_struct desc;
- if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
- return get_desc_base(&desc);
- else
- return ~0;
-}
-
static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
struct tss_segment_16 *tss)
@@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt, old_tss_base, err);
return ret;
}
@@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt, old_tss_base, err);
return ret;
}
@@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt, new_tss_base, err);
return ret;
}
@@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt, new_tss_base, err);
return ret;
}
}
@@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int ret;
- ops->set_cr(3, tss->cr3, ctxt->vcpu);
+ if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
+ emulate_gp(ctxt, 0);
+ return X86EMUL_PROPAGATE_FAULT;
+ }
c->eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt, old_tss_base, err);
return ret;
}
@@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+ emulate_pf(ctxt, old_tss_base, err);
return ret;
}
@@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
&err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt, new_tss_base, err);
return ret;
}
@@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ctxt->vcpu, &err);
if (ret == X86EMUL_PROPAGATE_FAULT) {
/* FIXME: need to provide precise fault address */
- kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+ emulate_pf(ctxt, new_tss_base, err);
return ret;
}
}
@@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
int ret;
u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
ulong old_tss_base =
- get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+ ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
u32 desc_limit;
/* FIXME: old_tss_base == ~0 ? */
@@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
if ((tss_selector & 3) > next_tss_desc.dpl ||
ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
return X86EMUL_PROPAGATE_FAULT;
}
}
@@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (!next_tss_desc.p ||
((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
desc_limit < 0x2b)) {
- kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
- tss_selector & 0xfffc);
+ emulate_ts(ctxt, tss_selector & 0xfffc);
return X86EMUL_PROPAGATE_FAULT;
}
@@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
c->lock_prefix = 0;
c->src.val = (unsigned long) error_code;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
}
return ret;
@@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int rc;
- memset(c, 0, sizeof(struct decode_cache));
c->eip = ctxt->eip;
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
c->dst.type = OP_NONE;
rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
has_error_code, error_code);
if (rc == X86EMUL_CONTINUE) {
- memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(ctxt->vcpu, c->eip);
rc = writeback(ctxt, ops);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->eip = c->eip;
}
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
int rc = X86EMUL_CONTINUE;
int saved_dst_type = c->dst.type;
- ctxt->interruptibility = 0;
-
- /* Shadow copy of register state. Committed on successful emulation.
- * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
- * modify them.
- */
-
- memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+ ctxt->decode.mem_read.pos = 0;
if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
/* LOCK prefix is allowed only with some instructions */
if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
/* Privileged instruction can be executed only in CPL=0 */
if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
@@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
string_done:
ctxt->restart = false;
- kvm_rip_write(ctxt->vcpu, c->eip);
+ ctxt->eip = c->eip;
goto done;
}
/* The second termination condition only applies for REPE
@@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
}
if (c->src.type == OP_MEM) {
- rc = ops->read_emulated((unsigned long)c->src.ptr,
- &c->src.val,
- c->src.bytes,
- ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
+ c->src.valptr, c->src.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
c->src.orig_val = c->src.val;
}
if (c->src2.type == OP_MEM) {
- rc = ops->read_emulated((unsigned long)c->src2.ptr,
- &c->src2.val,
- c->src2.bytes,
- ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
+ &c->src2.val, c->src2.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
}
@@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
/* optimisation - avoid slow emulated read if Mov */
- rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
- c->dst.bytes, ctxt->vcpu);
+ rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
+ &c->dst.val, c->dst.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
}
@@ -2571,7 +2650,7 @@ special_insn:
emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
break;
case 0x06: /* push es */
- emulate_push_sreg(ctxt, VCPU_SREG_ES);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
break;
case 0x07: /* pop es */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
@@ -2583,14 +2662,14 @@ special_insn:
emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
break;
case 0x0e: /* push cs */
- emulate_push_sreg(ctxt, VCPU_SREG_CS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
break;
case 0x10 ... 0x15:
adc: /* adc */
emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
break;
case 0x16: /* push ss */
- emulate_push_sreg(ctxt, VCPU_SREG_SS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
break;
case 0x17: /* pop ss */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
@@ -2602,7 +2681,7 @@ special_insn:
emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
break;
case 0x1e: /* push ds */
- emulate_push_sreg(ctxt, VCPU_SREG_DS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
break;
case 0x1f: /* pop ds */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
@@ -2632,7 +2711,7 @@ special_insn:
emulate_1op("dec", c->dst, ctxt->eflags);
break;
case 0x50 ... 0x57: /* push reg */
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
@@ -2641,7 +2720,9 @@ special_insn:
goto done;
break;
case 0x60: /* pusha */
- emulate_pusha(ctxt);
+ rc = emulate_pusha(ctxt, ops);
+ if (rc != X86EMUL_CONTINUE)
+ goto done;
break;
case 0x61: /* popa */
rc = emulate_popa(ctxt, ops);
@@ -2655,14 +2736,14 @@ special_insn:
break;
case 0x68: /* push imm */
case 0x6a: /* push imm8 */
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
case 0x6c: /* insb */
case 0x6d: /* insw/insd */
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
@@ -2674,7 +2755,7 @@ special_insn:
c->src.bytes = min(c->src.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
c->src.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
@@ -2707,6 +2788,7 @@ special_insn:
}
break;
case 0x84 ... 0x85:
+ test:
emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
break;
case 0x86 ... 0x87: /* xchg */
@@ -2735,18 +2817,13 @@ special_insn:
break;
case 0x88 ... 0x8b: /* mov */
goto mov;
- case 0x8c: { /* mov r/m, sreg */
- struct kvm_segment segreg;
-
- if (c->modrm_reg <= VCPU_SREG_GS)
- kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
- else {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ case 0x8c: /* mov r/m, sreg */
+ if (c->modrm_reg > VCPU_SREG_GS) {
+ emulate_ud(ctxt);
goto done;
}
- c->dst.val = segreg.selector;
+ c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
break;
- }
case 0x8d: /* lea r16/r32, m */
c->dst.val = c->modrm_ea;
break;
@@ -2757,12 +2834,12 @@ special_insn:
if (c->modrm_reg == VCPU_SREG_CS ||
c->modrm_reg > VCPU_SREG_GS) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
if (c->modrm_reg == VCPU_SREG_SS)
- toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
@@ -2775,19 +2852,19 @@ special_insn:
goto done;
break;
case 0x90: /* nop / xchg r8,rax */
- if (!(c->rex_prefix & 1)) { /* nop */
- c->dst.type = OP_NONE;
+ if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
+ c->dst.type = OP_NONE; /* nop */
break;
}
case 0x91 ... 0x97: /* xchg reg,rax */
- c->src.type = c->dst.type = OP_REG;
- c->src.bytes = c->dst.bytes = c->op_bytes;
+ c->src.type = OP_REG;
+ c->src.bytes = c->op_bytes;
c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
c->src.val = *(c->src.ptr);
goto xchg;
case 0x9c: /* pushf */
c->src.val = (unsigned long) ctxt->eflags;
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
case 0x9d: /* popf */
c->dst.type = OP_REG;
@@ -2797,19 +2874,15 @@ special_insn:
if (rc != X86EMUL_CONTINUE)
goto done;
break;
- case 0xa0 ... 0xa1: /* mov */
- c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- c->dst.val = c->src.val;
- break;
- case 0xa2 ... 0xa3: /* mov */
- c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
- break;
+ case 0xa0 ... 0xa3: /* mov */
case 0xa4 ... 0xa5: /* movs */
goto mov;
case 0xa6 ... 0xa7: /* cmps */
c->dst.type = OP_NONE; /* Disable writeback. */
DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
goto cmp;
+ case 0xa8 ... 0xa9: /* test ax, imm */
+ goto test;
case 0xaa ... 0xab: /* stos */
c->dst.val = c->regs[VCPU_REGS_RAX];
break;
@@ -2855,19 +2928,23 @@ special_insn:
long int rel = c->src.val;
c->src.val = (unsigned long) c->eip;
jmp_rel(c, rel);
- emulate_push(ctxt);
+ emulate_push(ctxt, ops);
break;
}
case 0xe9: /* jmp rel */
goto jmp;
- case 0xea: /* jmp far */
+ case 0xea: { /* jmp far */
+ unsigned short sel;
jump_far:
- if (load_segment_descriptor(ctxt, ops, c->src2.val,
- VCPU_SREG_CS))
+ memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+ if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
goto done;
- c->eip = c->src.val;
+ c->eip = 0;
+ memcpy(&c->eip, c->src.valptr, c->op_bytes);
break;
+ }
case 0xeb:
jmp: /* jmp rel short */
jmp_rel(c, c->src.val);
@@ -2879,20 +2956,20 @@ special_insn:
do_io_in:
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
&c->dst.val))
goto done; /* IO is needed */
break;
- case 0xee: /* out al,dx */
- case 0xef: /* out (e/r)ax,dx */
+ case 0xee: /* out dx,al */
+ case 0xef: /* out dx,(e/r)ax */
c->src.val = c->regs[VCPU_REGS_RDX];
do_io_out:
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ emulate_gp(ctxt, 0);
goto done;
}
ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
@@ -2916,18 +2993,20 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xfa: /* cli */
- if (emulator_bad_iopl(ctxt, ops))
- kvm_inject_gp(ctxt->vcpu, 0);
- else {
+ if (emulator_bad_iopl(ctxt, ops)) {
+ emulate_gp(ctxt, 0);
+ goto done;
+ } else {
ctxt->eflags &= ~X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
}
break;
case 0xfb: /* sti */
- if (emulator_bad_iopl(ctxt, ops))
- kvm_inject_gp(ctxt->vcpu, 0);
- else {
- toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
+ if (emulator_bad_iopl(ctxt, ops)) {
+ emulate_gp(ctxt, 0);
+ goto done;
+ } else {
+ ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
ctxt->eflags |= X86_EFLAGS_IF;
c->dst.type = OP_NONE; /* Disable writeback. */
}
@@ -2964,11 +3043,12 @@ writeback:
c->dst.type = saved_dst_type;
if ((c->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
- &c->src);
+ string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+ VCPU_REGS_RSI, &c->src);
if ((c->d & DstMask) == DstDI)
- string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+ string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+ &c->dst);
if (c->rep_prefix && (c->d & String)) {
struct read_cache *rc = &ctxt->decode.io_read;
@@ -2981,11 +3061,12 @@ writeback:
(rc->end != 0 && rc->end == rc->pos))
ctxt->restart = false;
}
-
- /* Commit shadow register state. */
- memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
- kvm_rip_write(ctxt->vcpu, c->eip);
- ops->set_rflags(ctxt->vcpu, ctxt->eflags);
+ /*
+ * reset read cache here in case string instruction is restared
+ * without decoding
+ */
+ ctxt->decode.mem_read.end = 0;
+ ctxt->eip = c->eip;
done:
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -3051,7 +3132,7 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 5: /* not defined */
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
case 7: /* invlpg*/
emulate_invlpg(ctxt->vcpu, c->modrm_ea);
@@ -3063,7 +3144,7 @@ twobyte_insn:
}
break;
case 0x05: /* syscall */
- rc = emulate_syscall(ctxt);
+ rc = emulate_syscall(ctxt, ops);
if (rc != X86EMUL_CONTINUE)
goto done;
else
@@ -3073,8 +3154,11 @@ twobyte_insn:
emulate_clts(ctxt->vcpu);
c->dst.type = OP_NONE;
break;
- case 0x08: /* invd */
case 0x09: /* wbinvd */
+ kvm_emulate_wbinvd(ctxt->vcpu);
+ c->dst.type = OP_NONE;
+ break;
+ case 0x08: /* invd */
case 0x0d: /* GrpP (prefetch) */
case 0x18: /* Grp16 (prefetch/nop) */
c->dst.type = OP_NONE;
@@ -3084,7 +3168,7 @@ twobyte_insn:
case 1:
case 5 ... 7:
case 9 ... 15:
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3093,31 +3177,42 @@ twobyte_insn:
case 0x21: /* mov from dr to reg */
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
goto done;
}
- emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+ ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x22: /* mov reg, cr */
- ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
+ if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
+ emulate_gp(ctxt, 0);
+ goto done;
+ }
c->dst.type = OP_NONE;
break;
case 0x23: /* mov from reg to dr */
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
- kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+ emulate_ud(ctxt);
+ goto done;
+ }
+
+ if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
+ ((ctxt->mode == X86EMUL_MODE_PROT64) ?
+ ~0ULL : ~0U), ctxt->vcpu) < 0) {
+ /* #UD condition is already handled by the code above */
+ emulate_gp(ctxt, 0);
goto done;
}
- emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
+
c->dst.type = OP_NONE; /* no writeback */
break;
case 0x30:
/* wrmsr */
msr_data = (u32)c->regs[VCPU_REGS_RAX]
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
- if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
+ emulate_gp(ctxt, 0);
goto done;
}
rc = X86EMUL_CONTINUE;
@@ -3125,8 +3220,8 @@ twobyte_insn:
break;
case 0x32:
/* rdmsr */
- if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
- kvm_inject_gp(ctxt->vcpu, 0);
+ if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
+ emulate_gp(ctxt, 0);
goto done;
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3136,14 +3231,14 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0x34: /* sysenter */
- rc = emulate_sysenter(ctxt);
+ rc = emulate_sysenter(ctxt, ops);
if (rc != X86EMUL_CONTINUE)
goto done;
else
goto writeback;
break;
case 0x35: /* sysexit */
- rc = emulate_sysexit(ctxt);
+ rc = emulate_sysexit(ctxt, ops);
if (rc != X86EMUL_CONTINUE)
goto done;
else
@@ -3160,7 +3255,7 @@ twobyte_insn:
c->dst.type = OP_NONE;
break;
case 0xa0: /* push fs */
- emulate_push_sreg(ctxt, VCPU_SREG_FS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
break;
case 0xa1: /* pop fs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3179,7 +3274,7 @@ twobyte_insn:
emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
break;
case 0xa8: /* push gs */
- emulate_push_sreg(ctxt, VCPU_SREG_GS);
+ emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
break;
case 0xa9: /* pop gs */
rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0150affad25d..0fd6378981f4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
* Copyright (c) 2006 Intel Corporation
* Copyright (c) 2007 Keir Fraser, XenSource Inc
* Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,7 @@
#include <linux/kvm_host.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include "irq.h"
#include "i8254.h"
@@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
{
struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
irq_ack_notifier);
- raw_spin_lock(&ps->inject_lock);
- if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+ int value;
+
+ spin_lock(&ps->inject_lock);
+ value = atomic_dec_return(&ps->pit_timer.pending);
+ if (value < 0)
+ /* spurious acks can be generated if, for example, the
+ * PIC is being reset. Handle it gracefully here
+ */
atomic_inc(&ps->pit_timer.pending);
+ else if (value > 0)
+ /* in this case, we had multiple outstanding pit interrupts
+ * that we needed to inject. Reinject
+ */
+ queue_work(ps->pit->wq, &ps->pit->expired);
ps->irq_ack = 1;
- raw_spin_unlock(&ps->inject_lock);
+ spin_unlock(&ps->inject_lock);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
-static void destroy_pit_timer(struct kvm_timer *pt)
+static void destroy_pit_timer(struct kvm_pit *pit)
{
- pr_debug("execute del timer!\n");
- hrtimer_cancel(&pt->timer);
+ hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+ cancel_work_sync(&pit->expired);
}
static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {
.is_periodic = kpit_is_periodic,
};
+static void pit_do_work(struct work_struct *work)
+{
+ struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+ struct kvm *kvm = pit->kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+ struct kvm_kpit_state *ps = &pit->pit_state;
+ int inject = 0;
+
+ /* Try to inject pending interrupts when
+ * last one has been acked.
+ */
+ spin_lock(&ps->inject_lock);
+ if (ps->irq_ack) {
+ ps->irq_ack = 0;
+ inject = 1;
+ }
+ spin_unlock(&ps->inject_lock);
+ if (inject) {
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+ kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> PIC -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation is simplified, only
+ * propagating PIT interrupts to all VCPUs when they have set
+ * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+ * VCPU0, and only if its LVT0 is in EXTINT mode.
+ */
+ if (kvm->arch.vapics_in_nmi_mode > 0)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_apic_nmi_wd_deliver(vcpu);
+ }
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+
+ if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ queue_work(pt->wq, &pt->expired);
+ }
+
+ if (ktimer->t_ops->is_periodic(ktimer)) {
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
{
struct kvm_timer *pt = &ps->pit_timer;
@@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
/* TODO The new value only affected after the retriggered */
hrtimer_cancel(&pt->timer);
+ cancel_work_sync(&ps->pit->expired);
pt->period = interval;
ps->is_periodic = is_period;
- pt->timer.function = kvm_timer_fn;
+ pt->timer.function = pit_timer_fn;
pt->t_ops = &kpit_ops;
pt->kvm = ps->pit->kvm;
- pt->vcpu = pt->kvm->bsp_vcpu;
atomic_set(&pt->pending, 0);
ps->irq_ack = 1;
@@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
}
break;
default:
- destroy_pit_timer(&ps->pit_timer);
+ destroy_pit_timer(kvm->arch.vpit);
}
}
@@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
mutex_init(&pit->pit_state.lock);
mutex_lock(&pit->pit_state.lock);
- raw_spin_lock_init(&pit->pit_state.inject_lock);
+ spin_lock_init(&pit->pit_state.inject_lock);
+
+ pit->wq = create_singlethread_workqueue("kvm-pit-wq");
+ if (!pit->wq) {
+ mutex_unlock(&pit->pit_state.lock);
+ kfree(pit);
+ return NULL;
+ }
+ INIT_WORK(&pit->expired, pit_do_work);
kvm->arch.vpit = pit;
pit->kvm = kvm;
@@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)
struct hrtimer *timer;
if (kvm->arch.vpit) {
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+ &kvm->arch.vpit->speaker_dev);
kvm_unregister_irq_mask_notifier(kvm, 0,
&kvm->arch.vpit->mask_notifier);
kvm_unregister_irq_ack_notifier(kvm,
@@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm)
mutex_lock(&kvm->arch.vpit->pit_state.lock);
timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
hrtimer_cancel(timer);
+ cancel_work_sync(&kvm->arch.vpit->expired);
kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ destroy_workqueue(kvm->arch.vpit->wq);
kfree(kvm->arch.vpit);
}
}
-
-static void __inject_pit_timer_intr(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- int i;
-
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
- kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-
- /*
- * Provides NMI watchdog support via Virtual Wire mode.
- * The route is: PIT -> PIC -> LVT0 in NMI mode.
- *
- * Note: Our Virtual Wire implementation is simplified, only
- * propagating PIT interrupts to all VCPUs when they have set
- * LVT0 to NMI delivery. Other PIC interrupts are just sent to
- * VCPU0, and only if its LVT0 is in EXTINT mode.
- */
- if (kvm->arch.vapics_in_nmi_mode > 0)
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_apic_nmi_wd_deliver(vcpu);
-}
-
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
-{
- struct kvm_pit *pit = vcpu->kvm->arch.vpit;
- struct kvm *kvm = vcpu->kvm;
- struct kvm_kpit_state *ps;
-
- if (pit) {
- int inject = 0;
- ps = &pit->pit_state;
-
- /* Try to inject pending interrupts when
- * last one has been acked.
- */
- raw_spin_lock(&ps->inject_lock);
- if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
- ps->irq_ack = 0;
- inject = 1;
- }
- raw_spin_unlock(&ps->inject_lock);
- if (inject)
- __inject_pit_timer_intr(kvm);
- }
-}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 900d6b0ba7c2..46d08ca0b48f 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
- raw_spinlock_t inject_lock;
+ spinlock_t inject_lock;
unsigned long irq_ack;
struct kvm_irq_ack_notifier irq_ack_notifier;
};
@@ -40,6 +40,8 @@ struct kvm_pit {
struct kvm_kpit_state pit_state;
int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
+ struct workqueue_struct *wq;
+ struct work_struct expired;
};
#define KVM_PIT_BASE_ADDRESS 0x40
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 93825ff3338f..8d10c063d7f2 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
*
* Copyright (c) 2003-2004 Fabrice Bellard
* Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,8 @@
#include <linux/kvm_host.h>
#include "trace.h"
+static void pic_irq_request(struct kvm *kvm, int level);
+
static void pic_lock(struct kvm_pic *s)
__acquires(&s->lock)
{
@@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)
__releases(&s->lock)
{
bool wakeup = s->wakeup_needed;
- struct kvm_vcpu *vcpu;
+ struct kvm_vcpu *vcpu, *found = NULL;
+ int i;
s->wakeup_needed = false;
raw_spin_unlock(&s->lock);
if (wakeup) {
- vcpu = s->kvm->bsp_vcpu;
- if (vcpu)
- kvm_vcpu_kick(vcpu);
+ kvm_for_each_vcpu(i, vcpu, s->kvm) {
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ found = vcpu;
+ break;
+ }
+ }
+
+ if (!found)
+ found = s->kvm->bsp_vcpu;
+
+ kvm_vcpu_kick(found);
}
}
@@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s)
pic_set_irq1(&s->pics[0], 2, 0);
}
irq = pic_get_irq(&s->pics[0]);
- if (irq >= 0)
- s->irq_request(s->irq_request_opaque, 1);
- else
- s->irq_request(s->irq_request_opaque, 0);
+ pic_irq_request(s->kvm, irq >= 0);
}
void kvm_pic_update_irq(struct kvm_pic *s)
@@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
void kvm_pic_reset(struct kvm_kpic_state *s)
{
int irq;
- struct kvm *kvm = s->pics_state->irq_request_opaque;
- struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
+ struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
u8 irr = s->irr, isr = s->imr;
s->last_irr = 0;
@@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
/*
* deassert a pending interrupt
*/
- s->pics_state->irq_request(s->pics_state->
- irq_request_opaque, 0);
+ pic_irq_request(s->pics_state->kvm, 0);
s->init_state = 1;
s->init4 = val & 1;
if (val & 0x02)
@@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
}
} else
switch (s->init_state) {
- case 0: /* normal mode */
+ case 0: { /* normal mode */
+ u8 imr_diff = s->imr ^ val,
+ off = (s == &s->pics_state->pics[0]) ? 0 : 8;
s->imr = val;
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (imr_diff & (1 << irq))
+ kvm_fire_mask_notifiers(
+ s->pics_state->kvm,
+ SELECT_PIC(irq + off),
+ irq + off,
+ !!(s->imr & (1 << irq)));
pic_update_irq(s->pics_state);
break;
+ }
case 1:
s->irq_base = val & 0xf8;
s->init_state = 2;
@@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this,
/*
* callback when PIC0 irq status changed
*/
-static void pic_irq_request(void *opaque, int level)
+static void pic_irq_request(struct kvm *kvm, int level)
{
- struct kvm *kvm = opaque;
struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
struct kvm_pic *s = pic_irqchip(kvm);
int irq = pic_get_irq(&s->pics[0]);
@@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
- s->irq_request = pic_irq_request;
- s->irq_request_opaque = kvm;
s->pics[0].pics_state = s;
s->pics[1].pics_state = s;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a9..2095a049835e 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,6 +1,7 @@
/*
* irq.c: API for in kernel interrupt controller
* Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
@@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
kvm_inject_apic_timer_irqs(vcpu);
- kvm_inject_pit_timer_irqs(vcpu);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index cd1f362f413d..ffed06871c5c 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -38,8 +38,6 @@
struct kvm;
struct kvm_vcpu;
-typedef void irq_request_func(void *opaque, int level);
-
struct kvm_kpic_state {
u8 last_irr; /* edge detection */
u8 irr; /* interrupt request register */
@@ -67,8 +65,6 @@ struct kvm_pic {
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
- irq_request_func *irq_request;
- void *irq_request_opaque;
int output; /* intr from master PIC */
struct kvm_io_device dev;
void (*ack_notifier)(void *opaque, int irq);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cff851cf5322..6491ac8e755b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
{
+ might_sleep(); /* on svm */
+
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail))
kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
@@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
return kvm_read_cr4_bits(vcpu, ~0UL);
}
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+ return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+ | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1eb7a4ae0c9c..77d8c0f4817d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,6 +5,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2007 Novell
* Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Dor Laor <dor.laor@qumranet.com>
@@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
"dest_mode 0x%x, short_hand 0x%x\n",
target, source, dest, dest_mode, short_hand);
- ASSERT(!target);
+ ASSERT(target);
switch (short_hand) {
case APIC_DEST_NOSHORT:
if (dest_mode == 0)
@@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
struct kvm_vcpu *vcpu = apic->vcpu;
struct kvm_run *run = vcpu->run;
- set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+ kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
run->tpr_access.rip = kvm_rip_read(vcpu);
run->tpr_access.is_write = write;
}
@@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
int r = 0;
- if (kvm_vcpu_is_bsp(vcpu)) {
- if (!apic_hw_enabled(vcpu->arch.apic))
- r = 1;
- if ((lvt0 & APIC_LVT_MASKED) == 0 &&
- GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
- r = 1;
- }
+ if (!apic_hw_enabled(vcpu->arch.apic))
+ r = 1;
+ if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+ GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+ r = 1;
return r;
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1ed0a1a5913..311f6dad8951 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
#define PT64_LEVEL_BITS 9
#define PT64_LEVEL_SHIFT(level) \
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
+typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
@@ -281,13 +281,38 @@ static gfn_t pse36_gfn_delta(u32 gpte)
static void __set_spte(u64 *sptep, u64 spte)
{
+ set_64bit(sptep, spte);
+}
+
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
#ifdef CONFIG_X86_64
- set_64bit((unsigned long *)sptep, spte);
+ return xchg(sptep, new_spte);
#else
- set_64bit((unsigned long long *)sptep, spte);
+ u64 old_spte;
+
+ do {
+ old_spte = *sptep;
+ } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+ return old_spte;
#endif
}
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte;
+
+ if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
+ !is_rmap_spte(*sptep))
+ __set_spte(sptep, new_spte);
+ else {
+ old_spte = __xchg_spte(sptep, new_spte);
+ if (old_spte & shadow_accessed_mask)
+ mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+ }
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
@@ -304,10 +329,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
return 0;
}
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+ struct kmem_cache *cache)
{
while (mc->nobjs)
- kfree(mc->objects[--mc->nobjs]);
+ kmem_cache_free(cache, mc->objects[--mc->nobjs]);
}
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +381,11 @@ out:
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
- mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+ mmu_page_header_cache);
}
static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +406,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
{
- kfree(pc);
+ kmem_cache_free(pte_chain_cache, pc);
}
static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +417,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
{
- kfree(rd);
+ kmem_cache_free(rmap_desc_cache, rd);
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+ if (!sp->role.direct)
+ return sp->gfns[index];
+
+ return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+}
+
+static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+{
+ if (sp->role.direct)
+ BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
+ else
+ sp->gfns[index] = gfn;
}
/*
@@ -403,8 +446,8 @@ static int *slot_largepage_idx(gfn_t gfn,
{
unsigned long idx;
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].write_count;
}
@@ -414,9 +457,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
int *write_count;
int i;
- gfn = unalias_gfn(kvm, gfn);
-
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +471,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
int *write_count;
int i;
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
write_count = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +487,7 @@ static int has_wrprotected_page(struct kvm *kvm,
struct kvm_memory_slot *slot;
int *largepage_idx;
- gfn = unalias_gfn(kvm, gfn);
- slot = gfn_to_memslot_unaliased(kvm, gfn);
+ slot = gfn_to_memslot(kvm, gfn);
if (slot) {
largepage_idx = slot_largepage_idx(gfn, slot, level);
return *largepage_idx;
@@ -501,7 +540,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
/*
* Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function get called
*/
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +551,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
- (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+ idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
return &slot->lpage_info[level - 2][idx].rmap_pde;
}
@@ -541,9 +579,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
if (!is_rmap_spte(*spte))
return count;
- gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
- sp->gfns[spte - sp->spt] = gfn;
+ kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -600,19 +637,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
struct kvm_mmu_page *sp;
- pfn_t pfn;
+ gfn_t gfn;
unsigned long *rmapp;
int i;
- if (!is_rmap_spte(*spte))
- return;
sp = page_header(__pa(spte));
- pfn = spte_to_pfn(*spte);
- if (*spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(pfn);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
+ gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+ rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
@@ -644,6 +675,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
}
}
+static void set_spte_track_bits(u64 *sptep, u64 new_spte)
+{
+ pfn_t pfn;
+ u64 old_spte = *sptep;
+
+ if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
+ old_spte & shadow_accessed_mask) {
+ __set_spte(sptep, new_spte);
+ } else
+ old_spte = __xchg_spte(sptep, new_spte);
+
+ if (!is_rmap_spte(old_spte))
+ return;
+ pfn = spte_to_pfn(old_spte);
+ if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (is_writable_pte(old_spte))
+ kvm_set_pfn_dirty(pfn);
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+ set_spte_track_bits(sptep, new_spte);
+ rmap_remove(kvm, sptep);
+}
+
static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
{
struct kvm_rmap_desc *desc;
@@ -676,7 +733,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
u64 *spte;
int i, write_protected = 0;
- gfn = unalias_gfn(kvm, gfn);
rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
spte = rmap_next(kvm, rmapp, NULL);
@@ -685,7 +741,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
- __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ update_spte(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -709,9 +765,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
- rmap_remove(kvm, spte);
+ drop_spte(kvm, spte,
+ shadow_trap_nonpresent_pte);
--kvm->stat.lpages;
- __set_spte(spte, shadow_trap_nonpresent_pte);
spte = NULL;
write_protected = 1;
}
@@ -731,8 +787,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
while ((spte = rmap_next(kvm, rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
need_tlb_flush = 1;
}
return need_tlb_flush;
@@ -754,8 +809,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
- rmap_remove(kvm, spte);
- __set_spte(spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
spte = rmap_next(kvm, rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +817,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
- if (is_writable_pte(*spte))
- kvm_set_pfn_dirty(spte_to_pfn(*spte));
- __set_spte(spte, new_spte);
+ new_spte &= ~shadow_accessed_mask;
+ set_spte_track_bits(spte, new_spte);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -799,8 +852,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- int idx = gfn_offset;
- idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+ unsigned long idx;
+ int sh;
+
+ sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
+ idx = ((memslot->base_gfn+gfn_offset) >> sh) -
+ (memslot->base_gfn >> sh);
ret |= handler(kvm,
&memslot->lpage_info[j][idx].rmap_pde,
data);
@@ -863,7 +920,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
sp = page_header(__pa(spte));
- gfn = unalias_gfn(vcpu->kvm, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -894,10 +950,12 @@ static int is_empty_shadow_page(u64 *spt)
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
+ hlist_del(&sp->hash_link);
list_del(&sp->link);
__free_page(virt_to_page(sp->spt));
- __free_page(virt_to_page(sp->gfns));
- kfree(sp);
+ if (!sp->role.direct)
+ __free_page(virt_to_page(sp->gfns));
+ kmem_cache_free(mmu_page_header_cache, sp);
++kvm->arch.n_free_mmu_pages;
}
@@ -907,13 +965,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte)
+ u64 *parent_pte, int direct)
{
struct kvm_mmu_page *sp;
sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
- sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+ if (!direct)
+ sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+ PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -998,7 +1058,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
BUG();
}
-
static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1067,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
if (!sp->multimapped && sp->parent_pte) {
parent_sp = page_header(__pa(sp->parent_pte));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
+ fn(parent_sp, sp->parent_pte);
return;
}
+
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
+ u64 *spte = pte_chain->parent_ptes[i];
+
+ if (!spte)
break;
- parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(parent_sp);
- mmu_parent_walk(parent_sp, fn);
+ parent_sp = page_header(__pa(spte));
+ fn(parent_sp, spte);
}
}
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- unsigned int index;
- struct kvm_mmu_page *sp = page_header(__pa(spte));
-
- index = spte - sp->spt;
- if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
- sp->unsync_children++;
- WARN_ON(!sp->unsync_children);
+ mmu_parent_walk(sp, mark_unsync);
}
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
+ unsigned int index;
- if (!sp->parent_pte)
+ index = spte - sp->spt;
+ if (__test_and_set_bit(index, sp->unsync_child_bitmap))
return;
-
- if (!sp->multimapped) {
- kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+ if (sp->unsync_children++)
return;
- }
-
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
- }
-}
-
-static int unsync_walk_fn(struct kvm_mmu_page *sp)
-{
- kvm_mmu_update_parents_unsync(sp);
- return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
- mmu_parent_walk(sp, unsync_walk_fn);
- kvm_mmu_update_parents_unsync(sp);
+ kvm_mmu_mark_parents_unsync(sp);
}
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1077,7 +1110,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
}
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
+ struct kvm_mmu_page *sp, bool clear_unsync)
{
return 1;
}
@@ -1123,35 +1156,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
int i, ret, nr_unsync_leaf = 0;
for_each_unsync_children(sp->unsync_child_bitmap, i) {
+ struct kvm_mmu_page *child;
u64 ent = sp->spt[i];
- if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
-
- if (child->unsync_children) {
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
-
- ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- __clear_bit(i, sp->unsync_child_bitmap);
- else if (ret > 0)
- nr_unsync_leaf += ret;
- else
- return ret;
- }
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent))
+ goto clear_child_bitmap;
+
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret)
+ goto clear_child_bitmap;
+ else if (ret > 0)
+ nr_unsync_leaf += ret;
+ else
+ return ret;
+ } else if (child->unsync) {
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+ } else
+ goto clear_child_bitmap;
- if (child->unsync) {
- nr_unsync_leaf++;
- if (mmu_pages_add(pvec, child, i))
- return -ENOSPC;
- }
- }
+ continue;
+
+clear_child_bitmap:
+ __clear_bit(i, sp->unsync_child_bitmap);
+ sp->unsync_children--;
+ WARN_ON((int)sp->unsync_children < 0);
}
- if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
- sp->unsync_children = 0;
return nr_unsync_leaf;
}
@@ -1166,26 +1204,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
return __mmu_unsync_walk(sp, pvec);
}
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *sp;
- struct hlist_node *node;
-
- pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: found role %x\n",
- __func__, sp->role.word);
- return sp;
- }
- return NULL;
-}
-
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
@@ -1194,20 +1212,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
--kvm->stat.mmu_unsync;
}
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list);
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+#define for_each_gfn_sp(kvm, sp, gfn, pos) \
+ hlist_for_each_entry(sp, pos, \
+ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
+ if ((sp)->gfn != (gfn)) {} else
+
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \
+ hlist_for_each_entry(sp, pos, \
+ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \
+ if ((sp)->gfn != (gfn) || (sp)->role.direct || \
+ (sp)->role.invalid) {} else
+
+/* @sp->gfn should be write-protected at the call site */
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list, bool clear_unsync)
{
if (sp->role.cr4_pae != !!is_pae(vcpu)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
- if (rmap_write_protect(vcpu->kvm, sp->gfn))
- kvm_flush_remote_tlbs(vcpu->kvm);
- kvm_unlink_unsync_page(vcpu->kvm, sp);
- if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (clear_unsync)
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+
+ if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
@@ -1215,6 +1249,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 0;
}
+static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ LIST_HEAD(invalid_list);
+ int ret;
+
+ ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
+ if (ret)
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+ return ret;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ return __kvm_sync_page(vcpu, sp, invalid_list, true);
+}
+
+/* @gfn should be write-protected at the call site */
+static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
+ bool flush = false;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!s->unsync)
+ continue;
+
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
+ (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
+ continue;
+ }
+ kvm_unlink_unsync_page(vcpu->kvm, s);
+ flush = true;
+ }
+
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ if (flush)
+ kvm_mmu_flush_tlb(vcpu);
+}
+
struct mmu_page_path {
struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1281,6 +1361,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp;
struct mmu_page_path parents;
struct kvm_mmu_pages pages;
+ LIST_HEAD(invalid_list);
kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1374,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
kvm_flush_remote_tlbs(vcpu->kvm);
for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp);
+ kvm_sync_page(vcpu, sp, &invalid_list);
mmu_pages_clear_parents(&parents);
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
cond_resched_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_pages_init(parent, &parents, &pages);
}
@@ -1310,11 +1392,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
u64 *parent_pte)
{
union kvm_mmu_page_role role;
- unsigned index;
unsigned quadrant;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *tmp;
+ struct hlist_node *node;
+ bool need_sync = false;
role = vcpu->arch.mmu.base_role;
role.level = level;
@@ -1322,40 +1403,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (role.direct)
role.cr4_pae = 0;
role.access = access;
- if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+ if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
- if (sp->gfn == gfn) {
- if (sp->unsync)
- if (kvm_sync_page(vcpu, sp))
- continue;
+ for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
+ if (!need_sync && sp->unsync)
+ need_sync = true;
- if (sp->role.word != role.word)
- continue;
+ if (sp->role.word != role.word)
+ continue;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
- set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
- kvm_mmu_mark_parents_unsync(sp);
- }
- trace_kvm_mmu_get_page(sp, false);
- return sp;
- }
+ if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
+ break;
+
+ mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_mmu_mark_parents_unsync(sp);
+ } else if (sp->unsync)
+ kvm_mmu_mark_parents_unsync(sp);
+
+ trace_kvm_mmu_get_page(sp, false);
+ return sp;
+ }
++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+ sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
if (!sp)
return sp;
sp->gfn = gfn;
sp->role = role;
- hlist_add_head(&sp->hash_link, bucket);
+ hlist_add_head(&sp->hash_link,
+ &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
if (!direct) {
if (rmap_write_protect(vcpu->kvm, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
+ if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ kvm_sync_pages(vcpu, gfn);
+
account_shadowed(vcpu->kvm, gfn);
}
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1402,6 +1488,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
--iterator->level;
}
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+ u64 spte;
+
+ spte = __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ __set_spte(sptep, spte);
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (is_large_pte(*sptep)) {
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned direct_access)
+{
+ if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+ struct kvm_mmu_page *child;
+
+ /*
+ * For the direct sp, if the guest pte's dirty bit
+ * changed form clean to dirty, it will corrupt the
+ * sp's access: allow writable in the read-only sp,
+ * so we should update the spte at this point to get
+ * a new sp with the correct access.
+ */
+ child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+ if (child->role.access == direct_access)
+ return;
+
+ mmu_page_remove_parent_pte(child, sptep);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
+}
+
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
@@ -1422,7 +1549,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
} else {
if (is_large_pte(ent))
--kvm->stat.lpages;
- rmap_remove(kvm, &pt[i]);
+ drop_spte(kvm, &pt[i],
+ shadow_trap_nonpresent_pte);
}
}
pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1592,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
static int mmu_zap_unsync_children(struct kvm *kvm,
- struct kvm_mmu_page *parent)
+ struct kvm_mmu_page *parent,
+ struct list_head *invalid_list)
{
int i, zapped = 0;
struct mmu_page_path parents;
@@ -1478,7 +1607,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
struct kvm_mmu_page *sp;
for_each_sp(pages, sp, parents, i) {
- kvm_mmu_zap_page(kvm, sp);
+ kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
mmu_pages_clear_parents(&parents);
zapped++;
}
@@ -1488,32 +1617,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
return zapped;
}
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
{
int ret;
- trace_kvm_mmu_zap_page(sp);
+ trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
- ret = mmu_zap_unsync_children(kvm, sp);
+ ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
- kvm_flush_remote_tlbs(kvm);
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
- hlist_del(&sp->hash_link);
- kvm_mmu_free_page(kvm, sp);
+ /* Count self */
+ ret++;
+ list_move(&sp->link, invalid_list);
} else {
- sp->role.invalid = 1;
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
}
+
+ sp->role.invalid = 1;
kvm_mmu_reset_last_pte_updated(kvm);
return ret;
}
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+ struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ if (list_empty(invalid_list))
+ return;
+
+ kvm_flush_remote_tlbs(kvm);
+
+ do {
+ sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ WARN_ON(!sp->role.invalid || sp->root_count);
+ kvm_mmu_free_page(kvm, sp);
+ } while (!list_empty(invalid_list));
+
+}
+
/*
* Changing the number of mmu pages allocated to the vm
* Note: if kvm_nr_mmu_pages is too small, you will get dead lock
@@ -1521,6 +1670,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
{
int used_pages;
+ LIST_HEAD(invalid_list);
used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
used_pages = max(0, used_pages);
@@ -1538,9 +1688,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- used_pages -= kvm_mmu_zap_page(kvm, page);
- used_pages--;
+ used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
+ &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
kvm_nr_mmu_pages = used_pages;
kvm->arch.n_free_mmu_pages = 0;
}
@@ -1553,47 +1704,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
- unsigned index;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
int r;
pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
r = 0;
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.direct) {
- pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
- sp->role.word);
- r = 1;
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
- }
+
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+ pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+ sp->role.word);
+ r = 1;
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
return r;
}
static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
{
- unsigned index;
- struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node, *nn;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
- index = kvm_page_table_hashfn(gfn);
- bucket = &kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
- if (sp->gfn == gfn && !sp->role.direct
- && !sp->role.invalid) {
- pgprintk("%s: zap %lx %x\n",
- __func__, gfn, sp->role.word);
- if (kvm_mmu_zap_page(kvm, sp))
- goto restart;
- }
+ for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+ pgprintk("%s: zap %lx %x\n",
+ __func__, gfn, sp->role.word);
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1723,47 +1863,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- unsigned index;
- struct hlist_head *bucket;
- struct kvm_mmu_page *s;
- struct hlist_node *node, *n;
-
- index = kvm_page_table_hashfn(sp->gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- /* don't unsync if pagetable is shadowed with multiple roles */
- hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
- if (s->gfn != sp->gfn || s->role.direct)
- continue;
- if (s->role.word != sp->role.word)
- return 1;
- }
trace_kvm_mmu_unsync_page(sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
kvm_mmu_mark_parents_unsync(sp);
-
mmu_convert_notrap(sp);
- return 0;
+}
+
+static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (s->unsync)
+ continue;
+ WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ __kvm_unsync_page(vcpu, s);
+ }
}
static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
bool can_unsync)
{
- struct kvm_mmu_page *shadow;
+ struct kvm_mmu_page *s;
+ struct hlist_node *node;
+ bool need_unsync = false;
- shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
- if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+ if (!can_unsync)
return 1;
- if (shadow->unsync)
- return 0;
- if (can_unsync && oos_shadow)
- return kvm_unsync_page(vcpu, shadow);
- return 1;
+
+ if (s->role.level != PT_PAGE_TABLE_LEVEL)
+ return 1;
+
+ if (!need_unsync && !s->unsync) {
+ if (!oos_shadow)
+ return 1;
+ need_unsync = true;
+ }
}
+ if (need_unsync)
+ kvm_unsync_pages(vcpu, gfn);
return 0;
}
@@ -1804,13 +1948,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= (u64)pfn << PAGE_SHIFT;
if ((pte_access & ACC_WRITE_MASK)
- || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+ || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
+ && !user_fault)) {
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- spte = shadow_trap_nonpresent_pte;
- goto set_pte;
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ goto done;
}
spte |= PT_WRITABLE_MASK;
@@ -1841,7 +1986,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- __set_spte(sptep, spte);
+ if (is_writable_pte(*sptep) && !is_writable_pte(spte))
+ kvm_set_pfn_dirty(pfn);
+ update_spte(sptep, spte);
+done:
return ret;
}
@@ -1853,7 +2001,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
bool reset_host_protection)
{
int was_rmapped = 0;
- int was_writable = is_writable_pte(*sptep);
int rmap_count;
pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1878,8 +2025,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %lx new %lx\n",
spte_to_pfn(*sptep), pfn);
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
@@ -1890,7 +2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
reset_host_protection)) {
if (write_fault)
*ptwrite = 1;
- kvm_x86_ops->tlb_flush(vcpu);
+ kvm_mmu_flush_tlb(vcpu);
}
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -1904,15 +2050,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
page_header_update_slot(vcpu->kvm, sptep, gfn);
if (!was_rmapped) {
rmap_count = rmap_add(vcpu, sptep, gfn);
- kvm_release_pfn_clean(pfn);
if (rmap_count > RMAP_RECYCLE_THRESHOLD)
rmap_recycle(vcpu, sptep, gfn);
- } else {
- if (was_writable)
- kvm_release_pfn_dirty(pfn);
- else
- kvm_release_pfn_clean(pfn);
}
+ kvm_release_pfn_clean(pfn);
if (speculative) {
vcpu->arch.last_pte_updated = sptep;
vcpu->arch.last_pte_gfn = gfn;
@@ -1941,7 +2082,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
}
if (*iterator.sptep == shadow_trap_nonpresent_pte) {
- pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ u64 base_addr = iterator.addr;
+
+ base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
+ pseudo_gfn = base_addr >> PAGE_SHIFT;
sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
iterator.level - 1,
1, ACC_ALL, iterator.sptep);
@@ -1960,6 +2104,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
return pt_write;
}
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+ char buf[1];
+ void __user *hva;
+ int r;
+
+ /* Touch the page, so send SIGBUS */
+ hva = (void __user *)gfn_to_hva(kvm, gfn);
+ r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+ kvm_release_pfn_clean(pfn);
+ if (is_hwpoison_pfn(pfn)) {
+ kvm_send_hwpoison_signal(kvm, gfn);
+ return 0;
+ } else if (is_fault_pfn(pfn))
+ return -EFAULT;
+
+ return 1;
+}
+
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
@@ -1983,10 +2150,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
pfn = gfn_to_pfn(vcpu->kvm, gfn);
/* mmio */
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2009,6 +2174,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
int i;
struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
@@ -2018,8 +2184,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
- if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (!sp->root_count && sp->role.invalid) {
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ }
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
@@ -2032,10 +2200,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
if (!sp->root_count && sp->role.invalid)
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
}
@@ -2045,7 +2215,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
int ret = 0;
if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
ret = 1;
}
@@ -2073,6 +2243,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
root_gfn = 0;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, direct,
ACC_ALL, NULL);
@@ -2103,6 +2274,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
root_gfn = i << 30;
}
spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, direct,
ACC_ALL, NULL);
@@ -2198,10 +2370,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
@@ -2243,7 +2413,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2457,10 +2627,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+ if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ /* mmu.free() should set root_hpa = INVALID_PAGE */
vcpu->arch.mmu.free(vcpu);
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- }
}
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2646,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
r = mmu_topup_memory_caches(vcpu);
if (r)
goto out;
- spin_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_free_some_pages(vcpu);
- spin_unlock(&vcpu->kvm->mmu_lock);
r = mmu_alloc_roots(vcpu);
spin_lock(&vcpu->kvm->mmu_lock);
mmu_sync_roots(vcpu);
@@ -2508,7 +2674,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level))
- rmap_remove(vcpu->kvm, spte);
+ drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
mmu_page_remove_parent_pte(child, spte);
@@ -2529,6 +2695,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}
+ if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+ return;
+
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
@@ -2549,11 +2718,15 @@ static bool need_remote_flush(u64 old, u64 new)
return (old & ~new & PT64_PERM_MASK) != 0;
}
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
+ bool remote_flush, bool local_flush)
{
- if (need_remote_flush(old, new))
+ if (zap_page)
+ return;
+
+ if (remote_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
- else
+ else if (local_flush)
kvm_mmu_flush_tlb(vcpu);
}
@@ -2603,10 +2776,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
bool guest_initiated)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
+ union kvm_mmu_page_role mask = { .word = 0 };
struct kvm_mmu_page *sp;
- struct hlist_node *node, *n;
- struct hlist_head *bucket;
- unsigned index;
+ struct hlist_node *node;
+ LIST_HEAD(invalid_list);
u64 entry, gentry;
u64 *spte;
unsigned offset = offset_in_page(gpa);
@@ -2619,6 +2792,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int npte;
int r;
int invlpg_counter;
+ bool remote_flush, local_flush, zap_page;
+
+ zap_page = remote_flush = local_flush = false;
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
@@ -2674,13 +2850,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
vcpu->arch.last_pte_updated = NULL;
}
}
- index = kvm_page_table_hashfn(gfn);
- bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-restart:
- hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
- if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
- continue;
+ mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
+ for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
pte_size = sp->role.cr4_pae ? 8 : 4;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
misaligned |= bytes < 4;
@@ -2697,8 +2869,8 @@ restart:
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
- if (kvm_mmu_zap_page(vcpu->kvm, sp))
- goto restart;
+ zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -2722,16 +2894,22 @@ restart:
if (quadrant != sp->role.quadrant)
continue;
}
+ local_flush = true;
spte = &sp->spt[page_offset / sizeof(*spte)];
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- if (gentry)
+ if (gentry &&
+ !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+ & mask.word))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
- mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+ if (!remote_flush && need_remote_flush(entry, *spte))
+ remote_flush = true;
++spte;
}
}
+ mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
kvm_mmu_audit(vcpu, "post pte write");
spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2759,15 +2937,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
- while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+ int free_pages;
+ LIST_HEAD(invalid_list);
+
+ free_pages = vcpu->kvm->arch.n_free_mmu_pages;
+ while (free_pages < KVM_REFILL_PAGES &&
!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
struct kvm_mmu_page *sp;
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+ &invalid_list);
++vcpu->kvm->stat.mmu_recycled;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -2795,11 +2979,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
return 1;
case EMULATE_DO_MMIO:
++vcpu->stat.mmio_exits;
- return 0;
+ /* fall through */
case EMULATE_FAIL:
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
return 0;
default:
BUG();
@@ -2896,7 +3077,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
- if (pt[i] & PT_WRITABLE_MASK)
+ if (is_writable_pte(pt[i]))
pt[i] &= ~PT_WRITABLE_MASK;
}
kvm_flush_remote_tlbs(kvm);
@@ -2905,25 +3086,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
void kvm_mmu_zap_all(struct kvm *kvm)
{
struct kvm_mmu_page *sp, *node;
+ LIST_HEAD(invalid_list);
spin_lock(&kvm->mmu_lock);
restart:
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- if (kvm_mmu_zap_page(kvm, sp))
+ if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
goto restart;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
-
- kvm_flush_remote_tlbs(kvm);
}
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+ struct list_head *invalid_list)
{
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- return kvm_mmu_zap_page(kvm, page) + 1;
+ return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
}
static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
@@ -2936,6 +3118,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
list_for_each_entry(kvm, &vm_list, vm_list) {
int npages, idx, freed_pages;
+ LIST_HEAD(invalid_list);
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
@@ -2943,12 +3126,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
kvm->arch.n_free_mmu_pages;
cache_count += npages;
if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
- freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+ freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+ &invalid_list);
cache_count -= freed_pages;
kvm_freed = kvm;
}
nr_to_scan--;
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
}
@@ -3074,7 +3259,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
- kvm_set_cr3(vcpu, vcpu->arch.cr3);
+ (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
return 1;
}
@@ -3331,9 +3516,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
struct kvm_mmu_page *rev_sp;
gfn_t gfn;
- if (*sptep & PT_WRITABLE_MASK) {
+ if (is_writable_pte(*sptep)) {
rev_sp = page_header(__pa(sptep));
- gfn = rev_sp->gfns[sptep - rev_sp->spt];
+ gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
if (!gfn_to_memslot(kvm, gfn)) {
if (!printk_ratelimit())
@@ -3347,8 +3532,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
return;
}
- rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
- rev_sp->role.level);
+ rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
if (!*rmapp) {
if (!printk_ratelimit())
return;
@@ -3381,7 +3565,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
if (!(ent & PT_PRESENT_MASK))
continue;
- if (!(ent & PT_WRITABLE_MASK))
+ if (!is_writable_pte(ent))
continue;
inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
}
@@ -3409,13 +3593,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
if (sp->unsync)
continue;
- gfn = unalias_gfn(vcpu->kvm, sp->gfn);
- slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
+ slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
rmapp = &slot->rmap[gfn - slot->base_gfn];
spte = rmap_next(vcpu->kvm, rmapp, NULL);
while (spte) {
- if (*spte & PT_WRITABLE_MASK)
+ if (is_writable_pte(*spte))
printk(KERN_ERR "%s: (%s) shadow page has "
"writable mappings: gfn %lx role %x\n",
__func__, audit_msg, sp->gfn,
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 42f07b1bfbc9..3aab0f0930ef 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
TP_ARGS(sp)
);
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TP_PROTO(struct kvm_mmu_page *sp),
TP_ARGS(sp)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2331bdc2b549..51ef9097960d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,6 +7,7 @@
* MMU support
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
{
pt_element_t pte;
gfn_t table_gfn;
- unsigned index, pt_access, pte_access;
+ unsigned index, pt_access, uninitialized_var(pte_access);
gpa_t pte_gpa;
- int rsvd_fault = 0;
+ bool eperm, present, rsvd_fault;
trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
fetch_fault);
walk:
+ present = true;
+ eperm = rsvd_fault = false;
walker->level = vcpu->arch.mmu.root_level;
pte = vcpu->arch.cr3;
#if PTTYPE == 64
if (!is_long_mode(vcpu)) {
pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte))
- goto not_present;
+ if (!is_present_gpte(pte)) {
+ present = false;
+ goto error;
+ }
--walker->level;
}
#endif
@@ -150,37 +155,42 @@ walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
- goto not_present;
+ if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
+ present = false;
+ break;
+ }
trace_kvm_mmu_paging_element(pte, walker->level);
- if (!is_present_gpte(pte))
- goto not_present;
+ if (!is_present_gpte(pte)) {
+ present = false;
+ break;
+ }
- rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
- if (rsvd_fault)
- goto access_error;
+ if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
+ rsvd_fault = true;
+ break;
+ }
if (write_fault && !is_writable_pte(pte))
if (user_fault || is_write_protection(vcpu))
- goto access_error;
+ eperm = true;
if (user_fault && !(pte & PT_USER_MASK))
- goto access_error;
+ eperm = true;
#if PTTYPE == 64
if (fetch_fault && (pte & PT64_NX_MASK))
- goto access_error;
+ eperm = true;
#endif
- if (!(pte & PT_ACCESSED_MASK)) {
+ if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
trace_kvm_mmu_set_accessed_bit(table_gfn, index,
sizeof(pte));
- mark_page_dirty(vcpu->kvm, table_gfn);
if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
index, pte, pte|PT_ACCESSED_MASK))
goto walk;
+ mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_ACCESSED_MASK;
}
@@ -213,15 +223,18 @@ walk:
--walker->level;
}
+ if (!present || eperm || rsvd_fault)
+ goto error;
+
if (write_fault && !is_dirty_gpte(pte)) {
bool ret;
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
- mark_page_dirty(vcpu->kvm, table_gfn);
ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
pte|PT_DIRTY_MASK);
if (ret)
goto walk;
+ mark_page_dirty(vcpu->kvm, table_gfn);
pte |= PT_DIRTY_MASK;
walker->ptes[walker->level - 1] = pte;
}
@@ -229,22 +242,18 @@ walk:
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, pt_access, pte_access);
+ __func__, (u64)pte, pte_access, pt_access);
return 1;
-not_present:
+error:
walker->error_code = 0;
- goto err;
-
-access_error:
- walker->error_code = PFERR_PRESENT_MASK;
-
-err:
+ if (present)
+ walker->error_code |= PFERR_PRESENT_MASK;
if (write_fault)
walker->error_code |= PFERR_WRITE_MASK;
if (user_fault)
walker->error_code |= PFERR_USER_MASK;
- if (fetch_fault)
+ if (fetch_fault && is_nx(vcpu))
walker->error_code |= PFERR_FETCH_MASK;
if (rsvd_fault)
walker->error_code |= PFERR_RSVD_MASK;
@@ -252,7 +261,7 @@ err:
return 0;
}
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte)
{
pt_element_t gpte;
@@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
if (!is_present_gpte(gpte)) {
- if (page->unsync)
+ if (sp->unsync)
new_spte = shadow_trap_nonpresent_pte;
else
new_spte = shadow_notrap_nonpresent_pte;
@@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
return;
}
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
return;
pfn = vcpu->arch.update_pte.pfn;
@@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
* we call mmu_set_spte() with reset_host_protection = true beacuse that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
- mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
gpte_to_gfn(gpte), pfn, true, true);
}
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ int r;
+ pt_element_t curr_pte;
+
+ r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
+ &curr_pte, sizeof(curr_pte));
+ return r || curr_pte != gw->ptes[level - 1];
+}
+
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
@@ -299,75 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
int *ptwrite, pfn_t pfn)
{
unsigned access = gw->pt_access;
- struct kvm_mmu_page *shadow_page;
- u64 spte, *sptep = NULL;
- int direct;
- gfn_t table_gfn;
- int r;
- int level;
- pt_element_t curr_pte;
- struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp = NULL;
+ bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
+ int top_level;
+ unsigned direct_access;
+ struct kvm_shadow_walk_iterator it;
if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;
- for_each_shadow_entry(vcpu, addr, iterator) {
- level = iterator.level;
- sptep = iterator.sptep;
- if (iterator.level == hlevel) {
- mmu_set_spte(vcpu, sptep, access,
- gw->pte_access & access,
- user_fault, write_fault,
- gw->ptes[gw->level-1] & PT_DIRTY_MASK,
- ptwrite, level,
- gw->gfn, pfn, false, true);
- break;
- }
+ direct_access = gw->pt_access & gw->pte_access;
+ if (!dirty)
+ direct_access &= ~ACC_WRITE_MASK;
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
- continue;
+ top_level = vcpu->arch.mmu.root_level;
+ if (top_level == PT32E_ROOT_LEVEL)
+ top_level = PT32_ROOT_LEVEL;
+ /*
+ * Verify that the top-level gpte is still there. Since the page
+ * is a root page, it is either write protected (and cannot be
+ * changed from now on) or it is invalid (in which case, we don't
+ * really care if it changes underneath us after this point).
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, top_level))
+ goto out_gpte_changed;
- if (is_large_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
+ for (shadow_walk_init(&it, vcpu, addr);
+ shadow_walk_okay(&it) && it.level > gw->level;
+ shadow_walk_next(&it)) {
+ gfn_t table_gfn;
- if (level <= gw->level) {
- int delta = level - gw->level + 1;
- direct = 1;
- if (!is_dirty_gpte(gw->ptes[level - delta]))
- access &= ~ACC_WRITE_MASK;
- table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
- /* advance table_gfn when emulating 1gb pages with 4k */
- if (delta == 0)
- table_gfn += PT_INDEX(addr, level);
- access &= gw->pte_access;
- } else {
- direct = 0;
- table_gfn = gw->table_gfn[level - 2];
- }
- shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
- direct, access, sptep);
- if (!direct) {
- r = kvm_read_guest_atomic(vcpu->kvm,
- gw->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (r || curr_pte != gw->ptes[level - 2]) {
- kvm_mmu_put_page(shadow_page, sptep);
- kvm_release_pfn_clean(pfn);
- sptep = NULL;
- break;
- }
+ drop_large_spte(vcpu, it.sptep);
+
+ sp = NULL;
+ if (!is_shadow_present_pte(*it.sptep)) {
+ table_gfn = gw->table_gfn[it.level - 2];
+ sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
+ false, access, it.sptep);
}
- spte = __pa(shadow_page->spt)
- | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- *sptep = spte;
+ /*
+ * Verify that the gpte in the page we've just write
+ * protected is still there.
+ */
+ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
+ goto out_gpte_changed;
+
+ if (sp)
+ link_shadow_page(it.sptep, sp);
}
- return sptep;
+ for (;
+ shadow_walk_okay(&it) && it.level > hlevel;
+ shadow_walk_next(&it)) {
+ gfn_t direct_gfn;
+
+ validate_direct_spte(vcpu, it.sptep, direct_access);
+
+ drop_large_spte(vcpu, it.sptep);
+
+ if (is_shadow_present_pte(*it.sptep))
+ continue;
+
+ direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+ sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
+ true, direct_access, it.sptep);
+ link_shadow_page(it.sptep, sp);
+ }
+
+ mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
+ user_fault, write_fault, dirty, ptwrite, it.level,
+ gw->gfn, pfn, false, true);
+
+ return it.sptep;
+
+out_gpte_changed:
+ if (sp)
+ kvm_mmu_put_page(sp, it.sptep);
+ kvm_release_pfn_clean(pfn);
+ return NULL;
}
/*
@@ -431,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
/* mmio */
- if (is_error_pfn(pfn)) {
- pgprintk("gfn %lx is mmio\n", walker.gfn);
- kvm_release_pfn_clean(pfn);
- return 1;
- }
+ if (is_error_pfn(pfn))
+ return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -443,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
kvm_mmu_free_some_pages(vcpu);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
level, &write_pt, pfn);
+ (void)sptep;
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
sptep, *sptep, write_pt);
@@ -464,6 +493,7 @@ out_unlock:
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct kvm_shadow_walk_iterator iterator;
+ struct kvm_mmu_page *sp;
gpa_t pte_gpa = -1;
int level;
u64 *sptep;
@@ -475,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
level = iterator.level;
sptep = iterator.sptep;
+ sp = page_header(__pa(sptep));
if (is_last_spte(*sptep, level)) {
- struct kvm_mmu_page *sp = page_header(__pa(sptep));
int offset, shift;
+ if (!sp->unsync)
+ break;
+
shift = PAGE_SHIFT -
(PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
offset = sp->role.quadrant << shift;
@@ -487,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
if (is_shadow_present_pte(*sptep)) {
- rmap_remove(vcpu->kvm, sptep);
if (is_large_pte(*sptep))
--vcpu->kvm->stat.lpages;
+ drop_spte(vcpu->kvm, sptep,
+ shadow_trap_nonpresent_pte);
need_flush = 1;
- }
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ } else
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
break;
}
- if (!is_shadow_present_pte(*sptep))
+ if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
break;
}
@@ -570,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
- * - Alias changes zap the entire shadow cache.
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ bool clear_unsync)
{
int i, offset, nr_present;
bool reset_host_protection;
@@ -580,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
offset = nr_present = 0;
+ /* direct kvm_mmu_page can not be unsync. */
+ BUG_ON(sp->role.direct);
+
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
@@ -589,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
unsigned pte_access;
pt_element_t gpte;
gpa_t pte_gpa;
- gfn_t gfn = sp->gfns[i];
+ gfn_t gfn;
if (!is_shadow_present_pte(sp->spt[i]))
continue;
@@ -600,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
- !(gpte & PT_ACCESSED_MASK)) {
+ gfn = gpte_to_gfn(gpte);
+ if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
+ || gfn != sp->gfns[i] || !is_present_gpte(gpte)
+ || !(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
- rmap_remove(vcpu->kvm, &sp->spt[i]);
- if (is_present_gpte(gpte))
+ if (is_present_gpte(gpte) || !clear_unsync)
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
- __set_spte(&sp->spt[i], nonpresent);
+ drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
continue;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce438e0fdd26..bc5b9b8d4a33 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,6 +4,7 @@
* AMD SVM support
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
@@ -130,7 +131,7 @@ static struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
bool always; /* True if intercept is always on */
} direct_access_msrs[] = {
- { .index = MSR_K6_STAR, .always = true },
+ { .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
#ifdef CONFIG_X86_64
{ .index = MSR_GS_BASE, .always = true },
@@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
+ vcpu->arch.efer = efer;
if (!npt_enabled && !(efer & EFER_LMA))
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
- vcpu->arch.efer = efer;
}
static int is_external_interrupt(u32 info)
@@ -383,8 +384,7 @@ static void svm_init_erratum_383(void)
int err;
u64 val;
- /* Only Fam10h is affected */
- if (boot_cpu_data.x86 != 0x10)
+ if (!cpu_has_amd_erratum(amd_erratum_383))
return;
/* Use _safe variants to not break nested virtualization */
@@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void)
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME);
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
for_each_possible_cpu(cpu) {
@@ -806,7 +806,7 @@ static void init_vmcb(struct vcpu_svm *svm)
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
*/
svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+ (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
@@ -903,13 +903,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->asid_generation = 0;
init_vmcb(svm);
- fx_init(&svm->vcpu);
+ err = fx_init(&svm->vcpu);
+ if (err)
+ goto free_page4;
+
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
return &svm->vcpu;
+free_page4:
+ __free_page(hsave_page);
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
@@ -1488,7 +1493,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
*/
pr_err("KVM: Guest triggered AMD Erratum 383\n");
- set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
return;
}
@@ -1535,7 +1540,7 @@ static int io_interception(struct vcpu_svm *svm)
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string || in)
- return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+ return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1957,7 +1962,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cr3 = hsave->save.cr3;
svm->vcpu.arch.cr3 = hsave->save.cr3;
} else {
- kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+ (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
}
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -2080,7 +2085,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
} else
- kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+ (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
/* Guest paging mode is active - reset mmu */
kvm_mmu_reset_context(&svm->vcpu);
@@ -2386,16 +2391,12 @@ static int iret_interception(struct vcpu_svm *svm)
static int invlpg_interception(struct vcpu_svm *svm)
{
- if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
- return 1;
+ return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
}
static int emulate_on_interception(struct vcpu_svm *svm)
{
- if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
- return 1;
+ return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
}
static int cr8_write_interception(struct vcpu_svm *svm)
@@ -2431,7 +2432,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
*data = tsc_offset + native_read_tsc();
break;
}
- case MSR_K6_STAR:
+ case MSR_STAR:
*data = svm->vmcb->save.star;
break;
#ifdef CONFIG_X86_64
@@ -2555,7 +2556,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
break;
}
- case MSR_K6_STAR:
+ case MSR_STAR:
svm->vmcb->save.star = data;
break;
#ifdef CONFIG_X86_64
@@ -2726,6 +2727,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_NPF] = pf_interception,
};
+void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
+
+ pr_err("VMCB Control Area:\n");
+ pr_err("cr_read: %04x\n", control->intercept_cr_read);
+ pr_err("cr_write: %04x\n", control->intercept_cr_write);
+ pr_err("dr_read: %04x\n", control->intercept_dr_read);
+ pr_err("dr_write: %04x\n", control->intercept_dr_write);
+ pr_err("exceptions: %08x\n", control->intercept_exceptions);
+ pr_err("intercepts: %016llx\n", control->intercept);
+ pr_err("pause filter count: %d\n", control->pause_filter_count);
+ pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa);
+ pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa);
+ pr_err("tsc_offset: %016llx\n", control->tsc_offset);
+ pr_err("asid: %d\n", control->asid);
+ pr_err("tlb_ctl: %d\n", control->tlb_ctl);
+ pr_err("int_ctl: %08x\n", control->int_ctl);
+ pr_err("int_vector: %08x\n", control->int_vector);
+ pr_err("int_state: %08x\n", control->int_state);
+ pr_err("exit_code: %08x\n", control->exit_code);
+ pr_err("exit_info1: %016llx\n", control->exit_info_1);
+ pr_err("exit_info2: %016llx\n", control->exit_info_2);
+ pr_err("exit_int_info: %08x\n", control->exit_int_info);
+ pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err);
+ pr_err("nested_ctl: %lld\n", control->nested_ctl);
+ pr_err("nested_cr3: %016llx\n", control->nested_cr3);
+ pr_err("event_inj: %08x\n", control->event_inj);
+ pr_err("event_inj_err: %08x\n", control->event_inj_err);
+ pr_err("lbr_ctl: %lld\n", control->lbr_ctl);
+ pr_err("next_rip: %016llx\n", control->next_rip);
+ pr_err("VMCB State Save Area:\n");
+ pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->es.selector, save->es.attrib,
+ save->es.limit, save->es.base);
+ pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->cs.selector, save->cs.attrib,
+ save->cs.limit, save->cs.base);
+ pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ss.selector, save->ss.attrib,
+ save->ss.limit, save->ss.base);
+ pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ds.selector, save->ds.attrib,
+ save->ds.limit, save->ds.base);
+ pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->fs.selector, save->fs.attrib,
+ save->fs.limit, save->fs.base);
+ pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->gs.selector, save->gs.attrib,
+ save->gs.limit, save->gs.base);
+ pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->gdtr.selector, save->gdtr.attrib,
+ save->gdtr.limit, save->gdtr.base);
+ pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->ldtr.selector, save->ldtr.attrib,
+ save->ldtr.limit, save->ldtr.base);
+ pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->idtr.selector, save->idtr.attrib,
+ save->idtr.limit, save->idtr.base);
+ pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n",
+ save->tr.selector, save->tr.attrib,
+ save->tr.limit, save->tr.base);
+ pr_err("cpl: %d efer: %016llx\n",
+ save->cpl, save->efer);
+ pr_err("cr0: %016llx cr2: %016llx\n",
+ save->cr0, save->cr2);
+ pr_err("cr3: %016llx cr4: %016llx\n",
+ save->cr3, save->cr4);
+ pr_err("dr6: %016llx dr7: %016llx\n",
+ save->dr6, save->dr7);
+ pr_err("rip: %016llx rflags: %016llx\n",
+ save->rip, save->rflags);
+ pr_err("rsp: %016llx rax: %016llx\n",
+ save->rsp, save->rax);
+ pr_err("star: %016llx lstar: %016llx\n",
+ save->star, save->lstar);
+ pr_err("cstar: %016llx sfmask: %016llx\n",
+ save->cstar, save->sfmask);
+ pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n",
+ save->kernel_gs_base, save->sysenter_cs);
+ pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n",
+ save->sysenter_esp, save->sysenter_eip);
+ pr_err("gpat: %016llx dbgctl: %016llx\n",
+ save->g_pat, save->dbgctl);
+ pr_err("br_from: %016llx br_to: %016llx\n",
+ save->br_from, save->br_to);
+ pr_err("excp_from: %016llx excp_to: %016llx\n",
+ save->last_excp_from, save->last_excp_to);
+
+}
+
static int handle_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -2770,6 +2864,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
= svm->vmcb->control.exit_code;
+ pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
+ dump_vmcb(vcpu);
return 0;
}
@@ -2826,9 +2922,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
{
struct vmcb_control_area *control;
- trace_kvm_inj_virq(irq);
-
- ++svm->vcpu.stat.irq_injections;
control = &svm->vmcb->control;
control->int_vector = irq;
control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -2842,6 +2935,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
BUG_ON(!(gif_set(svm)));
+ trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+ ++vcpu->stat.irq_injections;
+
svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
}
@@ -3327,6 +3423,11 @@ static bool svm_rdtscp_supported(void)
return false;
}
+static bool svm_has_wbinvd_exit(void)
+{
+ return true;
+}
+
static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3411,6 +3512,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
+
+ .has_wbinvd_exit = svm_has_wbinvd_exit,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 4ddadb1a5ffe..e16a0dbe74d8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -1,3 +1,17 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * timer support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/hrtimer.h>
@@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
atomic_inc(&ktimer->pending);
/* FIXME: this code should not know anything about vcpus */
- set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
}
if (waitqueue_active(q))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee03679efe78..49b25eee25ac 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,6 +5,7 @@
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
@@ -36,6 +37,8 @@
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
#include "trace.h"
@@ -63,6 +66,9 @@ module_param_named(unrestricted_guest,
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
+static int __read_mostly vmm_exclusive = 1;
+module_param(vmm_exclusive, bool, S_IRUGO);
+
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
@@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
+static void kvm_cpu_vmxon(u64 addr);
+static void kvm_cpu_vmxoff(void);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
@@ -231,14 +240,14 @@ static u64 host_efer;
static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
/*
- * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
+ * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
* away by decrementing the array size.
*/
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
- MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
+ MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
@@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+ return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
static inline bool cpu_has_vmx_invept_individual_addr(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void)
return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+ return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
static inline bool cpu_has_vmx_ept(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void)
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_WBINVD_EXITING;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs)
vmcs, phys_addr);
}
+static void vmcs_load(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(vmcs);
+ u8 error;
+
+ asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+ : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "cc", "memory");
+ if (error)
+ printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+ vmcs, phys_addr);
+}
+
static void __vcpu_clear(void *arg)
{
struct vcpu_vmx *vmx = arg;
@@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
}
-static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
{
if (vmx->vpid == 0)
return;
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+ if (cpu_has_vmx_invvpid_single())
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+ if (cpu_has_vmx_invvpid_global())
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+{
+ if (cpu_has_vmx_invvpid_single())
+ vpid_sync_vcpu_single(vmx);
+ else
+ vpid_sync_vcpu_global();
}
static inline void ept_sync_global(void)
@@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
}
#endif
+ if (current_thread_info()->status & TS_USEDFPU)
+ clts();
+ load_gdt(&__get_cpu_var(host_gdt));
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u64 phys_addr = __pa(vmx->vmcs);
u64 tsc_this, delta, new_offset;
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- if (vcpu->cpu != cpu) {
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(phys_addr);
+ else if (vcpu->cpu != cpu)
vcpu_clear(vmx);
- kvm_migrate_timers(vcpu);
- set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
- local_irq_disable();
- list_add(&vmx->local_vcpus_link,
- &per_cpu(vcpus_on_cpu, cpu));
- local_irq_enable();
- }
if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
- u8 error;
-
per_cpu(current_vmcs, cpu) = vmx->vmcs;
- asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
- : "cc");
- if (error)
- printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
- vmx->vmcs, phys_addr);
+ vmcs_load(vmx->vmcs);
}
if (vcpu->cpu != cpu) {
struct desc_ptr dt;
unsigned long sysenter_esp;
+ kvm_migrate_timers(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ local_irq_disable();
+ list_add(&vmx->local_vcpus_link,
+ &per_cpu(vcpus_on_cpu, cpu));
+ local_irq_enable();
+
vcpu->cpu = cpu;
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
@@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
__vmx_load_host_state(to_vmx(vcpu));
+ if (!vmm_exclusive) {
+ __vcpu_clear(to_vmx(vcpu));
+ kvm_cpu_vmxoff();
+ }
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -1057,10 +1117,10 @@ static void setup_msrs(struct vcpu_vmx *vmx)
if (index >= 0 && vmx->rdtscp_enabled)
move_msr_up(vmx, index, save_nmsrs++);
/*
- * MSR_K6_STAR is only needed on long mode guests, and only
+ * MSR_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
*/
- index = __find_msr_index(vmx, MSR_K6_STAR);
+ index = __find_msr_index(vmx, MSR_STAR);
if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
}
@@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void)
/* locked but not enabled */
}
+static void kvm_cpu_vmxon(u64 addr)
+{
+ asm volatile (ASM_VMX_VMXON_RAX
+ : : "a"(&addr), "m"(addr)
+ : "memory", "cc");
+}
+
static int hardware_enable(void *garbage)
{
int cpu = raw_smp_processor_id();
@@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage)
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
- asm volatile (ASM_VMX_VMXON_RAX
- : : "a"(&phys_addr), "m"(phys_addr)
- : "memory", "cc");
- ept_sync_global();
+ if (vmm_exclusive) {
+ kvm_cpu_vmxon(phys_addr);
+ ept_sync_global();
+ }
+
+ store_gdt(&__get_cpu_var(host_gdt));
return 0;
}
@@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void)
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
static void hardware_disable(void *garbage)
{
- vmclear_local_vcpus();
- kvm_cpu_vmxoff();
+ if (vmm_exclusive) {
+ vmclear_local_vcpus();
+ kvm_cpu_vmxoff();
+ }
+ write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -1539,7 +1610,8 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_vpid())
enable_vpid = 0;
- if (!cpu_has_vmx_ept()) {
+ if (!cpu_has_vmx_ept() ||
+ !cpu_has_vmx_ept_4levels()) {
enable_ept = 0;
enable_unrestricted_guest = 0;
}
@@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
gfn_t base_gfn;
slots = kvm_memslots(kvm);
- base_gfn = kvm->memslots->memslots[0].base_gfn +
+ base_gfn = slots->memslots[0].base_gfn +
kvm->memslots->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
}
@@ -1759,9 +1831,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
- vpid_sync_vcpu_all(to_vmx(vcpu));
- if (enable_ept)
+ vpid_sync_context(to_vmx(vcpu));
+ if (enable_ept) {
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+ }
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -2507,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
- vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
+ vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */
vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
@@ -2599,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
static int init_rmode(struct kvm *kvm)
{
+ int idx, ret = 0;
+
+ idx = srcu_read_lock(&kvm->srcu);
if (!init_rmode_tss(kvm))
- return 0;
+ goto exit;
if (!init_rmode_identity_map(kvm))
- return 0;
- return 1;
+ goto exit;
+
+ ret = 1;
+exit:
+ srcu_read_unlock(&kvm->srcu, idx);
+ return ret;
}
static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 msr;
- int ret, idx;
+ int ret;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
- idx = srcu_read_lock(&vcpu->kvm->srcu);
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
@@ -2630,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
- fx_init(&vmx->vcpu);
+ ret = fx_init(&vmx->vcpu);
+ if (ret != 0)
+ goto out;
seg_setup(VCPU_SREG_CS);
/*
@@ -2713,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx_fpu_activate(&vmx->vcpu);
update_exception_bitmap(&vmx->vcpu);
- vpid_sync_vcpu_all(vmx);
+ vpid_sync_context(vmx);
ret = 0;
@@ -2721,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->emulation_required = 0;
out:
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
}
@@ -2826,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
{
if (!cpu_has_virtual_nmis())
return to_vmx(vcpu)->soft_vnmi_blocked;
- else
- return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- GUEST_INTR_STATE_NMI);
+ return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
}
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3070,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
++vcpu->stat.io_exits;
if (string || in)
- return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+ return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -3090,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
+static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ skip_emulated_instruction(vcpu);
+}
+
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
int cr;
int reg;
+ int err;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
cr = exit_qualification & 15;
@@ -3105,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu)
trace_kvm_cr_write(cr, val);
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, val);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr0(vcpu, val);
+ complete_insn_gp(vcpu, err);
return 1;
case 3:
- kvm_set_cr3(vcpu, val);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr3(vcpu, val);
+ complete_insn_gp(vcpu, err);
return 1;
case 4:
- kvm_set_cr4(vcpu, val);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr4(vcpu, val);
+ complete_insn_gp(vcpu, err);
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
@@ -3321,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
- /* TODO: Add support for VT-d/pass-through device */
+ kvm_emulate_wbinvd(vcpu);
return 1;
}
-static int handle_apic_access(struct kvm_vcpu *vcpu)
+static int handle_xsetbv(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification;
- enum emulation_result er;
- unsigned long offset;
+ u64 new_bv = kvm_read_edx_eax(vcpu);
+ u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- offset = exit_qualification & 0xffful;
-
- er = emulate_instruction(vcpu, 0, 0, 0);
-
- if (er != EMULATE_DONE) {
- printk(KERN_ERR
- "Fail to handle apic access vmexit! Offset is 0x%lx\n",
- offset);
- return -ENOEXEC;
- }
+ if (kvm_set_xcr(vcpu, index, new_bv) == 0)
+ skip_emulated_instruction(vcpu);
return 1;
}
+static int handle_apic_access(struct kvm_vcpu *vcpu)
+{
+ return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+}
+
static int handle_task_switch(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3554,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
goto out;
}
- if (err != EMULATE_DONE) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
- ret = 0;
- goto out;
- }
+ if (err != EMULATE_DONE)
+ return 0;
if (signal_pending(current))
goto out;
@@ -3623,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
+ [EXIT_REASON_XSETBV] = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
@@ -3656,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if (enable_ept && is_paging(vcpu))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ vcpu->run->fail_entry.hardware_entry_failure_reason
+ = exit_reason;
+ return 0;
+ }
+
if (unlikely(vmx->fail)) {
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3861,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- /*
- * Loading guest fpu may have cleared host cr0.ts
- */
- vmcs_writel(HOST_CR0, read_cr0());
-
asm(
/* Store host registers */
"push %%"R"dx; push %%"R"bp;"
@@ -4001,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
kmem_cache_free(kvm_vcpu_cache, vmx);
}
+static inline void vmcs_init(struct vmcs *vmcs)
+{
+ u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
+
+ if (!vmm_exclusive)
+ kvm_cpu_vmxon(phys_addr);
+
+ vmcs_clear(vmcs);
+
+ if (!vmm_exclusive)
+ kvm_cpu_vmxoff();
+}
+
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
@@ -4026,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx->vmcs)
goto free_msrs;
- vmcs_clear(vmx->vmcs);
+ vmcs_init(vmx->vmcs);
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4265,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.rdtscp_supported = vmx_rdtscp_supported,
.set_supported_cpuid = vmx_set_supported_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fa89c39c64f..25f19078b321 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,6 +6,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright (C) 2008 Qumranet, Inc.
* Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
@@ -41,17 +42,19 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
+#include <linux/uaccess.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
#include <asm/debugreg.h>
-#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -62,6 +65,7 @@
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
+ | X86_CR4_OSXSAVE \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+u64 __read_mostly host_xcr0;
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
@@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
prev_nr = vcpu->arch.exception.nr;
if (prev_nr == DF_VECTOR) {
/* triple fault -> shutdown */
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
class1 = exception_class(prev_nr);
@@ -414,121 +425,163 @@ out:
return changed;
}
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
+ X86_CR0_CD | X86_CR0_NW;
+
cr0 |= X86_CR0_ET;
#ifdef CONFIG_X86_64
- if (cr0 & 0xffffffff00000000UL) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr0 & 0xffffffff00000000UL)
+ return 1;
#endif
cr0 &= ~CR0_RESERVED_BITS;
- if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+ return 1;
- if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+ return 1;
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
if ((vcpu->arch.efer & EFER_LME)) {
int cs_db, cs_l;
- if (!is_pae(vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (!is_pae(vcpu))
+ return 1;
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l) {
- kvm_inject_gp(vcpu, 0);
- return;
-
- }
+ if (cs_l)
+ return 1;
} else
#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
-
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
+ return 1;
}
kvm_x86_ops->set_cr0(vcpu, cr0);
- kvm_mmu_reset_context(vcpu);
- return;
+ if ((cr0 ^ old_cr0) & update_bits)
+ kvm_mmu_reset_context(vcpu);
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+ (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+ u64 xcr0;
- if (cr4 & CR4_RESERVED_BITS) {
+ /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
+ if (index != XCR_XFEATURE_ENABLED_MASK)
+ return 1;
+ xcr0 = xcr;
+ if (kvm_x86_ops->get_cpl(vcpu) != 0)
+ return 1;
+ if (!(xcr0 & XSTATE_FP))
+ return 1;
+ if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+ return 1;
+ if (xcr0 & ~host_xcr0)
+ return 1;
+ vcpu->arch.xcr0 = xcr0;
+ vcpu->guest_xcr0_loaded = 0;
+ return 0;
+}
+
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+ if (__kvm_set_xcr(vcpu, index, xcr)) {
kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
+static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static void update_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (!best)
return;
+
+ /* Update OSXSAVE bit */
+ if (cpu_has_xsave && best->function == 0x1) {
+ best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+ best->ecx |= bit(X86_FEATURE_OSXSAVE);
}
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
+ if (cr4 & CR4_RESERVED_BITS)
+ return 1;
+
+ if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+ return 1;
if (is_long_mode(vcpu)) {
- if (!(cr4 & X86_CR4_PAE)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (!(cr4 & X86_CR4_PAE))
+ return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ && !load_pdptrs(vcpu, vcpu->arch.cr3))
+ return 1;
+
+ if (cr4 & X86_CR4_VMXE)
+ return 1;
- if (cr4 & X86_CR4_VMXE) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
kvm_x86_ops->set_cr4(vcpu, cr4);
- vcpu->arch.cr4 = cr4;
- kvm_mmu_reset_context(vcpu);
+
+ if ((cr4 ^ old_cr4) & pdptr_bits)
+ kvm_mmu_reset_context(vcpu);
+
+ if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+ update_cpuid(vcpu);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
- return;
+ return 0;
}
if (is_long_mode(vcpu)) {
- if (cr3 & CR3_L_MODE_RESERVED_BITS) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr3 & CR3_L_MODE_RESERVED_BITS)
+ return 1;
} else {
if (is_pae(vcpu)) {
- if (cr3 & CR3_PAE_RESERVED_BITS) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
- if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr3 & CR3_PAE_RESERVED_BITS)
+ return 1;
+ if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+ return 1;
}
/*
* We don't check reserved bits in nonpae mode, because
@@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
* to debug) behavior on the guest side.
*/
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
- kvm_inject_gp(vcpu, 0);
- else {
- vcpu->arch.cr3 = cr3;
- vcpu->arch.mmu.new_cr3(vcpu);
- }
+ return 1;
+ vcpu->arch.cr3 = cr3;
+ vcpu->arch.mmu.new_cr3(vcpu);
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- if (cr8 & CR8_RESERVED_BITS) {
- kvm_inject_gp(vcpu, 0);
- return;
- }
+ if (cr8 & CR8_RESERVED_BITS)
+ return 1;
if (irqchip_in_kernel(vcpu->kvm))
kvm_lapic_set_tpr(vcpu, cr8);
else
vcpu->arch.cr8 = cr8;
+ return 0;
+}
+
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+ if (__kvm_set_cr8(vcpu, cr8))
+ kvm_inject_gp(vcpu, 0);
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);
@@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
switch (dr) {
case 0 ... 3:
@@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
vcpu->arch.eff_db[dr] = val;
break;
case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return 1; /* #UD */
/* fall through */
case 6:
- if (val & 0xffffffff00000000ULL) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
break;
case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return 1; /* #UD */
/* fall through */
default: /* 7 */
- if (val & 0xffffffff00000000ULL) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
+ if (val & 0xffffffff00000000ULL)
+ return -1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
return 0;
}
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+ int res;
+
+ res = __kvm_set_dr(vcpu, dr, val);
+ if (res > 0)
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ else if (res < 0)
+ kvm_inject_gp(vcpu, 0);
+
+ return res;
+}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
switch (dr) {
case 0 ... 3:
*val = vcpu->arch.db[dr];
break;
case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
return 1;
- }
/* fall through */
case 6:
*val = vcpu->arch.dr6;
break;
case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
return 1;
- }
/* fall through */
default: /* 7 */
*val = vcpu->arch.dr7;
@@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_get_dr);
-static inline u32 bit(int bitno)
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
- return 1 << (bitno & 31);
+ if (_kvm_get_dr(vcpu, dr, val)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ return 0;
}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -671,7 +733,7 @@ static u32 msrs_to_save[] = {
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_APIC_ASSIST_PAGE,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_K6_STAR,
+ MSR_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
@@ -682,10 +744,14 @@ static unsigned num_msrs_to_save;
static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
};
static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
+ u64 old_efer = vcpu->arch.efer;
+
if (efer & efer_reserved_bits)
return 1;
@@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
- vcpu->arch.efer = efer;
-
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
kvm_mmu_reset_context(vcpu);
+ /* Update reserved bits */
+ if ((efer ^ old_efer) & EFER_NX)
+ kvm_mmu_reset_context(vcpu);
+
return 0;
}
@@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
if (!vcpu->time_page)
return 0;
- set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+ kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
return 1;
}
@@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
{
int i, idx;
- vcpu_load(vcpu);
-
idx = srcu_read_lock(&vcpu->kvm->srcu);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
srcu_read_unlock(&vcpu->kvm->srcu, idx);
- vcpu_put(vcpu);
-
return i;
}
@@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
+ case KVM_CAP_XSAVE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
break;
+ case KVM_CAP_XCRS:
+ r = cpu_has_xsave;
+ break;
default:
r = 0;
break;
@@ -1717,8 +1785,28 @@ out:
return r;
}
+static void wbinvd_ipi(void *garbage)
+{
+ wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.iommu_domain &&
+ !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ /* Address WBINVD may be executed by guest */
+ if (need_emulate_wbinvd(vcpu)) {
+ if (kvm_x86_ops->has_wbinvd_exit())
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+ else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+ smp_call_function_single(vcpu->cpu,
+ wbinvd_ipi, NULL, 1);
+ }
+
kvm_x86_ops->vcpu_load(vcpu, cpu);
if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
unsigned long khz = cpufreq_quick_get(cpu);
@@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
- kvm_put_guest_fpu(vcpu);
kvm_x86_ops->vcpu_put(vcpu);
+ kvm_put_guest_fpu(vcpu);
}
static int is_efer_nx(void)
@@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
if (copy_from_user(cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry)))
goto out_free;
- vcpu_load(vcpu);
for (i = 0; i < cpuid->nent; i++) {
vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
r = 0;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- vcpu_put(vcpu);
+ update_cpuid(vcpu);
out_free:
vfree(cpuid_entries);
@@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
- vcpu_load(vcpu);
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- vcpu_put(vcpu);
+ update_cpuid(vcpu);
return 0;
out:
@@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
{
int r;
- vcpu_load(vcpu);
r = -E2BIG;
if (cpuid->nent < vcpu->arch.cpuid_nent)
goto out;
@@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
out:
cpuid->nent = vcpu->arch.cpuid_nent;
- vcpu_put(vcpu);
return r;
}
@@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
const u32 kvm_supported_word4_x86_features =
- F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
0 /* DS-CPL, VMX, SMX, EST */ |
0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
0 /* Reserved, DCA */ | F(XMM4_1) |
F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved, XSAVE, OSXSAVE */;
+ 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
/* cpuid 0x80000001.ecx */
const u32 kvm_supported_word6_x86_features =
F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
switch (function) {
case 0:
- entry->eax = min(entry->eax, (u32)0xb);
+ entry->eax = min(entry->eax, (u32)0xd);
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
@@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
+ case 0xd: {
+ int i;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ for (i = 1; *nent < maxnent; ++i) {
+ if (entry[i - 1].eax == 0 && i != 2)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
case KVM_CPUID_SIGNATURE: {
char signature[12] = "KVMKVMKVM\0\0";
u32 *sigptr = (u32 *)signature;
@@ -2081,9 +2179,7 @@ out:
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- vcpu_load(vcpu);
memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
- vcpu_put(vcpu);
return 0;
}
@@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- vcpu_load(vcpu);
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
kvm_apic_post_state_restore(vcpu);
update_cr8_intercept(vcpu);
- vcpu_put(vcpu);
return 0;
}
@@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return -EINVAL;
if (irqchip_in_kernel(vcpu->kvm))
return -ENXIO;
- vcpu_load(vcpu);
kvm_queue_interrupt(vcpu, irq->irq, false);
- vcpu_put(vcpu);
-
return 0;
}
static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
{
- vcpu_load(vcpu);
kvm_inject_nmi(vcpu);
- vcpu_put(vcpu);
return 0;
}
@@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
int r;
unsigned bank_num = mcg_cap & 0xff, bank;
- vcpu_load(vcpu);
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
@@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
out:
- vcpu_put(vcpu);
return r;
}
@@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
printk(KERN_DEBUG "kvm: set_mce: "
"injects mce exception while "
"previous one is in progress!\n");
- set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return 0;
}
if (banks[1] & MCI_STATUS_VAL)
@@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
- vcpu_load(vcpu);
-
events->exception.injected =
vcpu->arch.exception.pending &&
!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SIPI_VECTOR
| KVM_VCPUEVENT_VALID_SHADOW);
-
- vcpu_put(vcpu);
}
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
| KVM_VCPUEVENT_VALID_SHADOW))
return -EINVAL;
- vcpu_load(vcpu);
-
vcpu->arch.exception.pending = events->exception.injected;
vcpu->arch.exception.nr = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
vcpu->arch.sipi_vector = events->sipi_vector;
- vcpu_put(vcpu);
-
return 0;
}
static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
struct kvm_debugregs *dbgregs)
{
- vcpu_load(vcpu);
-
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
dbgregs->dr6 = vcpu->arch.dr6;
dbgregs->dr7 = vcpu->arch.dr7;
dbgregs->flags = 0;
-
- vcpu_put(vcpu);
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
if (dbgregs->flags)
return -EINVAL;
- vcpu_load(vcpu);
-
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
vcpu->arch.dr6 = dbgregs->dr6;
vcpu->arch.dr7 = dbgregs->dr7;
- vcpu_put(vcpu);
+ return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ if (cpu_has_xsave)
+ memcpy(guest_xsave->region,
+ &vcpu->arch.guest_fpu.state->xsave,
+ sizeof(struct xsave_struct));
+ else {
+ memcpy(guest_xsave->region,
+ &vcpu->arch.guest_fpu.state->fxsave,
+ sizeof(struct i387_fxsave_struct));
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+ XSTATE_FPSSE;
+ }
+}
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+ struct kvm_xsave *guest_xsave)
+{
+ u64 xstate_bv =
+ *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+ if (cpu_has_xsave)
+ memcpy(&vcpu->arch.guest_fpu.state->xsave,
+ guest_xsave->region, sizeof(struct xsave_struct));
+ else {
+ if (xstate_bv & ~XSTATE_FPSSE)
+ return -EINVAL;
+ memcpy(&vcpu->arch.guest_fpu.state->fxsave,
+ guest_xsave->region, sizeof(struct i387_fxsave_struct));
+ }
return 0;
}
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ if (!cpu_has_xsave) {
+ guest_xcrs->nr_xcrs = 0;
+ return;
+ }
+
+ guest_xcrs->nr_xcrs = 1;
+ guest_xcrs->flags = 0;
+ guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+ guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+ struct kvm_xcrs *guest_xcrs)
+{
+ int i, r = 0;
+
+ if (!cpu_has_xsave)
+ return -EINVAL;
+
+ if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+ return -EINVAL;
+
+ for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+ /* Only support XCR0 currently */
+ if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+ r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+ guest_xcrs->xcrs[0].value);
+ break;
+ }
+ if (r)
+ r = -EINVAL;
+ return r;
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
- struct kvm_lapic_state *lapic = NULL;
+ union {
+ struct kvm_lapic_state *lapic;
+ struct kvm_xsave *xsave;
+ struct kvm_xcrs *xcrs;
+ void *buffer;
+ } u;
+ u.buffer = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!lapic)
+ if (!u.lapic)
goto out;
- r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
+ r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
+ if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
goto out;
r = 0;
break;
@@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
r = -ENOMEM;
- if (!lapic)
+ if (!u.lapic)
goto out;
r = -EFAULT;
- if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
+ if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
goto out;
- r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
+ r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
if (r)
goto out;
r = 0;
@@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EFAULT;
if (copy_from_user(&mce, argp, sizeof mce))
goto out;
- vcpu_load(vcpu);
r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
- vcpu_put(vcpu);
break;
}
case KVM_GET_VCPU_EVENTS: {
@@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
break;
}
+ case KVM_GET_XSAVE: {
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XSAVE: {
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xsave)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+ break;
+ }
+ case KVM_GET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+
+ r = -EFAULT;
+ if (copy_to_user(argp, u.xcrs,
+ sizeof(struct kvm_xcrs)))
+ break;
+ r = 0;
+ break;
+ }
+ case KVM_SET_XCRS: {
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.xcrs)
+ break;
+
+ r = -EFAULT;
+ if (copy_from_user(u.xcrs, argp,
+ sizeof(struct kvm_xcrs)))
+ break;
+
+ r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+ break;
+ }
default:
r = -EINVAL;
}
out:
- kfree(lapic);
+ kfree(u.buffer);
return r;
}
@@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
return kvm->arch.n_alloc_mmu_pages;
}
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
-{
- int i;
- struct kvm_mem_alias *alias;
- struct kvm_mem_aliases *aliases;
-
- aliases = kvm_aliases(kvm);
-
- for (i = 0; i < aliases->naliases; ++i) {
- alias = &aliases->aliases[i];
- if (alias->flags & KVM_ALIAS_INVALID)
- continue;
- if (gfn >= alias->base_gfn
- && gfn < alias->base_gfn + alias->npages)
- return alias->target_gfn + gfn - alias->base_gfn;
- }
- return gfn;
-}
-
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
- int i;
- struct kvm_mem_alias *alias;
- struct kvm_mem_aliases *aliases;
-
- aliases = kvm_aliases(kvm);
-
- for (i = 0; i < aliases->naliases; ++i) {
- alias = &aliases->aliases[i];
- if (gfn >= alias->base_gfn
- && gfn < alias->base_gfn + alias->npages)
- return alias->target_gfn + gfn - alias->base_gfn;
- }
- return gfn;
-}
-
-/*
- * Set a new alias region. Aliases map a portion of physical memory into
- * another portion. This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
- struct kvm_memory_alias *alias)
-{
- int r, n;
- struct kvm_mem_alias *p;
- struct kvm_mem_aliases *aliases, *old_aliases;
-
- r = -EINVAL;
- /* General sanity checks */
- if (alias->memory_size & (PAGE_SIZE - 1))
- goto out;
- if (alias->guest_phys_addr & (PAGE_SIZE - 1))
- goto out;
- if (alias->slot >= KVM_ALIAS_SLOTS)
- goto out;
- if (alias->guest_phys_addr + alias->memory_size
- < alias->guest_phys_addr)
- goto out;
- if (alias->target_phys_addr + alias->memory_size
- < alias->target_phys_addr)
- goto out;
-
- r = -ENOMEM;
- aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
- if (!aliases)
- goto out;
-
- mutex_lock(&kvm->slots_lock);
-
- /* invalidate any gfn reference in case of deletion/shrinking */
- memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
- aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
- old_aliases = kvm->arch.aliases;
- rcu_assign_pointer(kvm->arch.aliases, aliases);
- synchronize_srcu_expedited(&kvm->srcu);
- kvm_mmu_zap_all(kvm);
- kfree(old_aliases);
-
- r = -ENOMEM;
- aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
- if (!aliases)
- goto out_unlock;
-
- memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-
- p = &aliases->aliases[alias->slot];
- p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
- p->npages = alias->memory_size >> PAGE_SHIFT;
- p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
- p->flags &= ~(KVM_ALIAS_INVALID);
-
- for (n = KVM_ALIAS_SLOTS; n > 0; --n)
- if (aliases->aliases[n - 1].npages)
- break;
- aliases->naliases = n;
-
- old_aliases = kvm->arch.aliases;
- rcu_assign_pointer(kvm->arch.aliases, aliases);
- synchronize_srcu_expedited(&kvm->srcu);
- kfree(old_aliases);
- r = 0;
-
-out_unlock:
- mutex_unlock(&kvm->slots_lock);
-out:
- return r;
-}
-
static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
{
int r;
@@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
unsigned long n;
unsigned long is_dirty = 0;
- unsigned long *dirty_bitmap = NULL;
mutex_lock(&kvm->slots_lock);
@@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
n = kvm_dirty_bitmap_bytes(memslot);
- r = -ENOMEM;
- dirty_bitmap = vmalloc(n);
- if (!dirty_bitmap)
- goto out;
- memset(dirty_bitmap, 0, n);
-
for (i = 0; !is_dirty && i < n/sizeof(long); i++)
is_dirty = memslot->dirty_bitmap[i];
/* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
struct kvm_memslots *slots, *old_slots;
+ unsigned long *dirty_bitmap;
spin_lock(&kvm->mmu_lock);
kvm_mmu_slot_remove_write_access(kvm, log->slot);
spin_unlock(&kvm->mmu_lock);
- slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
- if (!slots)
- goto out_free;
+ r = -ENOMEM;
+ dirty_bitmap = vmalloc(n);
+ if (!dirty_bitmap)
+ goto out;
+ memset(dirty_bitmap, 0, n);
+ r = -ENOMEM;
+ slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ if (!slots) {
+ vfree(dirty_bitmap);
+ goto out;
+ }
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
@@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
synchronize_srcu_expedited(&kvm->srcu);
dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
kfree(old_slots);
+
+ r = -EFAULT;
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
+ vfree(dirty_bitmap);
+ goto out;
+ }
+ vfree(dirty_bitmap);
+ } else {
+ r = -EFAULT;
+ if (clear_user(log->dirty_bitmap, n))
+ goto out;
}
r = 0;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
- r = -EFAULT;
-out_free:
- vfree(dirty_bitmap);
out:
mutex_unlock(&kvm->slots_lock);
return r;
@@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
union {
struct kvm_pit_state ps;
struct kvm_pit_state2 ps2;
- struct kvm_memory_alias alias;
struct kvm_pit_config pit_config;
} u;
@@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
goto out;
break;
}
- case KVM_SET_MEMORY_REGION: {
- struct kvm_memory_region kvm_mem;
- struct kvm_userspace_memory_region kvm_userspace_mem;
-
- r = -EFAULT;
- if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
- goto out;
- kvm_userspace_mem.slot = kvm_mem.slot;
- kvm_userspace_mem.flags = kvm_mem.flags;
- kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
- kvm_userspace_mem.memory_size = kvm_mem.memory_size;
- r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
- if (r)
- goto out;
- break;
- }
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
if (r)
@@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
- case KVM_SET_MEMORY_ALIAS:
- r = -EFAULT;
- if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
- goto out;
- r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
- if (r)
- goto out;
- break;
case KVM_CREATE_IRQCHIP: {
struct kvm_pic *vpic;
@@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
}
ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
if (ret < 0) {
- r = X86EMUL_UNHANDLEABLE;
+ r = X86EMUL_IO_NEEDED;
goto out;
}
@@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
}
ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
if (ret < 0) {
- r = X86EMUL_UNHANDLEABLE;
+ r = X86EMUL_IO_NEEDED;
goto out;
}
@@ -3330,10 +3407,10 @@ out:
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- u32 error_code;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
- if (gpa == UNMAPPED_GVA) {
- kvm_inject_page_fault(vcpu, addr, error_code);
+ if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3370,11 +3445,12 @@ mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 0;
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+ vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
- return X86EMUL_UNHANDLEABLE;
+ return X86EMUL_IO_NEEDED;
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- u32 error_code;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
- if (gpa == UNMAPPED_GVA) {
- kvm_inject_page_fault(vcpu, addr, error_code);
+ if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- }
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3420,10 +3494,11 @@ mmio:
return X86EMUL_CONTINUE;
vcpu->mmio_needed = 1;
- vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->mmio_is_write = 1;
- memcpy(vcpu->mmio_data, val, bytes);
+ vcpu->run->exit_reason = KVM_EXIT_MMIO;
+ vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+ vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+ memcpy(vcpu->run->mmio.data, val, bytes);
return X86EMUL_CONTINUE;
}
@@ -3431,6 +3506,7 @@ mmio:
int emulator_write_emulated(unsigned long addr,
const void *val,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
@@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr,
int rc, now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+ rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+ vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+ return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+ vcpu);
}
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
#define CMPXCHG_TYPE(t, ptr, old, new) \
(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
@@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
+ unsigned int *error_code,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
goto emul_write;
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ kvm_release_page_clean(page);
+ goto emul_write;
+ }
kaddr = kmap_atomic(page, KM_USER0);
kaddr += offset_in_page(gpa);
@@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
- return emulator_write_emulated(addr, new, bytes, vcpu);
+ return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
}
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
return X86EMUL_CONTINUE;
}
-int emulate_clts(struct kvm_vcpu *vcpu)
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
- kvm_x86_ops->fpu_activate(vcpu);
+ if (!need_emulate_wbinvd(vcpu))
+ return X86EMUL_CONTINUE;
+
+ if (kvm_x86_ops->has_wbinvd_exit()) {
+ smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+ wbinvd_ipi, NULL, 1);
+ cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+ }
+ wbinvd();
return X86EMUL_CONTINUE;
}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+int emulate_clts(struct kvm_vcpu *vcpu)
{
- return kvm_get_dr(ctxt->vcpu, dr, dest);
+ kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+ kvm_x86_ops->fpu_activate(vcpu);
+ return X86EMUL_CONTINUE;
}
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
{
- unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-
- return kvm_set_dr(ctxt->vcpu, dr, value & mask);
+ return _kvm_get_dr(vcpu, dr, dest);
}
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
{
- u8 opcodes[4];
- unsigned long rip = kvm_rip_read(vcpu);
- unsigned long rip_linear;
-
- if (!printk_ratelimit())
- return;
- rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
- kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
-
- printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
- context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+ return __kvm_set_dr(vcpu, dr, value);
}
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
static u64 mk_cr_64(u64 curr_cr, u32 new_val)
{
@@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
return value;
}
-static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
{
+ int res = 0;
+
switch (cr) {
case 0:
- kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+ res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
break;
case 2:
vcpu->arch.cr2 = val;
break;
case 3:
- kvm_set_cr3(vcpu, val);
+ res = kvm_set_cr3(vcpu, val);
break;
case 4:
- kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+ res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
- kvm_set_cr8(vcpu, val & 0xfUL);
+ res = __kvm_set_cr8(vcpu, val & 0xfUL);
break;
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+ res = -1;
}
+
+ return res;
}
static int emulator_get_cpl(struct kvm_vcpu *vcpu)
@@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
kvm_x86_ops->get_gdt(vcpu, dt);
}
+static unsigned long emulator_get_cached_segment_base(int seg,
+ struct kvm_vcpu *vcpu)
+{
+ return get_segment_base(vcpu, seg);
+}
+
static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
struct kvm_vcpu *vcpu)
{
@@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
kvm_set_segment(vcpu, &kvm_seg, seg);
}
-static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
- kvm_x86_ops->set_rflags(vcpu, rflags);
-}
-
static struct x86_emulate_ops emulate_ops = {
.read_std = kvm_read_guest_virt_system,
.write_std = kvm_write_guest_virt_system,
@@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = {
.set_cached_descriptor = emulator_set_cached_descriptor,
.get_segment_selector = emulator_get_segment_selector,
.set_segment_selector = emulator_set_segment_selector,
+ .get_cached_segment_base = emulator_get_cached_segment_base,
.get_gdt = emulator_get_gdt,
.get_cr = emulator_get_cr,
.set_cr = emulator_set_cr,
.cpl = emulator_get_cpl,
- .set_rflags = emulator_set_rflags,
+ .get_dr = emulator_get_dr,
+ .set_dr = emulator_set_dr,
+ .set_msr = kvm_set_msr,
+ .get_msr = kvm_get_msr,
};
static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
vcpu->arch.regs_dirty = ~0;
}
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+ u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+ /*
+ * an sti; sti; sequence only disable interrupts for the first
+ * instruction. So, if the last instruction, be it emulated or
+ * not, left the system with the INT_STI flag enabled, it
+ * means that the last instruction is an sti. We should not
+ * leave the flag on in this case. The same goes for mov ss
+ */
+ if (!(int_shadow & mask))
+ kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+}
+
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ if (ctxt->exception == PF_VECTOR)
+ kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
+ else if (ctxt->error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+ else
+ kvm_queue_exception(vcpu, ctxt->exception);
+}
+
+static int handle_emulation_failure(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.insn_emulation_fail;
+ trace_kvm_emulate_insn_failed(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return EMULATE_FAIL;
+}
+
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa;
+
+ if (tdp_enabled)
+ return false;
+
+ /*
+ * if emulation was due to access to shadowed page table
+ * and it failed try to unshadow page and re-entetr the
+ * guest to let CPU execute the instruction.
+ */
+ if (kvm_mmu_unprotect_page_virt(vcpu, gva))
+ return true;
+
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+
+ if (gpa == UNMAPPED_GVA)
+ return true; /* let cpu generate fault */
+
+ if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+ return true;
+
+ return false;
+}
+
int emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
u16 error_code,
int emulation_type)
{
- int r, shadow_mask;
- struct decode_cache *c;
- struct kvm_run *run = vcpu->run;
+ int r;
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
kvm_clear_exception_queue(vcpu);
vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
*/
cache_all_regs(vcpu);
- vcpu->mmio_is_write = 0;
-
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
int cs_db, cs_l;
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
@@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ memset(c, 0, sizeof(struct decode_cache));
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+ vcpu->arch.emulate_ctxt.interruptibility = 0;
+ vcpu->arch.emulate_ctxt.exception = -1;
r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
trace_kvm_emulate_insn_start(vcpu);
/* Only allow emulation of specific instructions on #UD
* (namely VMMCALL, sysenter, sysexit, syscall)*/
- c = &vcpu->arch.emulate_ctxt.decode;
if (emulation_type & EMULTYPE_TRAP_UD) {
if (!c->twobyte)
return EMULATE_FAIL;
@@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
++vcpu->stat.insn_emulation;
if (r) {
- ++vcpu->stat.insn_emulation_fail;
- trace_kvm_emulate_insn_failed(vcpu);
- if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+ if (reexecute_instruction(vcpu, cr2))
return EMULATE_DONE;
- return EMULATE_FAIL;
+ if (emulation_type & EMULTYPE_SKIP)
+ return EMULATE_FAIL;
+ return handle_emulation_failure(vcpu);
}
}
@@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
}
+ /* this is needed for vmware backdor interface to work since it
+ changes registers values during IO operation */
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+
restart:
r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
- shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
- if (r == 0)
- kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+ if (r) { /* emulation failed */
+ if (reexecute_instruction(vcpu, cr2))
+ return EMULATE_DONE;
- if (vcpu->arch.pio.count) {
- if (!vcpu->arch.pio.in)
- vcpu->arch.pio.count = 0;
- return EMULATE_DO_MMIO;
+ return handle_emulation_failure(vcpu);
}
- if (r || vcpu->mmio_is_write) {
- run->exit_reason = KVM_EXIT_MMIO;
- run->mmio.phys_addr = vcpu->mmio_phys_addr;
- memcpy(run->mmio.data, vcpu->mmio_data, 8);
- run->mmio.len = vcpu->mmio_size;
- run->mmio.is_write = vcpu->mmio_is_write;
+ toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
+ kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+
+ if (vcpu->arch.emulate_ctxt.exception >= 0) {
+ inject_emulated_exception(vcpu);
+ return EMULATE_DONE;
}
- if (r) {
- if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
- goto done;
- if (!vcpu->mmio_needed) {
- ++vcpu->stat.insn_emulation_fail;
- trace_kvm_emulate_insn_failed(vcpu);
- kvm_report_emulation_failure(vcpu, "mmio");
- return EMULATE_FAIL;
- }
+ if (vcpu->arch.pio.count) {
+ if (!vcpu->arch.pio.in)
+ vcpu->arch.pio.count = 0;
return EMULATE_DO_MMIO;
}
- if (vcpu->mmio_is_write) {
- vcpu->mmio_needed = 0;
+ if (vcpu->mmio_needed) {
+ if (vcpu->mmio_is_write)
+ vcpu->mmio_needed = 0;
return EMULATE_DO_MMIO;
}
-done:
- if (vcpu->arch.exception.pending)
- vcpu->arch.emulate_ctxt.restart = false;
-
if (vcpu->arch.emulate_ctxt.restart)
goto restart;
@@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque)
perf_register_guest_info_callbacks(&kvm_guest_cbs);
+ if (cpu_has_xsave)
+ host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
return 0;
out:
@@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
kvm_x86_ops->patch_hypercall(vcpu, instruction);
- return emulator_write_emulated(rip, instruction, 3, vcpu);
+ return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
}
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
@@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
}
}
+static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+ !vcpu->guest_xcr0_loaded) {
+ /* kvm_set_xcr() also depends on this */
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+ vcpu->guest_xcr0_loaded = 1;
+ }
+}
+
+static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->guest_xcr0_loaded) {
+ if (vcpu->arch.xcr0 != host_xcr0)
+ xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+ vcpu->guest_xcr0_loaded = 0;
+ }
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
- if (vcpu->requests)
- if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
- kvm_mmu_unload(vcpu);
-
- r = kvm_mmu_reload(vcpu);
- if (unlikely(r))
- goto out;
-
if (vcpu->requests) {
- if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+ kvm_mmu_unload(vcpu);
+ if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
- if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
kvm_write_guest_time(vcpu);
- if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
- if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
kvm_x86_ops->tlb_flush(vcpu);
- if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
- &vcpu->requests)) {
+ if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
goto out;
}
- if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+ if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
r = 0;
goto out;
}
- if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+ if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
}
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ goto out;
+
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
if (vcpu->fpu_active)
kvm_load_guest_fpu(vcpu);
+ kvm_load_guest_xcr0(vcpu);
- local_irq_disable();
+ atomic_set(&vcpu->guest_mode, 1);
+ smp_wmb();
- clear_bit(KVM_REQ_KICK, &vcpu->requests);
- smp_mb__after_clear_bit();
+ local_irq_disable();
- if (vcpu->requests || need_resched() || signal_pending(current)) {
- set_bit(KVM_REQ_KICK, &vcpu->requests);
+ if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
+ || need_resched() || signal_pending(current)) {
+ atomic_set(&vcpu->guest_mode, 0);
+ smp_wmb();
local_irq_enable();
preempt_enable();
r = 1;
@@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- set_bit(KVM_REQ_KICK, &vcpu->requests);
+ atomic_set(&vcpu->guest_mode, 0);
+ smp_wmb();
local_irq_enable();
++vcpu->stat.exits;
@@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+ if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
{
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
@@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
sigset_t sigsaved;
- vcpu_load(vcpu);
-
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r == EMULATE_DO_MMIO) {
+ if (r != EMULATE_DONE) {
r = 0;
goto out;
}
@@ -4759,14 +4924,11 @@ out:
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
- vcpu_put(vcpu);
return r;
}
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->rip = kvm_rip_read(vcpu);
regs->rflags = kvm_get_rflags(vcpu);
- vcpu_put(vcpu);
-
return 0;
}
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
- vcpu_load(vcpu);
-
kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.exception.pending = false;
- vcpu_put(vcpu);
-
return 0;
}
@@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
{
struct desc_ptr dt;
- vcpu_load(vcpu);
-
kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
- vcpu_put(vcpu);
-
return 0;
}
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- vcpu_load(vcpu);
mp_state->mp_state = vcpu->arch.mp_state;
- vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
- vcpu_load(vcpu);
vcpu->arch.mp_state = mp_state->mp_state;
- vcpu_put(vcpu);
return 0;
}
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
bool has_error_code, u32 error_code)
{
+ struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
int cs_db, cs_l, ret;
cache_all_regs(vcpu);
@@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
? X86EMUL_MODE_VM86 : cs_l
? X86EMUL_MODE_PROT64 : cs_db
? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ memset(c, 0, sizeof(struct decode_cache));
+ memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
tss_selector, reason, has_error_code,
@@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
if (ret)
return EMULATE_FAIL;
+ memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+ kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
return EMULATE_DONE;
}
@@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int pending_vec, max_bits;
struct desc_ptr dt;
- vcpu_load(vcpu);
-
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
kvm_x86_ops->set_idt(vcpu, &dt);
@@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- vcpu_put(vcpu);
-
return 0;
}
@@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
unsigned long rflags;
int i, r;
- vcpu_load(vcpu);
-
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
r = -EBUSY;
if (vcpu->arch.exception.pending)
- goto unlock_out;
+ goto out;
if (dbg->control & KVM_GUESTDBG_INJECT_DB)
kvm_queue_exception(vcpu, DB_VECTOR);
else
@@ -5054,34 +5201,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
r = 0;
-unlock_out:
- vcpu_put(vcpu);
+out:
return r;
}
/*
- * fxsave fpu state. Taken from x86_64/processor.h. To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
- u16 cwd;
- u16 swd;
- u16 twd;
- u16 fop;
- u64 rip;
- u64 rdp;
- u32 mxcsr;
- u32 mxcsr_mask;
- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
- u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
-/*
* Translate a guest virtual address to a guest physical address.
*/
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
@@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
gpa_t gpa;
int idx;
- vcpu_load(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
tr->valid = gpa != UNMAPPED_GVA;
tr->writeable = 1;
tr->usermode = 0;
- vcpu_put(vcpu);
return 0;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
- vcpu_load(vcpu);
+ struct i387_fxsave_struct *fxsave =
+ &vcpu->arch.guest_fpu.state->fxsave;
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
@@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fpu->last_dp = fxsave->rdp;
memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
- vcpu_put(vcpu);
-
return 0;
}
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
- vcpu_load(vcpu);
+ struct i387_fxsave_struct *fxsave =
+ &vcpu->arch.guest_fpu.state->fxsave;
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
fxsave->rdp = fpu->last_dp;
memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
- vcpu_put(vcpu);
-
return 0;
}
-void fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu)
{
- unsigned after_mxcsr_mask;
+ int err;
+
+ err = fpu_alloc(&vcpu->arch.guest_fpu);
+ if (err)
+ return err;
+
+ fpu_finit(&vcpu->arch.guest_fpu);
/*
- * Touch the fpu the first time in non atomic context as if
- * this is the first fpu instruction the exception handler
- * will fire before the instruction returns and it'll have to
- * allocate ram with GFP_KERNEL.
+ * Ensure guest xcr0 is valid for loading
*/
- if (!used_math())
- kvm_fx_save(&vcpu->arch.host_fx_image);
-
- /* Initialize guest FPU by resetting ours and saving into guest's */
- preempt_disable();
- kvm_fx_save(&vcpu->arch.host_fx_image);
- kvm_fx_finit();
- kvm_fx_save(&vcpu->arch.guest_fx_image);
- kvm_fx_restore(&vcpu->arch.host_fx_image);
- preempt_enable();
+ vcpu->arch.xcr0 = XSTATE_FP;
vcpu->arch.cr0 |= X86_CR0_ET;
- after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
- vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
- memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
- 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(fx_init);
+static void fx_free(struct kvm_vcpu *vcpu)
+{
+ fpu_free(&vcpu->arch.guest_fpu);
+}
+
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
if (vcpu->guest_fpu_loaded)
return;
+ /*
+ * Restore all possible states in the guest,
+ * and assume host would use all available bits.
+ * Guest xcr0 would be loaded later.
+ */
+ kvm_put_guest_xcr0(vcpu);
vcpu->guest_fpu_loaded = 1;
- kvm_fx_save(&vcpu->arch.host_fx_image);
- kvm_fx_restore(&vcpu->arch.guest_fx_image);
+ unlazy_fpu(current);
+ fpu_restore_checking(&vcpu->arch.guest_fpu);
trace_kvm_fpu(1);
}
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
+ kvm_put_guest_xcr0(vcpu);
+
if (!vcpu->guest_fpu_loaded)
return;
vcpu->guest_fpu_loaded = 0;
- kvm_fx_save(&vcpu->arch.guest_fx_image);
- kvm_fx_restore(&vcpu->arch.host_fx_image);
+ fpu_save_init(&vcpu->arch.guest_fpu);
++vcpu->stat.fpu_reload;
- set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+ kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
trace_kvm_fpu(0);
}
@@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
vcpu->arch.time_page = NULL;
}
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+ fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
@@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
int r;
- /* We do fxsave: this must be aligned. */
- BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
-
vcpu->arch.mtrr_state.have_fixed = 1;
vcpu_load(vcpu);
r = kvm_arch_vcpu_reset(vcpu);
@@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
+ fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
@@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+ goto fail_free_mce_banks;
+
return 0;
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
fail_free_lapic:
kvm_free_lapic(vcpu);
fail_mmu_destroy:
@@ -5364,12 +5490,6 @@ struct kvm *kvm_arch_create_vm(void)
if (!kvm)
return ERR_PTR(-ENOMEM);
- kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
- if (!kvm->arch.aliases) {
- kfree(kvm);
- return ERR_PTR(-ENOMEM);
- }
-
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_sync_events(struct kvm *kvm)
{
kvm_free_all_assigned_devices(kvm);
+ kvm_free_pit(kvm);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_iommu_unmap_guest(kvm);
- kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
@@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
cleanup_srcu_struct(&kvm->srcu);
- kfree(kvm->arch.aliases);
kfree(kvm);
}
@@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
int user_alloc)
{
int npages = memslot->npages;
+ int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+ /* Prevent internal slot pages from being moved by fork()/COW. */
+ if (memslot->id >= KVM_MEMORY_SLOTS)
+ map_flags = MAP_SHARED | MAP_ANONYMOUS;
/*To keep backward compatibility with older userspace,
*x86 needs to hanlde !user_alloc case.
@@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
userspace_addr = do_mmap(NULL, 0,
npages * PAGE_SIZE,
PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS,
+ map_flags,
0);
up_write(&current->mm->mmap_sem);
@@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
me = get_cpu();
if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
- if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+ if (atomic_xchg(&vcpu->guest_mode, 0))
smp_send_reschedule(cpu);
put_cpu();
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f4b54458285b..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
}
-static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
-{
- return rcu_dereference_check(kvm->arch.aliases,
- srcu_read_lock_held(&kvm->srcu)
- || lockdep_is_held(&kvm->slots_lock));
-}
-
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
OpenPOWER on IntegriCloud