diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/emulate.c | 749 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.c | 146 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 48 | ||||
-rw-r--r-- | arch/x86/kvm/irq.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/irq.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_cache_regs.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 17 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 807 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 252 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 147 | ||||
-rw-r--r-- | arch/x86/kvm/timer.c | 16 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 261 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 1176 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 7 |
16 files changed, 2173 insertions, 1473 deletions
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5ac0bb465ed6..b38bd8b92aa6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -9,6 +9,7 @@ * privileged instructions: * * Copyright (C) 2006 Qumranet + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> @@ -67,6 +68,9 @@ #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ #define SrcImmU (9<<4) /* Immediate operand, unsigned */ #define SrcSI (0xa<<4) /* Source is in the DS:RSI */ +#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ +#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ +#define SrcAcc (0xd<<4) /* Source Accumulator */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -88,10 +92,6 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) -#define Src2Imm16 (4<<29) -#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be - in memory and second argument is located - immediately after the first one in memory. */ #define Src2Mask (7<<29) enum { @@ -124,15 +124,15 @@ static u32 opcode_table[256] = { /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x30 - 0x37 */ ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x38 - 0x3F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -170,20 +170,20 @@ static u32 opcode_table[256] = { /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, - DstReg | SrcMem | ModRM | Mov, Group | Group1A, + DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, + ImplicitOps | SrcMem16 | ModRM, Group | Group1A, /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, SrcImm | Src2Imm16 | No64, 0, + 0, 0, SrcImmFAddr | No64, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ - ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, + ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, + ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, /* 0xA8 - 0xAF */ - 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, + DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, ByteOp | DstDI | String, DstDI | String, /* 0xB0 - 0xB7 */ @@ -215,7 +215,7 @@ static u32 opcode_table[256] = { ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, + SrcImmFAddr | No64, SrcImmByte | ImplicitOps, SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, /* 0xF0 - 0xF7 */ @@ -337,20 +337,20 @@ static u32 group_table[] = { [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = - ByteOp | SrcImm | DstMem | ModRM, 0, + ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group3*8] = - DstMem | SrcImm | ModRM, 0, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, 0, 0, [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, + SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, @@ -576,6 +576,13 @@ static u32 group2_table[] = { (_type)_x; \ }) +#define insn_fetch_arr(_arr, _size, _eip) \ +({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ + if (rc != X86EMUL_CONTINUE) \ + goto done; \ + (_eip) += (_size); \ +}) + static inline unsigned long ad_mask(struct decode_cache *c) { return (1UL << (c->ad_bytes << 3)) - 1; @@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg) c->seg_override = seg; } -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); + return ops->get_cached_segment_base(seg, ctxt->vcpu); } static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, struct decode_cache *c) { if (!c->has_seg_override) return 0; - return seg_base(ctxt, c->seg_override); + return seg_base(ctxt, ops, c->seg_override); +} + +static unsigned long es_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + return seg_base(ctxt, ops, VCPU_SREG_ES); +} + +static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + return seg_base(ctxt, ops, VCPU_SREG_SS); +} + +static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, + u32 error, bool valid) +{ + ctxt->exception = vec; + ctxt->error_code = error; + ctxt->error_code_valid = valid; + ctxt->restart = false; +} + +static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) +{ + emulate_exception(ctxt, GP_VECTOR, err, true); } -static unsigned long es_base(struct x86_emulate_ctxt *ctxt) +static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, + int err) { - return seg_base(ctxt, VCPU_SREG_ES); + ctxt->cr2 = addr; + emulate_exception(ctxt, PF_VECTOR, err, true); } -static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) +static void emulate_ud(struct x86_emulate_ctxt *ctxt) { - return seg_base(ctxt, VCPU_SREG_SS); + emulate_exception(ctxt, UD_VECTOR, 0, false); +} + +static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) +{ + emulate_exception(ctxt, TS_VECTOR, err, true); } static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, @@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* we cannot decode insn before we complete previous rep insn */ WARN_ON(ctxt->restart); - /* Shadow copy of register state. Committed on successful emulation. */ - memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; c->fetch.start = c->fetch.end = c->eip; - ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); switch (mode) { case X86EMUL_MODE_REAL: @@ -1060,7 +1099,7 @@ done_prefixes: set_seg_override(c, VCPU_SREG_DS); if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, c); + c->modrm_ea += seg_override_base(ctxt, ops, c); if (c->ad_bytes != 8) c->modrm_ea = (u32)c->modrm_ea; @@ -1148,6 +1187,25 @@ done_prefixes: else c->src.val = insn_fetch(u8, 1, c->eip); break; + case SrcAcc: + c->src.type = OP_REG; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->src.bytes) { + case 1: + c->src.val = *(u8 *)c->src.ptr; + break; + case 2: + c->src.val = *(u16 *)c->src.ptr; + break; + case 4: + c->src.val = *(u32 *)c->src.ptr; + break; + case 8: + c->src.val = *(u64 *)c->src.ptr; + break; + } + break; case SrcOne: c->src.bytes = 1; c->src.val = 1; @@ -1156,10 +1214,21 @@ done_prefixes: c->src.type = OP_MEM; c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->src.ptr = (unsigned long *) - register_address(c, seg_override_base(ctxt, c), + register_address(c, seg_override_base(ctxt, ops, c), c->regs[VCPU_REGS_RSI]); c->src.val = 0; break; + case SrcImmFAddr: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = c->op_bytes + 2; + insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + break; + case SrcMemFAddr: + c->src.type = OP_MEM; + c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.bytes = c->op_bytes + 2; + break; } /* @@ -1179,22 +1248,10 @@ done_prefixes: c->src2.bytes = 1; c->src2.val = insn_fetch(u8, 1, c->eip); break; - case Src2Imm16: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 2; - c->src2.val = insn_fetch(u16, 2, c->eip); - break; case Src2One: c->src2.bytes = 1; c->src2.val = 1; break; - case Src2Mem16: - c->src2.type = OP_MEM; - c->src2.bytes = 2; - c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes); - c->src2.val = 0; - break; } /* Decode and fetch the destination operand: register or memory. */ @@ -1253,7 +1310,7 @@ done_prefixes: c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *) - register_address(c, es_base(ctxt), + register_address(c, es_base(ctxt, ops), c->regs[VCPU_REGS_RDI]); c->dst.val = 0; break; @@ -1263,6 +1320,37 @@ done: return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; } +static int read_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long addr, void *dest, unsigned size) +{ + int rc; + struct read_cache *mc = &ctxt->decode.mem_read; + u32 err; + + while (size) { + int n = min(size, 8u); + size -= n; + if (mc->pos < mc->end) + goto read_cached; + + rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, addr, err); + if (rc != X86EMUL_CONTINUE) + return rc; + mc->end += n; + + read_cached: + memcpy(dest, mc->data + mc->pos, n); + mc->pos += n; + dest += n; + addr += n; + } + return X86EMUL_CONTINUE; +} + static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned int size, unsigned short port, @@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, get_descriptor_table_ptr(ctxt, ops, selector, &dt); if (dt.size < index * 8 + 7) { - kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + emulate_gp(ctxt, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } addr = dt.address + index * 8; ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); return ret; } @@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, get_descriptor_table_ptr(ctxt, ops, selector, &dt); if (dt.size < index * 8 + 7) { - kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + emulate_gp(ctxt, selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } addr = dt.address + index * 8; ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(ctxt->vcpu, addr, err); + emulate_pf(ctxt, addr, err); return ret; } @@ -1481,11 +1569,70 @@ load: ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); return X86EMUL_CONTINUE; exception: - kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); + emulate_exception(ctxt, err_vec, err_code, true); return X86EMUL_PROPAGATE_FAULT; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt) +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + u32 err; + + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. + */ + switch (c->dst.bytes) { + case 1: + *(u8 *)c->dst.ptr = (u8)c->dst.val; + break; + case 2: + *(u16 *)c->dst.ptr = (u16)c->dst.val; + break; + case 4: + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; + break; + } + break; + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); + else + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + &err, + ctxt->vcpu); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt, + (unsigned long)c->dst.ptr, err); + if (rc != X86EMUL_CONTINUE) + return rc; + break; + case OP_NONE: + /* no writeback */ + break; + default: + break; + } + return X86EMUL_CONTINUE; +} + +static inline void emulate_push(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; @@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt), + c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), c->regs[VCPU_REGS_RSP]); } @@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - rc = ops->read_emulated(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), - dest, len, ctxt->vcpu); + rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), + c->regs[VCPU_REGS_RSP]), + dest, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, break; case X86EMUL_MODE_VM86: if (iopl < 3) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } change_mask |= EFLG_IF; @@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment segment; - kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); + c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); - c->src.val = segment.selector; - emulate_push(ctxt); + emulate_push(ctxt, ops); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_pusha(struct x86_emulate_ctxt *ctxt) +static int emulate_pusha(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; + int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RAX; while (reg <= VCPU_REGS_RDI) { (reg == VCPU_REGS_RSP) ? (c->src.val = old_esp) : (c->src.val = c->regs[reg]); - emulate_push(ctxt); + emulate_push(ctxt, ops); + + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + ++reg; } + + /* Disable writeback. */ + c->dst.type = OP_NONE; + + return rc; } static int emulate_popa(struct x86_emulate_ctxt *ctxt, @@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, old_eip = c->eip; c->eip = c->src.val; c->src.val = old_eip; - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } case 4: /* jmp abs */ c->eip = c->src.val; break; case 6: /* push */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } return X86EMUL_CONTINUE; @@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, return rc; } -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - int rc; - struct decode_cache *c = &ctxt->decode; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. - */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; - default: - break; - } - return X86EMUL_CONTINUE; -} - -static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) -{ - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); - /* - * an sti; sti; sequence only disable interrupts for the first - * instruction. So, if the last instruction, be it emulated or - * not, left the system with the INT_STI flag enabled, it - * means that the last instruction is an sti. We should not - * leave the flag on in this case. The same goes for mov ss - */ - if (!(int_shadow & mask)) - ctxt->interruptibility = mask; -} - static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct kvm_segment *cs, struct kvm_segment *ss) + struct x86_emulate_ops *ops, struct desc_struct *cs, + struct desc_struct *ss) { - memset(cs, 0, sizeof(struct kvm_segment)); - kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); - memset(ss, 0, sizeof(struct kvm_segment)); + memset(cs, 0, sizeof(struct desc_struct)); + ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); + memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ - cs->base = 0; /* flat segment */ + set_desc_base(cs, 0); /* flat segment */ cs->g = 1; /* 4kb granularity */ - cs->limit = 0xffffffff; /* 4GB limit */ + set_desc_limit(cs, 0xfffff); /* 4GB limit */ cs->type = 0x0b; /* Read, Execute, Accessed */ cs->s = 1; cs->dpl = 0; /* will be adjusted later */ - cs->present = 1; - cs->db = 1; + cs->p = 1; + cs->d = 1; - ss->unusable = 0; - ss->base = 0; /* flat segment */ - ss->limit = 0xffffffff; /* 4GB limit */ + set_desc_base(ss, 0); /* flat segment */ + set_desc_limit(ss, 0xfffff); /* 4GB limit */ ss->g = 1; /* 4kb granularity */ ss->s = 1; ss->type = 0x03; /* Read/Write, Accessed */ - ss->db = 1; /* 32bit stack segment */ + ss->d = 1; /* 32bit stack segment */ ss->dpl = 0; - ss->present = 1; + ss->p = 1; } static int -emulate_syscall(struct x86_emulate_ctxt *ctxt) +emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; + u16 cs_sel, ss_sel; /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); return X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); msr_data >>= 32; - cs.selector = (u16)(msr_data & 0xfffc); - ss.selector = (u16)(msr_data + 8); + cs_sel = (u16)(msr_data & 0xfffc); + ss_sel = (u16)(msr_data + 8); if (is_long_mode(ctxt->vcpu)) { - cs.db = 0; + cs.d = 0; cs.l = 1; } - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->regs[VCPU_REGS_RCX] = c->eip; if (is_long_mode(ctxt->vcpu)) { #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - kvm_x86_ops->get_msr(ctxt->vcpu, - ctxt->mode == X86EMUL_MODE_PROT64 ? - MSR_LSTAR : MSR_CSTAR, &msr_data); + ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? + MSR_LSTAR : MSR_CSTAR, &msr_data); c->eip = msr_data; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); #endif } else { /* legacy mode */ - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); c->eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) } static int -emulate_sysenter(struct x86_emulate_ctxt *ctxt) +emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; + u16 cs_sel, ss_sel; /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } @@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) * Therefore, we inject an #UD. */ if (ctxt->mode == X86EMUL_MODE_PROT64) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); return X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } break; } ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - cs.selector = (u16)msr_data; - cs.selector &= ~SELECTOR_RPL_MASK; - ss.selector = cs.selector + 8; - ss.selector &= ~SELECTOR_RPL_MASK; + cs_sel = (u16)msr_data; + cs_sel &= ~SELECTOR_RPL_MASK; + ss_sel = cs_sel + 8; + ss_sel &= ~SELECTOR_RPL_MASK; if (ctxt->mode == X86EMUL_MODE_PROT64 || is_long_mode(ctxt->vcpu)) { - cs.db = 0; + cs.d = 0; cs.l = 1; } - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; return X86EMUL_CONTINUE; } static int -emulate_sysexit(struct x86_emulate_ctxt *ctxt) +emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; + struct desc_struct cs, ss; u64 msr_data; int usermode; + u16 cs_sel, ss_sel; /* inject #GP if in real mode or Virtual 8086 mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } - setup_syscalls_segments(ctxt, &cs, &ss); + setup_syscalls_segments(ctxt, ops, &cs, &ss); if ((c->rex_prefix & 0x8) != 0x0) usermode = X86EMUL_MODE_PROT64; @@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) cs.dpl = 3; ss.dpl = 3; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: - cs.selector = (u16)(msr_data + 16); + cs_sel = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } - ss.selector = (u16)(msr_data + 24); + ss_sel = (u16)(msr_data + 24); break; case X86EMUL_MODE_PROT64: - cs.selector = (u16)(msr_data + 32); + cs_sel = (u16)(msr_data + 32); if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } - ss.selector = cs.selector + 8; - cs.db = 0; + ss_sel = cs_sel + 8; + cs.d = 0; cs.l = 1; break; } - cs.selector |= SELECTOR_RPL_MASK; - ss.selector |= SELECTOR_RPL_MASK; + cs_sel |= SELECTOR_RPL_MASK; + ss_sel |= SELECTOR_RPL_MASK; - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); - c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; - c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; + c->eip = c->regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; return X86EMUL_CONTINUE; } @@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { - struct kvm_segment tr_seg; + struct desc_struct tr_seg; int r; u16 io_bitmap_ptr; u8 perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; - kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); - if (tr_seg.unusable) + ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); + if (!tr_seg.p) return false; - if (tr_seg.limit < 103) + if (desc_limit_scaled(&tr_seg) < 103) return false; - r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, - NULL); + r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, + ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; - if (io_bitmap_ptr + port/8 > tr_seg.limit) + if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, - ctxt->vcpu, NULL); + r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, + &perm, 1, ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, return true; } -static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - int seg) -{ - struct desc_struct desc; - if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu)) - return get_desc_base(&desc); - else - return ~0; -} - static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct tss_segment_16 *tss) @@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } @@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } } @@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int ret; - ops->set_cr(3, tss->cr3, ctxt->vcpu); + if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } c->eip = tss->eip; ctxt->eflags = tss->eflags | 2; c->regs[VCPU_REGS_RAX] = tss->eax; @@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + emulate_pf(ctxt, old_tss_base, err); return ret; } @@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } @@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + emulate_pf(ctxt, new_tss_base, err); return ret; } } @@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, int ret; u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); ulong old_tss_base = - get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); + ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); u32 desc_limit; /* FIXME: old_tss_base == ~0 ? */ @@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (reason != TASK_SWITCH_IRET) { if ((tss_selector & 3) > next_tss_desc.dpl || ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); return X86EMUL_PROPAGATE_FAULT; } } @@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || desc_limit < 0x2b)) { - kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, - tss_selector & 0xfffc); + emulate_ts(ctxt, tss_selector & 0xfffc); return X86EMUL_PROPAGATE_FAULT; } @@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; c->lock_prefix = 0; c->src.val = (unsigned long) error_code; - emulate_push(ctxt); + emulate_push(ctxt, ops); } return ret; @@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->eip; - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); c->dst.type = OP_NONE; rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) { - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(ctxt->vcpu, c->eip); rc = writeback(ctxt, ops); + if (rc == X86EMUL_CONTINUE) + ctxt->eip = c->eip; } return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; @@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; - ctxt->interruptibility = 0; - - /* Shadow copy of register state. Committed on successful emulation. - * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't - * modify them. - */ - - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + ctxt->decode.mem_read.pos = 0; if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } /* LOCK prefix is allowed only with some instructions */ if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } @@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { string_done: ctxt->restart = false; - kvm_rip_write(ctxt->vcpu, c->eip); + ctxt->eip = c->eip; goto done; } /* The second termination condition only applies for REPE @@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } if (c->src.type == OP_MEM) { - rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, + c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val = c->src.val; } if (c->src2.type == OP_MEM) { - rc = ops->read_emulated((unsigned long)c->src2.ptr, - &c->src2.val, - c->src2.bytes, - ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, + &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, - c->dst.bytes, ctxt->vcpu); + rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, + &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -2571,7 +2650,7 @@ special_insn: emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); break; case 0x06: /* push es */ - emulate_push_sreg(ctxt, VCPU_SREG_ES); + emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); @@ -2583,14 +2662,14 @@ special_insn: emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); break; case 0x0e: /* push cs */ - emulate_push_sreg(ctxt, VCPU_SREG_CS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); break; case 0x10 ... 0x15: adc: /* adc */ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); break; case 0x16: /* push ss */ - emulate_push_sreg(ctxt, VCPU_SREG_SS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); @@ -2602,7 +2681,7 @@ special_insn: emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; case 0x1e: /* push ds */ - emulate_push_sreg(ctxt, VCPU_SREG_DS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); @@ -2632,7 +2711,7 @@ special_insn: emulate_1op("dec", c->dst, ctxt->eflags); break; case 0x50 ... 0x57: /* push reg */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: @@ -2641,7 +2720,9 @@ special_insn: goto done; break; case 0x60: /* pusha */ - emulate_pusha(ctxt); + rc = emulate_pusha(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); @@ -2655,14 +2736,14 @@ special_insn: break; case 0x68: /* push imm */ case 0x6a: /* push imm8 */ - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, @@ -2674,7 +2755,7 @@ special_insn: c->src.bytes = min(c->src.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], c->src.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], @@ -2707,6 +2788,7 @@ special_insn: } break; case 0x84 ... 0x85: + test: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); break; case 0x86 ... 0x87: /* xchg */ @@ -2735,18 +2817,13 @@ special_insn: break; case 0x88 ... 0x8b: /* mov */ goto mov; - case 0x8c: { /* mov r/m, sreg */ - struct kvm_segment segreg; - - if (c->modrm_reg <= VCPU_SREG_GS) - kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); - else { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + case 0x8c: /* mov r/m, sreg */ + if (c->modrm_reg > VCPU_SREG_GS) { + emulate_ud(ctxt); goto done; } - c->dst.val = segreg.selector; + c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); break; - } case 0x8d: /* lea r16/r32, m */ c->dst.val = c->modrm_ea; break; @@ -2757,12 +2834,12 @@ special_insn: if (c->modrm_reg == VCPU_SREG_CS || c->modrm_reg > VCPU_SREG_GS) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); + ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); @@ -2775,19 +2852,19 @@ special_insn: goto done; break; case 0x90: /* nop / xchg r8,rax */ - if (!(c->rex_prefix & 1)) { /* nop */ - c->dst.type = OP_NONE; + if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { + c->dst.type = OP_NONE; /* nop */ break; } case 0x91 ... 0x97: /* xchg reg,rax */ - c->src.type = c->dst.type = OP_REG; - c->src.bytes = c->dst.bytes = c->op_bytes; + c->src.type = OP_REG; + c->src.bytes = c->op_bytes; c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; c->src.val = *(c->src.ptr); goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt); + emulate_push(ctxt, ops); break; case 0x9d: /* popf */ c->dst.type = OP_REG; @@ -2797,19 +2874,15 @@ special_insn: if (rc != X86EMUL_CONTINUE) goto done; break; - case 0xa0 ... 0xa1: /* mov */ - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - c->dst.val = c->src.val; - break; - case 0xa2 ... 0xa3: /* mov */ - c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; - break; + case 0xa0 ... 0xa3: /* mov */ case 0xa4 ... 0xa5: /* movs */ goto mov; case 0xa6 ... 0xa7: /* cmps */ c->dst.type = OP_NONE; /* Disable writeback. */ DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); goto cmp; + case 0xa8 ... 0xa9: /* test ax, imm */ + goto test; case 0xaa ... 0xab: /* stos */ c->dst.val = c->regs[VCPU_REGS_RAX]; break; @@ -2855,19 +2928,23 @@ special_insn: long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - emulate_push(ctxt); + emulate_push(ctxt, ops); break; } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: /* jmp far */ + case 0xea: { /* jmp far */ + unsigned short sel; jump_far: - if (load_segment_descriptor(ctxt, ops, c->src2.val, - VCPU_SREG_CS)) + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) goto done; - c->eip = c->src.val; + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); break; + } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); @@ -2879,20 +2956,20 @@ special_insn: do_io_in: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, &c->dst.val)) goto done; /* IO is needed */ break; - case 0xee: /* out al,dx */ - case 0xef: /* out (e/r)ax,dx */ + case 0xee: /* out dx,al */ + case 0xef: /* out dx,(e/r)ax */ c->src.val = c->regs[VCPU_REGS_RDX]; do_io_out: c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - kvm_inject_gp(ctxt->vcpu, 0); + emulate_gp(ctxt, 0); goto done; } ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, @@ -2916,18 +2993,20 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - if (emulator_bad_iopl(ctxt, ops)) - kvm_inject_gp(ctxt->vcpu, 0); - else { + if (emulator_bad_iopl(ctxt, ops)) { + emulate_gp(ctxt, 0); + goto done; + } else { ctxt->eflags &= ~X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ } break; case 0xfb: /* sti */ - if (emulator_bad_iopl(ctxt, ops)) - kvm_inject_gp(ctxt->vcpu, 0); - else { - toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); + if (emulator_bad_iopl(ctxt, ops)) { + emulate_gp(ctxt, 0); + goto done; + } else { + ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ } @@ -2964,11 +3043,12 @@ writeback: c->dst.type = saved_dst_type; if ((c->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, - &c->src); + string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), + VCPU_REGS_RSI, &c->src); if ((c->d & DstMask) == DstDI) - string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); + string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, + &c->dst); if (c->rep_prefix && (c->d & String)) { struct read_cache *rc = &ctxt->decode.io_read; @@ -2981,11 +3061,12 @@ writeback: (rc->end != 0 && rc->end == rc->pos)) ctxt->restart = false; } - - /* Commit shadow register state. */ - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(ctxt->vcpu, c->eip); - ops->set_rflags(ctxt->vcpu, ctxt->eflags); + /* + * reset read cache here in case string instruction is restared + * without decoding + */ + ctxt->decode.mem_read.end = 0; + ctxt->eip = c->eip; done: return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; @@ -3051,7 +3132,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 5: /* not defined */ - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; case 7: /* invlpg*/ emulate_invlpg(ctxt->vcpu, c->modrm_ea); @@ -3063,7 +3144,7 @@ twobyte_insn: } break; case 0x05: /* syscall */ - rc = emulate_syscall(ctxt); + rc = emulate_syscall(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else @@ -3073,8 +3154,11 @@ twobyte_insn: emulate_clts(ctxt->vcpu); c->dst.type = OP_NONE; break; - case 0x08: /* invd */ case 0x09: /* wbinvd */ + kvm_emulate_wbinvd(ctxt->vcpu); + c->dst.type = OP_NONE; + break; + case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ c->dst.type = OP_NONE; @@ -3084,7 +3168,7 @@ twobyte_insn: case 1: case 5 ... 7: case 9 ... 15: - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); @@ -3093,31 +3177,42 @@ twobyte_insn: case 0x21: /* mov from dr to reg */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && (c->modrm_reg == 4 || c->modrm_reg == 5)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); goto done; } - emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); + ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ - ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); + if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { + emulate_gp(ctxt, 0); + goto done; + } c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && (c->modrm_reg == 4 || c->modrm_reg == 5)) { - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + emulate_ud(ctxt); + goto done; + } + + if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & + ((ctxt->mode == X86EMUL_MODE_PROT64) ? + ~0ULL : ~0U), ctxt->vcpu) < 0) { + /* #UD condition is already handled by the code above */ + emulate_gp(ctxt, 0); goto done; } - emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); + c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { - kvm_inject_gp(ctxt->vcpu, 0); + if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { + emulate_gp(ctxt, 0); goto done; } rc = X86EMUL_CONTINUE; @@ -3125,8 +3220,8 @@ twobyte_insn: break; case 0x32: /* rdmsr */ - if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { - kvm_inject_gp(ctxt->vcpu, 0); + if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { + emulate_gp(ctxt, 0); goto done; } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; @@ -3136,14 +3231,14 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - rc = emulate_sysenter(ctxt); + rc = emulate_sysenter(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else goto writeback; break; case 0x35: /* sysexit */ - rc = emulate_sysexit(ctxt); + rc = emulate_sysexit(ctxt, ops); if (rc != X86EMUL_CONTINUE) goto done; else @@ -3160,7 +3255,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0xa0: /* push fs */ - emulate_push_sreg(ctxt, VCPU_SREG_FS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); @@ -3179,7 +3274,7 @@ twobyte_insn: emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; case 0xa8: /* push gs */ - emulate_push_sreg(ctxt, VCPU_SREG_GS); + emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 0150affad25d..0fd6378981f4 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -5,6 +5,7 @@ * Copyright (c) 2006 Intel Corporation * Copyright (c) 2007 Keir Fraser, XenSource Inc * Copyright (c) 2008 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,6 +34,7 @@ #include <linux/kvm_host.h> #include <linux/slab.h> +#include <linux/workqueue.h> #include "irq.h" #include "i8254.h" @@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); - raw_spin_lock(&ps->inject_lock); - if (atomic_dec_return(&ps->pit_timer.pending) < 0) + int value; + + spin_lock(&ps->inject_lock); + value = atomic_dec_return(&ps->pit_timer.pending); + if (value < 0) + /* spurious acks can be generated if, for example, the + * PIC is being reset. Handle it gracefully here + */ atomic_inc(&ps->pit_timer.pending); + else if (value > 0) + /* in this case, we had multiple outstanding pit interrupts + * that we needed to inject. Reinject + */ + queue_work(ps->pit->wq, &ps->pit->expired); ps->irq_ack = 1; - raw_spin_unlock(&ps->inject_lock); + spin_unlock(&ps->inject_lock); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } -static void destroy_pit_timer(struct kvm_timer *pt) +static void destroy_pit_timer(struct kvm_pit *pit) { - pr_debug("execute del timer!\n"); - hrtimer_cancel(&pt->timer); + hrtimer_cancel(&pit->pit_state.pit_timer.timer); + cancel_work_sync(&pit->expired); } static bool kpit_is_periodic(struct kvm_timer *ktimer) @@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; +static void pit_do_work(struct work_struct *work) +{ + struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); + struct kvm *kvm = pit->kvm; + struct kvm_vcpu *vcpu; + int i; + struct kvm_kpit_state *ps = &pit->pit_state; + int inject = 0; + + /* Try to inject pending interrupts when + * last one has been acked. + */ + spin_lock(&ps->inject_lock); + if (ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; + } + spin_unlock(&ps->inject_lock); + if (inject) { + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> PIC -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation is simplified, only + * propagating PIT interrupts to all VCPUs when they have set + * LVT0 to NMI delivery. Other PIC interrupts are just sent to + * VCPU0, and only if its LVT0 is in EXTINT mode. + */ + if (kvm->arch.vapics_in_nmi_mode > 0) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); + } +} + +static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) +{ + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_pit *pt = ktimer->kvm->arch.vpit; + + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + queue_work(pt->wq, &pt->expired); + } + + if (ktimer->t_ops->is_periodic(ktimer)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { struct kvm_timer *pt = &ps->pit_timer; @@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); + cancel_work_sync(&ps->pit->expired); pt->period = interval; ps->is_periodic = is_period; - pt->timer.function = kvm_timer_fn; + pt->timer.function = pit_timer_fn; pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; - pt->vcpu = pt->kvm->bsp_vcpu; atomic_set(&pt->pending, 0); ps->irq_ack = 1; @@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) } break; default: - destroy_pit_timer(&ps->pit_timer); + destroy_pit_timer(kvm->arch.vpit); } } @@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); - raw_spin_lock_init(&pit->pit_state.inject_lock); + spin_lock_init(&pit->pit_state.inject_lock); + + pit->wq = create_singlethread_workqueue("kvm-pit-wq"); + if (!pit->wq) { + mutex_unlock(&pit->pit_state.lock); + kfree(pit); + return NULL; + } + INIT_WORK(&pit->expired, pit_do_work); kvm->arch.vpit = pit; pit->kvm = kvm; @@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm) struct hrtimer *timer; if (kvm->arch.vpit) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &kvm->arch.vpit->speaker_dev); kvm_unregister_irq_mask_notifier(kvm, 0, &kvm->arch.vpit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, @@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm) mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); + cancel_work_sync(&kvm->arch.vpit->expired); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); mutex_unlock(&kvm->arch.vpit->pit_state.lock); + destroy_workqueue(kvm->arch.vpit->wq); kfree(kvm->arch.vpit); } } - -static void __inject_pit_timer_intr(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int i; - - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> PIC -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation is simplified, only - * propagating PIT interrupts to all VCPUs when they have set - * LVT0 to NMI delivery. Other PIC interrupts are just sent to - * VCPU0, and only if its LVT0 is in EXTINT mode. - */ - if (kvm->arch.vapics_in_nmi_mode > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); -} - -void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - struct kvm *kvm = vcpu->kvm; - struct kvm_kpit_state *ps; - - if (pit) { - int inject = 0; - ps = &pit->pit_state; - - /* Try to inject pending interrupts when - * last one has been acked. - */ - raw_spin_lock(&ps->inject_lock); - if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { - ps->irq_ack = 0; - inject = 1; - } - raw_spin_unlock(&ps->inject_lock); - if (inject) - __inject_pit_timer_intr(kvm); - } -} diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 900d6b0ba7c2..46d08ca0b48f 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -27,7 +27,7 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - raw_spinlock_t inject_lock; + spinlock_t inject_lock; unsigned long irq_ack; struct kvm_irq_ack_notifier irq_ack_notifier; }; @@ -40,6 +40,8 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; + struct workqueue_struct *wq; + struct work_struct expired; }; #define KVM_PIT_BASE_ADDRESS 0x40 diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 93825ff3338f..8d10c063d7f2 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -3,6 +3,7 @@ * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,6 +34,8 @@ #include <linux/kvm_host.h> #include "trace.h" +static void pic_irq_request(struct kvm *kvm, int level); + static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) { @@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu, *found = NULL; + int i; s->wakeup_needed = false; raw_spin_unlock(&s->lock); if (wakeup) { - vcpu = s->kvm->bsp_vcpu; - if (vcpu) - kvm_vcpu_kick(vcpu); + kvm_for_each_vcpu(i, vcpu, s->kvm) { + if (kvm_apic_accept_pic_intr(vcpu)) { + found = vcpu; + break; + } + } + + if (!found) + found = s->kvm->bsp_vcpu; + + kvm_vcpu_kick(found); } } @@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s) pic_set_irq1(&s->pics[0], 2, 0); } irq = pic_get_irq(&s->pics[0]); - if (irq >= 0) - s->irq_request(s->irq_request_opaque, 1); - else - s->irq_request(s->irq_request_opaque, 0); + pic_irq_request(s->kvm, irq >= 0); } void kvm_pic_update_irq(struct kvm_pic *s) @@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { int irq; - struct kvm *kvm = s->pics_state->irq_request_opaque; - struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; + struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; u8 irr = s->irr, isr = s->imr; s->last_irr = 0; @@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) /* * deassert a pending interrupt */ - s->pics_state->irq_request(s->pics_state-> - irq_request_opaque, 0); + pic_irq_request(s->pics_state->kvm, 0); s->init_state = 1; s->init4 = val & 1; if (val & 0x02) @@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) } } else switch (s->init_state) { - case 0: /* normal mode */ + case 0: { /* normal mode */ + u8 imr_diff = s->imr ^ val, + off = (s == &s->pics_state->pics[0]) ? 0 : 8; s->imr = val; + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) + if (imr_diff & (1 << irq)) + kvm_fire_mask_notifiers( + s->pics_state->kvm, + SELECT_PIC(irq + off), + irq + off, + !!(s->imr & (1 << irq))); pic_update_irq(s->pics_state); break; + } case 1: s->irq_base = val & 0xf8; s->init_state = 2; @@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this, /* * callback when PIC0 irq status changed */ -static void pic_irq_request(void *opaque, int level) +static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm *kvm = opaque; struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); int irq = pic_get_irq(&s->pics[0]); @@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; - s->irq_request = pic_irq_request; - s->irq_request_opaque = kvm; s->pics[0].pics_state = s; s->pics[1].pics_state = s; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 96dfbb6ad2a9..2095a049835e 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -1,6 +1,7 @@ /* * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { kvm_inject_apic_timer_irqs(vcpu); - kvm_inject_pit_timer_irqs(vcpu); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index cd1f362f413d..ffed06871c5c 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -38,8 +38,6 @@ struct kvm; struct kvm_vcpu; -typedef void irq_request_func(void *opaque, int level); - struct kvm_kpic_state { u8 last_irr; /* edge detection */ u8 irr; /* interrupt request register */ @@ -67,8 +65,6 @@ struct kvm_pic { unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ - irq_request_func *irq_request; - void *irq_request_opaque; int output; /* intr from master PIC */ struct kvm_io_device dev; void (*ack_notifier)(void *opaque, int irq); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index cff851cf5322..6491ac8e755b 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { + might_sleep(); /* on svm */ + if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); @@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) return kvm_read_cr4_bits(vcpu, ~0UL); } +static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) +{ + return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) + | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); +} + #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1eb7a4ae0c9c..77d8c0f4817d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -5,6 +5,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel + * Copyright 2009 Red Hat, Inc. and/or its affilates. * * Authors: * Dor Laor <dor.laor@qumranet.com> @@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, "dest_mode 0x%x, short_hand 0x%x\n", target, source, dest, dest_mode, short_hand); - ASSERT(!target); + ASSERT(target); switch (short_hand) { case APIC_DEST_NOSHORT: if (dest_mode == 0) @@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_vcpu *vcpu = apic->vcpu; struct kvm_run *run = vcpu->run; - set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); + kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } @@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (kvm_vcpu_is_bsp(vcpu)) { - if (!apic_hw_enabled(vcpu->arch.apic)) - r = 1; - if ((lvt0 & APIC_LVT_MASKED) == 0 && - GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; - } + if (!apic_hw_enabled(vcpu->arch.apic)) + r = 1; + if ((lvt0 & APIC_LVT_MASKED) == 0 && + GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) + r = 1; return r; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1ed0a1a5913..311f6dad8951 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -7,6 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -32,6 +33,7 @@ #include <linux/compiler.h> #include <linux/srcu.h> #include <linux/slab.h> +#include <linux/uaccess.h> #include <asm/page.h> #include <asm/cmpxchg.h> @@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644); #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define VALID_PAGE(x) ((x) != INVALID_PAGE) - #define PT64_LEVEL_BITS 9 #define PT64_LEVEL_SHIFT(level) \ @@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator { shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) -typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); +typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; @@ -281,13 +281,38 @@ static gfn_t pse36_gfn_delta(u32 gpte) static void __set_spte(u64 *sptep, u64 spte) { + set_64bit(sptep, spte); +} + +static u64 __xchg_spte(u64 *sptep, u64 new_spte) +{ #ifdef CONFIG_X86_64 - set_64bit((unsigned long *)sptep, spte); + return xchg(sptep, new_spte); #else - set_64bit((unsigned long long *)sptep, spte); + u64 old_spte; + + do { + old_spte = *sptep; + } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); + + return old_spte; #endif } +static void update_spte(u64 *sptep, u64 new_spte) +{ + u64 old_spte; + + if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || + !is_rmap_spte(*sptep)) + __set_spte(sptep, new_spte); + else { + old_spte = __xchg_spte(sptep, new_spte); + if (old_spte & shadow_accessed_mask) + mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + } +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -304,10 +329,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, return 0; } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, + struct kmem_cache *cache) { while (mc->nobjs) - kfree(mc->objects[--mc->nobjs]); + kmem_cache_free(cache, mc->objects[--mc->nobjs]); } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, @@ -355,10 +381,11 @@ out: static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, @@ -379,7 +406,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) static void mmu_free_pte_chain(struct kvm_pte_chain *pc) { - kfree(pc); + kmem_cache_free(pte_chain_cache, pc); } static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) @@ -390,7 +417,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) { - kfree(rd); + kmem_cache_free(rmap_desc_cache, rd); +} + +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) +{ + if (!sp->role.direct) + return sp->gfns[index]; + + return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); +} + +static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +{ + if (sp->role.direct) + BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); + else + sp->gfns[index] = gfn; } /* @@ -403,8 +446,8 @@ static int *slot_largepage_idx(gfn_t gfn, { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); return &slot->lpage_info[level - 2][idx].write_count; } @@ -414,9 +457,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) int *write_count; int i; - gfn = unalias_gfn(kvm, gfn); - - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); @@ -430,8 +471,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) int *write_count; int i; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); @@ -447,8 +487,7 @@ static int has_wrprotected_page(struct kvm *kvm, struct kvm_memory_slot *slot; int *largepage_idx; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot, level); return *largepage_idx; @@ -501,7 +540,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) /* * Take gfn and return the reverse mapping to it. - * Note: gfn must be unaliased before this function get called */ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) @@ -513,8 +551,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); return &slot->lpage_info[level - 2][idx].rmap_pde; } @@ -541,9 +579,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) if (!is_rmap_spte(*spte)) return count; - gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); - sp->gfns[spte - sp->spt] = gfn; + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); if (!*rmapp) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); @@ -600,19 +637,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *desc; struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; - pfn_t pfn; + gfn_t gfn; unsigned long *rmapp; int i; - if (!is_rmap_spte(*spte)) - return; sp = page_header(__pa(spte)); - pfn = spte_to_pfn(*spte); - if (*spte & shadow_accessed_mask) - kvm_set_pfn_accessed(pfn); - if (is_writable_pte(*spte)) - kvm_set_pfn_dirty(pfn); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); + gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); @@ -644,6 +675,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } +static void set_spte_track_bits(u64 *sptep, u64 new_spte) +{ + pfn_t pfn; + u64 old_spte = *sptep; + + if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || + old_spte & shadow_accessed_mask) { + __set_spte(sptep, new_spte); + } else + old_spte = __xchg_spte(sptep, new_spte); + + if (!is_rmap_spte(old_spte)) + return; + pfn = spte_to_pfn(old_spte); + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) + kvm_set_pfn_accessed(pfn); + if (is_writable_pte(old_spte)) + kvm_set_pfn_dirty(pfn); +} + +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ + set_spte_track_bits(sptep, new_spte); + rmap_remove(kvm, sptep); +} + static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) { struct kvm_rmap_desc *desc; @@ -676,7 +733,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) u64 *spte; int i, write_protected = 0; - gfn = unalias_gfn(kvm, gfn); rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); spte = rmap_next(kvm, rmapp, NULL); @@ -685,7 +741,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writable_pte(*spte)) { - __set_spte(spte, *spte & ~PT_WRITABLE_MASK); + update_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -709,9 +765,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); if (is_writable_pte(*spte)) { - rmap_remove(kvm, spte); + drop_spte(kvm, spte, + shadow_trap_nonpresent_pte); --kvm->stat.lpages; - __set_spte(spte, shadow_trap_nonpresent_pte); spte = NULL; write_protected = 1; } @@ -731,8 +787,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, while ((spte = rmap_next(kvm, rmapp, NULL))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - rmap_remove(kvm, spte); - __set_spte(spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); need_tlb_flush = 1; } return need_tlb_flush; @@ -754,8 +809,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); need_flush = 1; if (pte_write(*ptep)) { - rmap_remove(kvm, spte); - __set_spte(spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); spte = rmap_next(kvm, rmapp, NULL); } else { new_spte = *spte &~ (PT64_BASE_ADDR_MASK); @@ -763,9 +817,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; - if (is_writable_pte(*spte)) - kvm_set_pfn_dirty(spte_to_pfn(*spte)); - __set_spte(spte, new_spte); + new_spte &= ~shadow_accessed_mask; + set_spte_track_bits(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); } } @@ -799,8 +852,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - int idx = gfn_offset; - idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + unsigned long idx; + int sh; + + sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); + idx = ((memslot->base_gfn+gfn_offset) >> sh) - + (memslot->base_gfn >> sh); ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); @@ -863,7 +920,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); - gfn = unalias_gfn(vcpu->kvm, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); @@ -894,10 +950,12 @@ static int is_empty_shadow_page(u64 *spt) static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); + hlist_del(&sp->hash_link); list_del(&sp->link); __free_page(virt_to_page(sp->spt)); - __free_page(virt_to_page(sp->gfns)); - kfree(sp); + if (!sp->role.direct) + __free_page(virt_to_page(sp->gfns)); + kmem_cache_free(mmu_page_header_cache, sp); ++kvm->arch.n_free_mmu_pages; } @@ -907,13 +965,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte) + u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + if (!direct) + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, + PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); @@ -998,7 +1058,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } - static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) { struct kvm_pte_chain *pte_chain; @@ -1008,63 +1067,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) if (!sp->multimapped && sp->parent_pte) { parent_sp = page_header(__pa(sp->parent_pte)); - fn(parent_sp); - mmu_parent_walk(parent_sp, fn); + fn(parent_sp, sp->parent_pte); return; } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) + u64 *spte = pte_chain->parent_ptes[i]; + + if (!spte) break; - parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); - fn(parent_sp); - mmu_parent_walk(parent_sp, fn); + parent_sp = page_header(__pa(spte)); + fn(parent_sp, spte); } } -static void kvm_mmu_update_unsync_bitmap(u64 *spte) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); +static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - unsigned int index; - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - index = spte - sp->spt; - if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) - sp->unsync_children++; - WARN_ON(!sp->unsync_children); + mmu_parent_walk(sp, mark_unsync); } -static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) { - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; + unsigned int index; - if (!sp->parent_pte) + index = spte - sp->spt; + if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; - - if (!sp->multimapped) { - kvm_mmu_update_unsync_bitmap(sp->parent_pte); + if (sp->unsync_children++) return; - } - - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) - break; - kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); - } -} - -static int unsync_walk_fn(struct kvm_mmu_page *sp) -{ - kvm_mmu_update_parents_unsync(sp); - return 1; -} - -static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) -{ - mmu_parent_walk(sp, unsync_walk_fn); - kvm_mmu_update_parents_unsync(sp); + kvm_mmu_mark_parents_unsync(sp); } static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, @@ -1077,7 +1110,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, } static int nonpaging_sync_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) + struct kvm_mmu_page *sp, bool clear_unsync) { return 1; } @@ -1123,35 +1156,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, int i, ret, nr_unsync_leaf = 0; for_each_unsync_children(sp->unsync_child_bitmap, i) { + struct kvm_mmu_page *child; u64 ent = sp->spt[i]; - if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - - if (child->unsync_children) { - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - - ret = __mmu_unsync_walk(child, pvec); - if (!ret) - __clear_bit(i, sp->unsync_child_bitmap); - else if (ret > 0) - nr_unsync_leaf += ret; - else - return ret; - } + if (!is_shadow_present_pte(ent) || is_large_pte(ent)) + goto clear_child_bitmap; + + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + goto clear_child_bitmap; + else if (ret > 0) + nr_unsync_leaf += ret; + else + return ret; + } else if (child->unsync) { + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + } else + goto clear_child_bitmap; - if (child->unsync) { - nr_unsync_leaf++; - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - } - } + continue; + +clear_child_bitmap: + __clear_bit(i, sp->unsync_child_bitmap); + sp->unsync_children--; + WARN_ON((int)sp->unsync_children < 0); } - if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) - sp->unsync_children = 0; return nr_unsync_leaf; } @@ -1166,26 +1204,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, return __mmu_unsync_walk(sp, pvec); } -static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - pgprintk("%s: looking for gfn %lx\n", __func__, gfn); - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct - && !sp->role.invalid) { - pgprintk("%s: found role %x\n", - __func__, sp->role.word); - return sp; - } - return NULL; -} - static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); @@ -1194,20 +1212,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) --kvm->stat.mmu_unsync; } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list); +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list); -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +#define for_each_gfn_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn)) {} else + +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn) || (sp)->role.direct || \ + (sp)->role.invalid) {} else + +/* @sp->gfn should be write-protected at the call site */ +static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list, bool clear_unsync) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } - if (rmap_write_protect(vcpu->kvm, sp->gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); - kvm_unlink_unsync_page(vcpu->kvm, sp); - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { - kvm_mmu_zap_page(vcpu->kvm, sp); + if (clear_unsync) + kvm_unlink_unsync_page(vcpu->kvm, sp); + + if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1215,6 +1249,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; } +static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + LIST_HEAD(invalid_list); + int ret; + + ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); + if (ret) + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + + return ret; +} + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) +{ + return __kvm_sync_page(vcpu, sp, invalid_list, true); +} + +/* @gfn should be write-protected at the call site */ +static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mmu_page *s; + struct hlist_node *node; + LIST_HEAD(invalid_list); + bool flush = false; + + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + if (!s->unsync) + continue; + + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + if ((s->role.cr4_pae != !!is_pae(vcpu)) || + (vcpu->arch.mmu.sync_page(vcpu, s, true))) { + kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); + continue; + } + kvm_unlink_unsync_page(vcpu->kvm, s); + flush = true; + } + + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + if (flush) + kvm_mmu_flush_tlb(vcpu); +} + struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; @@ -1281,6 +1361,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; struct mmu_page_path parents; struct kvm_mmu_pages pages; + LIST_HEAD(invalid_list); kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { @@ -1293,9 +1374,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_flush_remote_tlbs(vcpu->kvm); for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp); + kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); cond_resched_lock(&vcpu->kvm->mmu_lock); kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1310,11 +1392,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, u64 *parent_pte) { union kvm_mmu_page_role role; - unsigned index; unsigned quadrant; - struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node, *tmp; + struct hlist_node *node; + bool need_sync = false; role = vcpu->arch.mmu.base_role; role.level = level; @@ -1322,40 +1403,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.cr4_pae = 0; role.access = access; - if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) - if (sp->gfn == gfn) { - if (sp->unsync) - if (kvm_sync_page(vcpu, sp)) - continue; + for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { + if (!need_sync && sp->unsync) + need_sync = true; - if (sp->role.word != role.word) - continue; + if (sp->role.word != role.word) + continue; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - if (sp->unsync_children) { - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); - kvm_mmu_mark_parents_unsync(sp); - } - trace_kvm_mmu_get_page(sp, false); - return sp; - } + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) + break; + + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_mmu_mark_parents_unsync(sp); + } else if (sp->unsync) + kvm_mmu_mark_parents_unsync(sp); + + trace_kvm_mmu_get_page(sp, false); + return sp; + } ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte); + sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); if (!sp) return sp; sp->gfn = gfn; sp->role = role; - hlist_add_head(&sp->hash_link, bucket); + hlist_add_head(&sp->hash_link, + &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); + if (level > PT_PAGE_TABLE_LEVEL && need_sync) + kvm_sync_pages(vcpu, gfn); + account_shadowed(vcpu->kvm, gfn); } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) @@ -1402,6 +1488,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) --iterator->level; } +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +{ + u64 spte; + + spte = __pa(sp->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + __set_spte(sptep, spte); +} + +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (is_large_pte(*sptep)) { + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + +static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned direct_access) +{ + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + return; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -1422,7 +1549,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, } else { if (is_large_pte(ent)) --kvm->stat.lpages; - rmap_remove(kvm, &pt[i]); + drop_spte(kvm, &pt[i], + shadow_trap_nonpresent_pte); } } pt[i] = shadow_trap_nonpresent_pte; @@ -1464,7 +1592,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } static int mmu_zap_unsync_children(struct kvm *kvm, - struct kvm_mmu_page *parent) + struct kvm_mmu_page *parent, + struct list_head *invalid_list) { int i, zapped = 0; struct mmu_page_path parents; @@ -1478,7 +1607,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp; for_each_sp(pages, sp, parents, i) { - kvm_mmu_zap_page(kvm, sp); + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); mmu_pages_clear_parents(&parents); zapped++; } @@ -1488,32 +1617,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm, return zapped; } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { int ret; - trace_kvm_mmu_zap_page(sp); + trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp); + ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); - kvm_flush_remote_tlbs(kvm); if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { - hlist_del(&sp->hash_link); - kvm_mmu_free_page(kvm, sp); + /* Count self */ + ret++; + list_move(&sp->link, invalid_list); } else { - sp->role.invalid = 1; list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); } + + sp->role.invalid = 1; kvm_mmu_reset_last_pte_updated(kvm); return ret; } +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list) +{ + struct kvm_mmu_page *sp; + + if (list_empty(invalid_list)) + return; + + kvm_flush_remote_tlbs(kvm); + + do { + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); + WARN_ON(!sp->role.invalid || sp->root_count); + kvm_mmu_free_page(kvm, sp); + } while (!list_empty(invalid_list)); + +} + /* * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock @@ -1521,6 +1670,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { int used_pages; + LIST_HEAD(invalid_list); used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; used_pages = max(0, used_pages); @@ -1538,9 +1688,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - used_pages -= kvm_mmu_zap_page(kvm, page); - used_pages--; + used_pages -= kvm_mmu_prepare_zap_page(kvm, page, + &invalid_list); } + kvm_mmu_commit_zap_page(kvm, &invalid_list); kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } @@ -1553,47 +1704,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node, *n; + struct hlist_node *node; + LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; -restart: - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct) { - pgprintk("%s: gfn %lx role %x\n", __func__, gfn, - sp->role.word); - r = 1; - if (kvm_mmu_zap_page(kvm, sp)) - goto restart; - } + + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { + pgprintk("%s: gfn %lx role %x\n", __func__, gfn, + sp->role.word); + r = 1; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + } + kvm_mmu_commit_zap_page(kvm, &invalid_list); return r; } static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node, *nn; + struct hlist_node *node; + LIST_HEAD(invalid_list); - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; -restart: - hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { - if (sp->gfn == gfn && !sp->role.direct - && !sp->role.invalid) { - pgprintk("%s: zap %lx %x\n", - __func__, gfn, sp->role.word); - if (kvm_mmu_zap_page(kvm, sp)) - goto restart; - } + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } + kvm_mmu_commit_zap_page(kvm, &invalid_list); } static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) @@ -1723,47 +1863,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); -static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *s; - struct hlist_node *node, *n; - - index = kvm_page_table_hashfn(sp->gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - /* don't unsync if pagetable is shadowed with multiple roles */ - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != sp->gfn || s->role.direct) - continue; - if (s->role.word != sp->role.word) - return 1; - } trace_kvm_mmu_unsync_page(sp); ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; kvm_mmu_mark_parents_unsync(sp); - mmu_convert_notrap(sp); - return 0; +} + +static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mmu_page *s; + struct hlist_node *node; + + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + if (s->unsync) + continue; + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + __kvm_unsync_page(vcpu, s); + } } static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - struct kvm_mmu_page *shadow; + struct kvm_mmu_page *s; + struct hlist_node *node; + bool need_unsync = false; - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { - if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + if (!can_unsync) return 1; - if (shadow->unsync) - return 0; - if (can_unsync && oos_shadow) - return kvm_unsync_page(vcpu, shadow); - return 1; + + if (s->role.level != PT_PAGE_TABLE_LEVEL) + return 1; + + if (!need_unsync && !s->unsync) { + if (!oos_shadow) + return 1; + need_unsync = true; + } } + if (need_unsync) + kvm_unsync_pages(vcpu, gfn); return 0; } @@ -1804,13 +1948,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) - || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + || (!tdp_enabled && write_fault && !is_write_protection(vcpu) + && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; - spte = shadow_trap_nonpresent_pte; - goto set_pte; + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + goto done; } spte |= PT_WRITABLE_MASK; @@ -1841,7 +1986,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - __set_spte(sptep, spte); + if (is_writable_pte(*sptep) && !is_writable_pte(spte)) + kvm_set_pfn_dirty(pfn); + update_spte(sptep, spte); +done: return ret; } @@ -1853,7 +2001,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -1878,8 +2025,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*sptep), pfn); - rmap_remove(vcpu->kvm, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else was_rmapped = 1; @@ -1890,7 +2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, reset_host_protection)) { if (write_fault) *ptwrite = 1; - kvm_x86_ops->tlb_flush(vcpu); + kvm_mmu_flush_tlb(vcpu); } pgprintk("%s: setting spte %llx\n", __func__, *sptep); @@ -1904,15 +2050,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); - kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); - } else { - if (was_writable) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); } + kvm_release_pfn_clean(pfn); if (speculative) { vcpu->arch.last_pte_updated = sptep; vcpu->arch.last_pte_gfn = gfn; @@ -1941,7 +2082,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, } if (*iterator.sptep == shadow_trap_nonpresent_pte) { - pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + u64 base_addr = iterator.addr; + + base_addr &= PT64_LVL_ADDR_MASK(iterator.level); + pseudo_gfn = base_addr >> PAGE_SHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, iterator.level - 1, 1, ACC_ALL, iterator.sptep); @@ -1960,6 +2104,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return pt_write; } +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) +{ + char buf[1]; + void __user *hva; + int r; + + /* Touch the page, so send SIGBUS */ + hva = (void __user *)gfn_to_hva(kvm, gfn); + r = copy_from_user(buf, hva, 1); +} + +static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) +{ + kvm_release_pfn_clean(pfn); + if (is_hwpoison_pfn(pfn)) { + kvm_send_hwpoison_signal(kvm, gfn); + return 0; + } else if (is_fault_pfn(pfn)) + return -EFAULT; + + return 1; +} + static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; @@ -1983,10 +2150,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn = gfn_to_pfn(vcpu->kvm, gfn); /* mmio */ - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -2009,6 +2174,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -2018,8 +2184,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_zap_page(vcpu->kvm, sp); + if (!sp->root_count && sp->role.invalid) { + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + } vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); return; @@ -2032,10 +2200,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } @@ -2045,7 +2215,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) int ret = 0; if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -2073,6 +2243,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; } spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2103,6 +2274,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2198,10 +2370,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; @@ -2243,7 +2413,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } static void paging_new_cr3(struct kvm_vcpu *vcpu) @@ -2457,10 +2627,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) + /* mmu.free() should set root_hpa = INVALID_PAGE */ vcpu->arch.mmu.free(vcpu); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - } } int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) @@ -2477,9 +2646,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu); if (r) goto out; - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); r = mmu_alloc_roots(vcpu); spin_lock(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); @@ -2508,7 +2674,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) - rmap_remove(vcpu->kvm, spte); + drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, spte); @@ -2529,6 +2695,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, return; } + if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) + return; + ++vcpu->kvm->stat.mmu_pte_updated; if (!sp->role.cr4_pae) paging32_update_pte(vcpu, sp, spte, new); @@ -2549,11 +2718,15 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, + bool remote_flush, bool local_flush) { - if (need_remote_flush(old, new)) + if (zap_page) + return; + + if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); - else + else if (local_flush) kvm_mmu_flush_tlb(vcpu); } @@ -2603,10 +2776,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; + union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; - struct hlist_node *node, *n; - struct hlist_head *bucket; - unsigned index; + struct hlist_node *node; + LIST_HEAD(invalid_list); u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); @@ -2619,6 +2792,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int npte; int r; int invlpg_counter; + bool remote_flush, local_flush, zap_page; + + zap_page = remote_flush = local_flush = false; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -2674,13 +2850,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.last_pte_updated = NULL; } } - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; -restart: - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) - continue; + mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { pte_size = sp->role.cr4_pae ? 8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; @@ -2697,8 +2869,8 @@ restart: */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - if (kvm_mmu_zap_page(vcpu->kvm, sp)) - goto restart; + zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -2722,16 +2894,22 @@ restart: if (quadrant != sp->role.quadrant) continue; } + local_flush = true; spte = &sp->spt[page_offset / sizeof(*spte)]; while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - if (gentry) + if (gentry && + !((sp->role.word ^ vcpu->arch.mmu.base_role.word) + & mask.word)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - mmu_pte_write_flush_tlb(vcpu, entry, *spte); + if (!remote_flush && need_remote_flush(entry, *spte)) + remote_flush = true; ++spte; } } + mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { @@ -2759,15 +2937,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && + int free_pages; + LIST_HEAD(invalid_list); + + free_pages = vcpu->kvm->arch.n_free_mmu_pages; + while (free_pages < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); + free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -2795,11 +2979,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) return 1; case EMULATE_DO_MMIO: ++vcpu->stat.mmio_exits; - return 0; + /* fall through */ case EMULATE_FAIL: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; return 0; default: BUG(); @@ -2896,7 +3077,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ - if (pt[i] & PT_WRITABLE_MASK) + if (is_writable_pte(pt[i])) pt[i] &= ~PT_WRITABLE_MASK; } kvm_flush_remote_tlbs(kvm); @@ -2905,25 +3086,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); - - kvm_flush_remote_tlbs(kvm); } -static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) +static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, + struct list_head *invalid_list) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_zap_page(kvm, page) + 1; + return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); } static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) @@ -2936,6 +3118,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) list_for_each_entry(kvm, &vm_list, vm_list) { int npages, idx, freed_pages; + LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -2943,12 +3126,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) kvm->arch.n_free_mmu_pages; cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { - freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); + freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, + &invalid_list); cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } @@ -3074,7 +3259,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - kvm_set_cr3(vcpu, vcpu->arch.cr3); + (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); return 1; } @@ -3331,9 +3516,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) struct kvm_mmu_page *rev_sp; gfn_t gfn; - if (*sptep & PT_WRITABLE_MASK) { + if (is_writable_pte(*sptep)) { rev_sp = page_header(__pa(sptep)); - gfn = rev_sp->gfns[sptep - rev_sp->spt]; + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); if (!gfn_to_memslot(kvm, gfn)) { if (!printk_ratelimit()) @@ -3347,8 +3532,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], - rev_sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); if (!*rmapp) { if (!printk_ratelimit()) return; @@ -3381,7 +3565,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) if (!(ent & PT_PRESENT_MASK)) continue; - if (!(ent & PT_WRITABLE_MASK)) + if (!is_writable_pte(ent)) continue; inspect_spte_has_rmap(vcpu->kvm, &pt[i]); } @@ -3409,13 +3593,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) if (sp->unsync) continue; - gfn = unalias_gfn(vcpu->kvm, sp->gfn); - slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; spte = rmap_next(vcpu->kvm, rmapp, NULL); while (spte) { - if (*spte & PT_WRITABLE_MASK) + if (is_writable_pte(*spte)) printk(KERN_ERR "%s: (%s) shadow page has " "writable mappings: gfn %lx role %x\n", __func__, audit_msg, sp->gfn, diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 42f07b1bfbc9..3aab0f0930ef 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, TP_ARGS(sp) ); -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_PROTO(struct kvm_mmu_page *sp), TP_ARGS(sp) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2331bdc2b549..51ef9097960d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -7,6 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker, { pt_element_t pte; gfn_t table_gfn; - unsigned index, pt_access, pte_access; + unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; - int rsvd_fault = 0; + bool eperm, present, rsvd_fault; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); walk: + present = true; + eperm = rsvd_fault = false; walker->level = vcpu->arch.mmu.root_level; pte = vcpu->arch.cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) - goto not_present; + if (!is_present_gpte(pte)) { + present = false; + goto error; + } --walker->level; } #endif @@ -150,37 +155,42 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) - goto not_present; + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { + present = false; + break; + } trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) - goto not_present; + if (!is_present_gpte(pte)) { + present = false; + break; + } - rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); - if (rsvd_fault) - goto access_error; + if (is_rsvd_bits_set(vcpu, pte, walker->level)) { + rsvd_fault = true; + break; + } if (write_fault && !is_writable_pte(pte)) if (user_fault || is_write_protection(vcpu)) - goto access_error; + eperm = true; if (user_fault && !(pte & PT_USER_MASK)) - goto access_error; + eperm = true; #if PTTYPE == 64 if (fetch_fault && (pte & PT64_NX_MASK)) - goto access_error; + eperm = true; #endif - if (!(pte & PT_ACCESSED_MASK)) { + if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - mark_page_dirty(vcpu->kvm, table_gfn); if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_ACCESSED_MASK)) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_ACCESSED_MASK; } @@ -213,15 +223,18 @@ walk: --walker->level; } + if (!present || eperm || rsvd_fault) + goto error; + if (write_fault && !is_dirty_gpte(pte)) { bool ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - mark_page_dirty(vcpu->kvm, table_gfn); ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, pte|PT_DIRTY_MASK); if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_DIRTY_MASK; walker->ptes[walker->level - 1] = pte; } @@ -229,22 +242,18 @@ walk: walker->pt_access = pt_access; walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __func__, (u64)pte, pt_access, pte_access); + __func__, (u64)pte, pte_access, pt_access); return 1; -not_present: +error: walker->error_code = 0; - goto err; - -access_error: - walker->error_code = PFERR_PRESENT_MASK; - -err: + if (present) + walker->error_code |= PFERR_PRESENT_MASK; if (write_fault) walker->error_code |= PFERR_WRITE_MASK; if (user_fault) walker->error_code |= PFERR_USER_MASK; - if (fetch_fault) + if (fetch_fault && is_nx(vcpu)) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; @@ -252,7 +261,7 @@ err: return 0; } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, +static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { pt_element_t gpte; @@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { if (!is_present_gpte(gpte)) { - if (page->unsync) + if (sp->unsync) new_spte = shadow_trap_nonpresent_pte; else new_spte = shadow_notrap_nonpresent_pte; @@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; } pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) return; pfn = vcpu->arch.update_pte.pfn; @@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, * we call mmu_set_spte() with reset_host_protection = true beacuse that * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). */ - mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, + is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true, true); } +static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, + struct guest_walker *gw, int level) +{ + int r; + pt_element_t curr_pte; + + r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], + &curr_pte, sizeof(curr_pte)); + return r || curr_pte != gw->ptes[level - 1]; +} + /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ @@ -299,75 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; - struct kvm_mmu_page *shadow_page; - u64 spte, *sptep = NULL; - int direct; - gfn_t table_gfn; - int r; - int level; - pt_element_t curr_pte; - struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp = NULL; + bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); + int top_level; + unsigned direct_access; + struct kvm_shadow_walk_iterator it; if (!is_present_gpte(gw->ptes[gw->level - 1])) return NULL; - for_each_shadow_entry(vcpu, addr, iterator) { - level = iterator.level; - sptep = iterator.sptep; - if (iterator.level == hlevel) { - mmu_set_spte(vcpu, sptep, access, - gw->pte_access & access, - user_fault, write_fault, - gw->ptes[gw->level-1] & PT_DIRTY_MASK, - ptwrite, level, - gw->gfn, pfn, false, true); - break; - } + direct_access = gw->pt_access & gw->pte_access; + if (!dirty) + direct_access &= ~ACC_WRITE_MASK; - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) - continue; + top_level = vcpu->arch.mmu.root_level; + if (top_level == PT32E_ROOT_LEVEL) + top_level = PT32_ROOT_LEVEL; + /* + * Verify that the top-level gpte is still there. Since the page + * is a root page, it is either write protected (and cannot be + * changed from now on) or it is invalid (in which case, we don't + * really care if it changes underneath us after this point). + */ + if (FNAME(gpte_changed)(vcpu, gw, top_level)) + goto out_gpte_changed; - if (is_large_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu->kvm); - } + for (shadow_walk_init(&it, vcpu, addr); + shadow_walk_okay(&it) && it.level > gw->level; + shadow_walk_next(&it)) { + gfn_t table_gfn; - if (level <= gw->level) { - int delta = level - gw->level + 1; - direct = 1; - if (!is_dirty_gpte(gw->ptes[level - delta])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - delta]); - /* advance table_gfn when emulating 1gb pages with 4k */ - if (delta == 0) - table_gfn += PT_INDEX(addr, level); - access &= gw->pte_access; - } else { - direct = 0; - table_gfn = gw->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - direct, access, sptep); - if (!direct) { - r = kvm_read_guest_atomic(vcpu->kvm, - gw->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != gw->ptes[level - 2]) { - kvm_mmu_put_page(shadow_page, sptep); - kvm_release_pfn_clean(pfn); - sptep = NULL; - break; - } + drop_large_spte(vcpu, it.sptep); + + sp = NULL; + if (!is_shadow_present_pte(*it.sptep)) { + table_gfn = gw->table_gfn[it.level - 2]; + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, + false, access, it.sptep); } - spte = __pa(shadow_page->spt) - | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; + /* + * Verify that the gpte in the page we've just write + * protected is still there. + */ + if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) + goto out_gpte_changed; + + if (sp) + link_shadow_page(it.sptep, sp); } - return sptep; + for (; + shadow_walk_okay(&it) && it.level > hlevel; + shadow_walk_next(&it)) { + gfn_t direct_gfn; + + validate_direct_spte(vcpu, it.sptep, direct_access); + + drop_large_spte(vcpu, it.sptep); + + if (is_shadow_present_pte(*it.sptep)) + continue; + + direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + + sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, + true, direct_access, it.sptep); + link_shadow_page(it.sptep, sp); + } + + mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, + user_fault, write_fault, dirty, ptwrite, it.level, + gw->gfn, pfn, false, true); + + return it.sptep; + +out_gpte_changed: + if (sp) + kvm_mmu_put_page(sp, it.sptep); + kvm_release_pfn_clean(pfn); + return NULL; } /* @@ -431,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); /* mmio */ - if (is_error_pfn(pfn)) { - pgprintk("gfn %lx is mmio\n", walker.gfn); - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -443,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn); + (void)sptep; pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, sptep, *sptep, write_pt); @@ -464,6 +493,7 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; + struct kvm_mmu_page *sp; gpa_t pte_gpa = -1; int level; u64 *sptep; @@ -475,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) level = iterator.level; sptep = iterator.sptep; + sp = page_header(__pa(sptep)); if (is_last_spte(*sptep, level)) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); int offset, shift; + if (!sp->unsync) + break; + shift = PAGE_SHIFT - (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; offset = sp->role.quadrant << shift; @@ -487,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { - rmap_remove(vcpu->kvm, sptep); if (is_large_pte(*sptep)) --vcpu->kvm->stat.lpages; + drop_spte(vcpu->kvm, sptep, + shadow_trap_nonpresent_pte); need_flush = 1; - } - __set_spte(sptep, shadow_trap_nonpresent_pte); + } else + __set_spte(sptep, shadow_trap_nonpresent_pte); break; } - if (!is_shadow_present_pte(*sptep)) + if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) break; } @@ -570,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, * Using the cached information from sp->gfns is safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. - * - Alias changes zap the entire shadow cache. */ -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + bool clear_unsync) { int i, offset, nr_present; bool reset_host_protection; @@ -580,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) offset = nr_present = 0; + /* direct kvm_mmu_page can not be unsync. */ + BUG_ON(sp->role.direct); + if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; @@ -589,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) unsigned pte_access; pt_element_t gpte; gpa_t pte_gpa; - gfn_t gfn = sp->gfns[i]; + gfn_t gfn; if (!is_shadow_present_pte(sp->spt[i])) continue; @@ -600,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || - !(gpte & PT_ACCESSED_MASK)) { + gfn = gpte_to_gfn(gpte); + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) + || gfn != sp->gfns[i] || !is_present_gpte(gpte) + || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; - rmap_remove(vcpu->kvm, &sp->spt[i]); - if (is_present_gpte(gpte)) + if (is_present_gpte(gpte) || !clear_unsync) nonpresent = shadow_trap_nonpresent_pte; else nonpresent = shadow_notrap_nonpresent_pte; - __set_spte(&sp->spt[i], nonpresent); + drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); continue; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd26..bc5b9b8d4a33 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4,6 +4,7 @@ * AMD SVM support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -130,7 +131,7 @@ static struct svm_direct_access_msrs { u32 index; /* Index of the MSR */ bool always; /* True if intercept is always on */ } direct_access_msrs[] = { - { .index = MSR_K6_STAR, .always = true }, + { .index = MSR_STAR, .always = true }, { .index = MSR_IA32_SYSENTER_CS, .always = true }, #ifdef CONFIG_X86_64 { .index = MSR_GS_BASE, .always = true }, @@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { + vcpu->arch.efer = efer; if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; - vcpu->arch.efer = efer; } static int is_external_interrupt(u32 info) @@ -383,8 +384,7 @@ static void svm_init_erratum_383(void) int err; u64 val; - /* Only Fam10h is affected */ - if (boot_cpu_data.x86 != 0x10) + if (!cpu_has_amd_erratum(amd_erratum_383)) return; /* Use _safe variants to not break nested virtualization */ @@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void) if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); - kvm_enable_efer_bits(EFER_SVME); + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } for_each_possible_cpu(cpu) { @@ -806,7 +806,7 @@ static void init_vmcb(struct vcpu_svm *svm) * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); save->cr4 = X86_CR4_PAE; /* rdx = ?? */ @@ -903,13 +903,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->asid_generation = 0; init_vmcb(svm); - fx_init(&svm->vcpu); + err = fx_init(&svm->vcpu); + if (err) + goto free_page4; + svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; +free_page4: + __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: @@ -1488,7 +1493,7 @@ static void svm_handle_mce(struct vcpu_svm *svm) */ pr_err("KVM: Guest triggered AMD Erratum 383\n"); - set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); return; } @@ -1535,7 +1540,7 @@ static int io_interception(struct vcpu_svm *svm) string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; if (string || in) - return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; @@ -1957,7 +1962,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cr3 = hsave->save.cr3; svm->vcpu.arch.cr3 = hsave->save.cr3; } else { - kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); } kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); @@ -2080,7 +2085,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.cr3 = nested_vmcb->save.cr3; svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; } else - kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); /* Guest paging mode is active - reset mmu */ kvm_mmu_reset_context(&svm->vcpu); @@ -2386,16 +2391,12 @@ static int iret_interception(struct vcpu_svm *svm) static int invlpg_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); - return 1; + return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } static int emulate_on_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); - return 1; + return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } static int cr8_write_interception(struct vcpu_svm *svm) @@ -2431,7 +2432,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = tsc_offset + native_read_tsc(); break; } - case MSR_K6_STAR: + case MSR_STAR: *data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 @@ -2555,7 +2556,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) break; } - case MSR_K6_STAR: + case MSR_STAR: svm->vmcb->save.star = data; break; #ifdef CONFIG_X86_64 @@ -2726,6 +2727,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_NPF] = pf_interception, }; +void dump_vmcb(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + struct vmcb_save_area *save = &svm->vmcb->save; + + pr_err("VMCB Control Area:\n"); + pr_err("cr_read: %04x\n", control->intercept_cr_read); + pr_err("cr_write: %04x\n", control->intercept_cr_write); + pr_err("dr_read: %04x\n", control->intercept_dr_read); + pr_err("dr_write: %04x\n", control->intercept_dr_write); + pr_err("exceptions: %08x\n", control->intercept_exceptions); + pr_err("intercepts: %016llx\n", control->intercept); + pr_err("pause filter count: %d\n", control->pause_filter_count); + pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); + pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); + pr_err("tsc_offset: %016llx\n", control->tsc_offset); + pr_err("asid: %d\n", control->asid); + pr_err("tlb_ctl: %d\n", control->tlb_ctl); + pr_err("int_ctl: %08x\n", control->int_ctl); + pr_err("int_vector: %08x\n", control->int_vector); + pr_err("int_state: %08x\n", control->int_state); + pr_err("exit_code: %08x\n", control->exit_code); + pr_err("exit_info1: %016llx\n", control->exit_info_1); + pr_err("exit_info2: %016llx\n", control->exit_info_2); + pr_err("exit_int_info: %08x\n", control->exit_int_info); + pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); + pr_err("nested_ctl: %lld\n", control->nested_ctl); + pr_err("nested_cr3: %016llx\n", control->nested_cr3); + pr_err("event_inj: %08x\n", control->event_inj); + pr_err("event_inj_err: %08x\n", control->event_inj_err); + pr_err("lbr_ctl: %lld\n", control->lbr_ctl); + pr_err("next_rip: %016llx\n", control->next_rip); + pr_err("VMCB State Save Area:\n"); + pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", + save->es.selector, save->es.attrib, + save->es.limit, save->es.base); + pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", + save->cs.selector, save->cs.attrib, + save->cs.limit, save->cs.base); + pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", + save->ss.selector, save->ss.attrib, + save->ss.limit, save->ss.base); + pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", + save->ds.selector, save->ds.attrib, + save->ds.limit, save->ds.base); + pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", + save->fs.selector, save->fs.attrib, + save->fs.limit, save->fs.base); + pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", + save->gs.selector, save->gs.attrib, + save->gs.limit, save->gs.base); + pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->gdtr.selector, save->gdtr.attrib, + save->gdtr.limit, save->gdtr.base); + pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->ldtr.selector, save->ldtr.attrib, + save->ldtr.limit, save->ldtr.base); + pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", + save->idtr.selector, save->idtr.attrib, + save->idtr.limit, save->idtr.base); + pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", + save->tr.selector, save->tr.attrib, + save->tr.limit, save->tr.base); + pr_err("cpl: %d efer: %016llx\n", + save->cpl, save->efer); + pr_err("cr0: %016llx cr2: %016llx\n", + save->cr0, save->cr2); + pr_err("cr3: %016llx cr4: %016llx\n", + save->cr3, save->cr4); + pr_err("dr6: %016llx dr7: %016llx\n", + save->dr6, save->dr7); + pr_err("rip: %016llx rflags: %016llx\n", + save->rip, save->rflags); + pr_err("rsp: %016llx rax: %016llx\n", + save->rsp, save->rax); + pr_err("star: %016llx lstar: %016llx\n", + save->star, save->lstar); + pr_err("cstar: %016llx sfmask: %016llx\n", + save->cstar, save->sfmask); + pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", + save->kernel_gs_base, save->sysenter_cs); + pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", + save->sysenter_esp, save->sysenter_eip); + pr_err("gpat: %016llx dbgctl: %016llx\n", + save->g_pat, save->dbgctl); + pr_err("br_from: %016llx br_to: %016llx\n", + save->br_from, save->br_to); + pr_err("excp_from: %016llx excp_to: %016llx\n", + save->last_excp_from, save->last_excp_to); + +} + static int handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2770,6 +2864,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; + pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); + dump_vmcb(vcpu); return 0; } @@ -2826,9 +2922,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; - trace_kvm_inj_virq(irq); - - ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; @@ -2842,6 +2935,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) BUG_ON(!(gif_set(svm))); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + ++vcpu->stat.irq_injections; + svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } @@ -3327,6 +3423,11 @@ static bool svm_rdtscp_supported(void) return false; } +static bool svm_has_wbinvd_exit(void) +{ + return true; +} + static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3411,6 +3512,8 @@ static struct kvm_x86_ops svm_x86_ops = { .rdtscp_supported = svm_rdtscp_supported, .set_supported_cpuid = svm_set_supported_cpuid, + + .has_wbinvd_exit = svm_has_wbinvd_exit, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index 4ddadb1a5ffe..e16a0dbe74d8 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -1,3 +1,17 @@ +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables machines with Intel VT-x extensions to run virtual + * machines without emulation or binary translation. + * + * timer support + * + * Copyright 2010 Red Hat, Inc. and/or its affilates. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/hrtimer.h> @@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) if (ktimer->reinject || !atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); /* FIXME: this code should not know anything about vcpus */ - set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); } if (waitqueue_active(q)) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee03679efe78..49b25eee25ac 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5,6 +5,7 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -36,6 +37,8 @@ #include <asm/vmx.h> #include <asm/virtext.h> #include <asm/mce.h> +#include <asm/i387.h> +#include <asm/xcr.h> #include "trace.h" @@ -63,6 +66,9 @@ module_param_named(unrestricted_guest, static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +static int __read_mostly vmm_exclusive = 1; +module_param(vmm_exclusive, bool, S_IRUGO); + #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) #define KVM_GUEST_CR0_MASK \ @@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); +static void kvm_cpu_vmxon(u64 addr); +static void kvm_cpu_vmxoff(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); +static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; @@ -231,14 +240,14 @@ static u64 host_efer; static void ept_save_pdptrs(struct kvm_vcpu *vcpu); /* - * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it + * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. */ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif - MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR, + MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) @@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void) return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; } +static inline bool cpu_has_vmx_ept_4levels(void) +{ + return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; +} + static inline bool cpu_has_vmx_invept_individual_addr(void) { return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; @@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void) return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; } +static inline bool cpu_has_vmx_invvpid_single(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; +} + +static inline bool cpu_has_vmx_invvpid_global(void) +{ + return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void) return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; } +static inline bool cpu_has_vmx_wbinvd_exit(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_WBINVD_EXITING; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs) vmcs, phys_addr); } +static void vmcs_load(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vmcs, phys_addr); +} + static void __vcpu_clear(void *arg) { struct vcpu_vmx *vmx = arg; @@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx) smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); } -static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) { if (vmx->vpid == 0) return; - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + if (cpu_has_vmx_invvpid_single()) + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); +} + +static inline void vpid_sync_vcpu_global(void) +{ + if (cpu_has_vmx_invvpid_global()) + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); +} + +static inline void vpid_sync_context(struct vcpu_vmx *vmx) +{ + if (cpu_has_vmx_invvpid_single()) + vpid_sync_vcpu_single(vmx); + else + vpid_sync_vcpu_global(); } static inline void ept_sync_global(void) @@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); } #endif + if (current_thread_info()->status & TS_USEDFPU) + clts(); + load_gdt(&__get_cpu_var(host_gdt)); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(vmx->vmcs); u64 tsc_this, delta, new_offset; + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - if (vcpu->cpu != cpu) { + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + else if (vcpu->cpu != cpu) vcpu_clear(vmx); - kvm_migrate_timers(vcpu); - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); - local_irq_disable(); - list_add(&vmx->local_vcpus_link, - &per_cpu(vcpus_on_cpu, cpu)); - local_irq_enable(); - } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { - u8 error; - per_cpu(current_vmcs, cpu) = vmx->vmcs; - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc"); - if (error) - printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", - vmx->vmcs, phys_addr); + vmcs_load(vmx->vmcs); } if (vcpu->cpu != cpu) { struct desc_ptr dt; unsigned long sysenter_esp; + kvm_migrate_timers(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + local_irq_disable(); + list_add(&vmx->local_vcpus_link, + &per_cpu(vcpus_on_cpu, cpu)); + local_irq_enable(); + vcpu->cpu = cpu; /* * Linux uses per-cpu TSS and GDT, so set these when switching @@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { __vmx_load_host_state(to_vmx(vcpu)); + if (!vmm_exclusive) { + __vcpu_clear(to_vmx(vcpu)); + kvm_cpu_vmxoff(); + } } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -1057,10 +1117,10 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0 && vmx->rdtscp_enabled) move_msr_up(vmx, index, save_nmsrs++); /* - * MSR_K6_STAR is only needed on long mode guests, and only + * MSR_STAR is only needed on long mode guests, and only * if efer.sce is enabled. */ - index = __find_msr_index(vmx, MSR_K6_STAR); + index = __find_msr_index(vmx, MSR_STAR); if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) move_msr_up(vmx, index, save_nmsrs++); } @@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void) /* locked but not enabled */ } +static void kvm_cpu_vmxon(u64 addr) +{ + asm volatile (ASM_VMX_VMXON_RAX + : : "a"(&addr), "m"(addr) + : "memory", "cc"); +} + static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); @@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage) wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&phys_addr), "m"(phys_addr) - : "memory", "cc"); - ept_sync_global(); + if (vmm_exclusive) { + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); + } + + store_gdt(&__get_cpu_var(host_gdt)); return 0; } @@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void) static void kvm_cpu_vmxoff(void) { asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); - write_cr4(read_cr4() & ~X86_CR4_VMXE); } static void hardware_disable(void *garbage) { - vmclear_local_vcpus(); - kvm_cpu_vmxoff(); + if (vmm_exclusive) { + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); + } + write_cr4(read_cr4() & ~X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -1539,7 +1610,8 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_ept()) { + if (!cpu_has_vmx_ept() || + !cpu_has_vmx_ept_4levels()) { enable_ept = 0; enable_unrestricted_guest = 0; } @@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) gfn_t base_gfn; slots = kvm_memslots(kvm); - base_gfn = kvm->memslots->memslots[0].base_gfn + + base_gfn = slots->memslots[0].base_gfn + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } @@ -1759,9 +1831,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { - vpid_sync_vcpu_all(to_vmx(vcpu)); - if (enable_ept) + vpid_sync_context(to_vmx(vcpu)); + if (enable_ept) { + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + } } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) @@ -2507,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ - vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ + vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ @@ -2599,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) static int init_rmode(struct kvm *kvm) { + int idx, ret = 0; + + idx = srcu_read_lock(&kvm->srcu); if (!init_rmode_tss(kvm)) - return 0; + goto exit; if (!init_rmode_identity_map(kvm)) - return 0; - return 1; + goto exit; + + ret = 1; +exit: + srcu_read_unlock(&kvm->srcu, idx); + return ret; } static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 msr; - int ret, idx; + int ret; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - idx = srcu_read_lock(&vcpu->kvm->srcu); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -2630,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); - fx_init(&vmx->vcpu); + ret = fx_init(&vmx->vcpu); + if (ret != 0) + goto out; seg_setup(VCPU_SREG_CS); /* @@ -2713,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); - vpid_sync_vcpu_all(vmx); + vpid_sync_context(vmx); ret = 0; @@ -2721,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->emulation_required = 0; out: - srcu_read_unlock(&vcpu->kvm->srcu, idx); return ret; } @@ -2826,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { if (!cpu_has_virtual_nmis()) return to_vmx(vcpu)->soft_vnmi_blocked; - else - return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - GUEST_INTR_STATE_NMI); + return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; } static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) @@ -3070,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu) ++vcpu->stat.io_exits; if (string || in) - return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -3090,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } +static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) + kvm_inject_gp(vcpu, 0); + else + skip_emulated_instruction(vcpu); +} + static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; int cr; int reg; + int err; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -3105,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(cr, val); switch (cr) { case 0: - kvm_set_cr0(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr0(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 3: - kvm_set_cr3(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr3(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 4: - kvm_set_cr4(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr4(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); @@ -3321,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); - /* TODO: Add support for VT-d/pass-through device */ + kvm_emulate_wbinvd(vcpu); return 1; } -static int handle_apic_access(struct kvm_vcpu *vcpu) +static int handle_xsetbv(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification; - enum emulation_result er; - unsigned long offset; + u64 new_bv = kvm_read_edx_eax(vcpu); + u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - offset = exit_qualification & 0xffful; - - er = emulate_instruction(vcpu, 0, 0, 0); - - if (er != EMULATE_DONE) { - printk(KERN_ERR - "Fail to handle apic access vmexit! Offset is 0x%lx\n", - offset); - return -ENOEXEC; - } + if (kvm_set_xcr(vcpu, index, new_bv) == 0) + skip_emulated_instruction(vcpu); return 1; } +static int handle_apic_access(struct kvm_vcpu *vcpu) +{ + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; +} + static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3554,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } - if (err != EMULATE_DONE) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - ret = 0; - goto out; - } + if (err != EMULATE_DONE) + return 0; if (signal_pending(current)) goto out; @@ -3623,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, @@ -3656,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) if (enable_ept && is_paging(vcpu)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = exit_reason; + return 0; + } + if (unlikely(vmx->fail)) { vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason @@ -3861,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - /* - * Loading guest fpu may have cleared host cr0.ts - */ - vmcs_writel(HOST_CR0, read_cr0()); - asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -4001,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vmx); } +static inline void vmcs_init(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); + + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + + vmcs_clear(vmcs); + + if (!vmm_exclusive) + kvm_cpu_vmxoff(); +} + static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; @@ -4026,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx->vmcs) goto free_msrs; - vmcs_clear(vmx->vmcs); + vmcs_init(vmx->vmcs); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); @@ -4265,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .rdtscp_supported = vmx_rdtscp_supported, .set_supported_cpuid = vmx_set_supported_cpuid, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fa89c39c64f..25f19078b321 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6,6 +6,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -41,17 +42,19 @@ #include <linux/srcu.h> #include <linux/slab.h> #include <linux/perf_event.h> +#include <linux/uaccess.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS #include "trace.h" #include <asm/debugreg.h> -#include <asm/uaccess.h> #include <asm/msr.h> #include <asm/desc.h> #include <asm/mtrr.h> #include <asm/mce.h> +#include <asm/i387.h> +#include <asm/xcr.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -62,6 +65,7 @@ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXSAVE \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) @@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +u64 __read_mostly host_xcr0; + +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; @@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, prev_nr = vcpu->arch.exception.nr; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } class1 = exception_class(prev_nr); @@ -414,121 +425,163 @@ out: return changed; } -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | + X86_CR0_CD | X86_CR0_NW; + cr0 |= X86_CR0_ET; #ifdef CONFIG_X86_64 - if (cr0 & 0xffffffff00000000UL) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr0 & 0xffffffff00000000UL) + return 1; #endif cr0 &= ~CR0_RESERVED_BITS; - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return 1; - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return 1; if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 if ((vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; - if (!is_pae(vcpu)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!is_pae(vcpu)) + return 1; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) { - kvm_inject_gp(vcpu, 0); - return; - - } + if (cs_l) + return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } - + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) + return 1; } kvm_x86_ops->set_cr0(vcpu, cr0); - kvm_mmu_reset_context(vcpu); - return; + if ((cr0 ^ old_cr0) & update_bits) + kvm_mmu_reset_context(vcpu); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); + (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + u64 xcr0; - if (cr4 & CR4_RESERVED_BITS) { + /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + xcr0 = xcr; + if (kvm_x86_ops->get_cpl(vcpu) != 0) + return 1; + if (!(xcr0 & XSTATE_FP)) + return 1; + if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) + return 1; + if (xcr0 & ~host_xcr0) + return 1; + vcpu->arch.xcr0 = xcr0; + vcpu->guest_xcr0_loaded = 0; + return 0; +} + +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + if (__kvm_set_xcr(vcpu, index, xcr)) { kvm_inject_gp(vcpu, 0); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_xcr); + +static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_XSAVE)); +} + +static void update_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (!best) return; + + /* Update OSXSAVE bit */ + if (cpu_has_xsave && best->function == 0x1) { + best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) + best->ecx |= bit(X86_FEATURE_OSXSAVE); } +} + +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + + if (cr4 & CR4_RESERVED_BITS) + return 1; + + if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) + return 1; if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!(cr4 & X86_CR4_PAE)) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } + && !load_pdptrs(vcpu, vcpu->arch.cr3)) + return 1; + + if (cr4 & X86_CR4_VMXE) + return 1; - if (cr4 & X86_CR4_VMXE) { - kvm_inject_gp(vcpu, 0); - return; - } kvm_x86_ops->set_cr4(vcpu, cr4); - vcpu->arch.cr4 = cr4; - kvm_mmu_reset_context(vcpu); + + if ((cr4 ^ old_cr4) & pdptr_bits) + kvm_mmu_reset_context(vcpu); + + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + update_cpuid(vcpu); + + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr4); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); - return; + return 0; } if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; } else { if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_PAE_RESERVED_BITS) + return 1; + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) + return 1; } /* * We don't check reserved bits in nonpae mode, because @@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) * to debug) behavior on the guest side. */ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - kvm_inject_gp(vcpu, 0); - else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); - } + return 1; + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { - if (cr8 & CR8_RESERVED_BITS) { - kvm_inject_gp(vcpu, 0); - return; - } + if (cr8 & CR8_RESERVED_BITS) + return 1; if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; + return 0; +} + +void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (__kvm_set_cr8(vcpu, cr8)) + kvm_inject_gp(vcpu, 0); } EXPORT_SYMBOL_GPL(kvm_set_cr8); @@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { case 0 ... 3: @@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) vcpu->arch.eff_db[dr] = val; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ /* fall through */ case 6: - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) + return 1; /* #UD */ /* fall through */ default: /* 7 */ - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } + if (val & 0xffffffff00000000ULL) + return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); @@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } + +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ + int res; + + res = __kvm_set_dr(vcpu, dr, val); + if (res > 0) + kvm_queue_exception(vcpu, UD_VECTOR); + else if (res < 0) + kvm_inject_gp(vcpu, 0); + + return res; +} EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { switch (dr) { case 0 ... 3: *val = vcpu->arch.db[dr]; break; case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return 1; - } /* fall through */ case 6: *val = vcpu->arch.dr6; break; case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) return 1; - } /* fall through */ default: /* 7 */ *val = vcpu->arch.dr7; @@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) return 0; } -EXPORT_SYMBOL_GPL(kvm_get_dr); -static inline u32 bit(int bitno) +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { - return 1 << (bitno & 31); + if (_kvm_get_dr(vcpu, dr, val)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + return 0; } +EXPORT_SYMBOL_GPL(kvm_get_dr); /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS @@ -671,7 +733,7 @@ static u32 msrs_to_save[] = { HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, - MSR_K6_STAR, + MSR_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif @@ -682,10 +744,14 @@ static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, }; static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { + u64 old_efer = vcpu->arch.efer; + if (efer & efer_reserved_bits) return 1; @@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); - vcpu->arch.efer = efer; - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); + /* Update reserved bits */ + if ((efer ^ old_efer) & EFER_NX) + kvm_mmu_reset_context(vcpu); + return 0; } @@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v) if (!vcpu->time_page) return 0; - set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); + kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); return 1; } @@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, { int i, idx; - vcpu_load(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; srcu_read_unlock(&vcpu->kvm->srcu, idx); - vcpu_put(vcpu); - return i; } @@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: + case KVM_CAP_XSAVE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; + case KVM_CAP_XCRS: + r = cpu_has_xsave; + break; default: r = 0; break; @@ -1717,8 +1785,28 @@ out: return r; } +static void wbinvd_ipi(void *garbage) +{ + wbinvd(); +} + +static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.iommu_domain && + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { + /* Address WBINVD may be executed by guest */ + if (need_emulate_wbinvd(vcpu)) { + if (kvm_x86_ops->has_wbinvd_exit()) + cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); + else if (vcpu->cpu != -1 && vcpu->cpu != cpu) + smp_call_function_single(vcpu->cpu, + wbinvd_ipi, NULL, 1); + } + kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { unsigned long khz = cpufreq_quick_get(cpu); @@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_put_guest_fpu(vcpu); kvm_x86_ops->vcpu_put(vcpu); + kvm_put_guest_fpu(vcpu); } static int is_efer_nx(void) @@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, if (copy_from_user(cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out_free; - vcpu_load(vcpu); for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, r = 0; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); + update_cpuid(vcpu); out_free: vfree(cpuid_entries); @@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, if (copy_from_user(&vcpu->arch.cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; - vcpu_load(vcpu); vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); kvm_x86_ops->cpuid_update(vcpu); - vcpu_put(vcpu); + update_cpuid(vcpu); return 0; out: @@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, { int r; - vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, out: cpuid->nent = vcpu->arch.cpuid_nent; - vcpu_put(vcpu); return r; } @@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = - F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, XSAVE, OSXSAVE */; + 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | @@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xb); + entry->eax = min(entry->eax, (u32)0xd); break; case 1: entry->edx &= kvm_supported_word0_x86_features; @@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 0xd: { + int i; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (i = 1; *nent < maxnent; ++i) { + if (entry[i - 1].eax == 0 && i != 2) + break; + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + } + break; + } case KVM_CPUID_SIGNATURE: { char signature[12] = "KVMKVMKVM\0\0"; u32 *sigptr = (u32 *)signature; @@ -2081,9 +2179,7 @@ out: static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); - vcpu_put(vcpu); return 0; } @@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - vcpu_load(vcpu); memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); kvm_apic_post_state_restore(vcpu); update_cr8_intercept(vcpu); - vcpu_put(vcpu); return 0; } @@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -EINVAL; if (irqchip_in_kernel(vcpu->kvm)) return -ENXIO; - vcpu_load(vcpu); kvm_queue_interrupt(vcpu, irq->irq, false); - vcpu_put(vcpu); - return 0; } static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); kvm_inject_nmi(vcpu); - vcpu_put(vcpu); return 0; } @@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, int r; unsigned bank_num = mcg_cap & 0xff, bank; - vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: - vcpu_put(vcpu); return r; } @@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } if (banks[1] & MCI_STATUS_VAL) @@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { - vcpu_load(vcpu); - events->exception.injected = vcpu->arch.exception.pending && !kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, @@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; - vcpu_load(vcpu); - vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) vcpu->arch.sipi_vector = events->sipi_vector; - vcpu_put(vcpu); - return 0; } static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, struct kvm_debugregs *dbgregs) { - vcpu_load(vcpu); - memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; - - vcpu_put(vcpu); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (dbgregs->flags) return -EINVAL; - vcpu_load(vcpu); - memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = dbgregs->dr6; vcpu->arch.dr7 = dbgregs->dr7; - vcpu_put(vcpu); + return 0; +} + +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + if (cpu_has_xsave) + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->xsave, + sizeof(struct xsave_struct)); + else { + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->fxsave, + sizeof(struct i387_fxsave_struct)); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = + XSTATE_FPSSE; + } +} + +static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + u64 xstate_bv = + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + if (cpu_has_xsave) + memcpy(&vcpu->arch.guest_fpu.state->xsave, + guest_xsave->region, sizeof(struct xsave_struct)); + else { + if (xstate_bv & ~XSTATE_FPSSE) + return -EINVAL; + memcpy(&vcpu->arch.guest_fpu.state->fxsave, + guest_xsave->region, sizeof(struct i387_fxsave_struct)); + } return 0; } +static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + if (!cpu_has_xsave) { + guest_xcrs->nr_xcrs = 0; + return; + } + + guest_xcrs->nr_xcrs = 1; + guest_xcrs->flags = 0; + guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; + guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; +} + +static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + int i, r = 0; + + if (!cpu_has_xsave) + return -EINVAL; + + if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) + return -EINVAL; + + for (i = 0; i < guest_xcrs->nr_xcrs; i++) + /* Only support XCR0 currently */ + if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, + guest_xcrs->xcrs[0].value); + break; + } + if (r) + r = -EINVAL; + return r; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; - struct kvm_lapic_state *lapic = NULL; + union { + struct kvm_lapic_state *lapic; + struct kvm_xsave *xsave; + struct kvm_xcrs *xcrs; + void *buffer; + } u; + u.buffer = NULL; switch (ioctl) { case KVM_GET_LAPIC: { r = -EINVAL; if (!vcpu->arch.apic) goto out; - lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!lapic) + if (!u.lapic) goto out; - r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); + r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) + if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; @@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; if (!vcpu->arch.apic) goto out; - lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; - if (!lapic) + if (!u.lapic) goto out; r = -EFAULT; - if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) + if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); + r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); if (r) goto out; r = 0; @@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&mce, argp, sizeof mce)) goto out; - vcpu_load(vcpu); r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); - vcpu_put(vcpu); break; } case KVM_GET_VCPU_EVENTS: { @@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); break; } + case KVM_GET_XSAVE: { + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!u.xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); + + r = -EFAULT; + if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) + break; + r = 0; + break; + } + case KVM_SET_XSAVE: { + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!u.xsave) + break; + + r = -EFAULT; + if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) + break; + + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); + break; + } + case KVM_GET_XCRS: { + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!u.xcrs) + break; + + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); + + r = -EFAULT; + if (copy_to_user(argp, u.xcrs, + sizeof(struct kvm_xcrs))) + break; + r = 0; + break; + } + case KVM_SET_XCRS: { + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!u.xcrs) + break; + + r = -EFAULT; + if (copy_from_user(u.xcrs, argp, + sizeof(struct kvm_xcrs))) + break; + + r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); + break; + } default: r = -EINVAL; } out: - kfree(lapic); + kfree(u.buffer); return r; } @@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) return kvm->arch.n_alloc_mmu_pages; } -gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; - - aliases = kvm_aliases(kvm); - - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (alias->flags & KVM_ALIAS_INVALID) - continue; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) -{ - int i; - struct kvm_mem_alias *alias; - struct kvm_mem_aliases *aliases; - - aliases = kvm_aliases(kvm); - - for (i = 0; i < aliases->naliases; ++i) { - alias = &aliases->aliases[i]; - if (gfn >= alias->base_gfn - && gfn < alias->base_gfn + alias->npages) - return alias->target_gfn + gfn - alias->base_gfn; - } - return gfn; -} - -/* - * Set a new alias region. Aliases map a portion of physical memory into - * another portion. This is useful for memory windows, for example the PC - * VGA region. - */ -static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, - struct kvm_memory_alias *alias) -{ - int r, n; - struct kvm_mem_alias *p; - struct kvm_mem_aliases *aliases, *old_aliases; - - r = -EINVAL; - /* General sanity checks */ - if (alias->memory_size & (PAGE_SIZE - 1)) - goto out; - if (alias->guest_phys_addr & (PAGE_SIZE - 1)) - goto out; - if (alias->slot >= KVM_ALIAS_SLOTS) - goto out; - if (alias->guest_phys_addr + alias->memory_size - < alias->guest_phys_addr) - goto out; - if (alias->target_phys_addr + alias->memory_size - < alias->target_phys_addr) - goto out; - - r = -ENOMEM; - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!aliases) - goto out; - - mutex_lock(&kvm->slots_lock); - - /* invalidate any gfn reference in case of deletion/shrinking */ - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; - old_aliases = kvm->arch.aliases; - rcu_assign_pointer(kvm->arch.aliases, aliases); - synchronize_srcu_expedited(&kvm->srcu); - kvm_mmu_zap_all(kvm); - kfree(old_aliases); - - r = -ENOMEM; - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!aliases) - goto out_unlock; - - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - - p = &aliases->aliases[alias->slot]; - p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; - p->npages = alias->memory_size >> PAGE_SHIFT; - p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; - p->flags &= ~(KVM_ALIAS_INVALID); - - for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (aliases->aliases[n - 1].npages) - break; - aliases->naliases = n; - - old_aliases = kvm->arch.aliases; - rcu_assign_pointer(kvm->arch.aliases, aliases); - synchronize_srcu_expedited(&kvm->srcu); - kfree(old_aliases); - r = 0; - -out_unlock: - mutex_unlock(&kvm->slots_lock); -out: - return r; -} - static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { int r; @@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot; unsigned long n; unsigned long is_dirty = 0; - unsigned long *dirty_bitmap = NULL; mutex_lock(&kvm->slots_lock); @@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, n = kvm_dirty_bitmap_bytes(memslot); - r = -ENOMEM; - dirty_bitmap = vmalloc(n); - if (!dirty_bitmap) - goto out; - memset(dirty_bitmap, 0, n); - for (i = 0; !is_dirty && i < n/sizeof(long); i++) is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { struct kvm_memslots *slots, *old_slots; + unsigned long *dirty_bitmap; spin_lock(&kvm->mmu_lock); kvm_mmu_slot_remove_write_access(kvm, log->slot); spin_unlock(&kvm->mmu_lock); - slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); - if (!slots) - goto out_free; + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) + goto out; + memset(dirty_bitmap, 0, n); + r = -ENOMEM; + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) { + vfree(dirty_bitmap); + goto out; + } memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; @@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, synchronize_srcu_expedited(&kvm->srcu); dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { + vfree(dirty_bitmap); + goto out; + } + vfree(dirty_bitmap); + } else { + r = -EFAULT; + if (clear_user(log->dirty_bitmap, n)) + goto out; } r = 0; - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) - r = -EFAULT; -out_free: - vfree(dirty_bitmap); out: mutex_unlock(&kvm->slots_lock); return r; @@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp, union { struct kvm_pit_state ps; struct kvm_pit_state2 ps2; - struct kvm_memory_alias alias; struct kvm_pit_config pit_config; } u; @@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; } - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - struct kvm_userspace_memory_region kvm_userspace_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - kvm_userspace_mem.slot = kvm_mem.slot; - kvm_userspace_mem.flags = kvm_mem.flags; - kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; - kvm_userspace_mem.memory_size = kvm_mem.memory_size; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); - if (r) - goto out; - break; - } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); if (r) @@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_SET_MEMORY_ALIAS: - r = -EFAULT; - if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) - goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); - if (r) - goto out; - break; case KVM_CREATE_IRQCHIP: { struct kvm_pic *vpic; @@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, } ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; + r = X86EMUL_IO_NEEDED; goto out; } @@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, } ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); if (ret < 0) { - r = X86EMUL_UNHANDLEABLE; + r = X86EMUL_IO_NEEDED; goto out; } @@ -3330,10 +3407,10 @@ out: static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; - u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); + if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3370,11 +3445,12 @@ mmio: trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 0; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; - return X86EMUL_UNHANDLEABLE; + return X86EMUL_IO_NEEDED; } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, static int emulator_write_emulated_onepage(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; - u32 error_code; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); - if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, error_code); + if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -3420,10 +3494,11 @@ mmio: return X86EMUL_CONTINUE; vcpu->mmio_needed = 1; - vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->mmio_is_write = 1; - memcpy(vcpu->mmio_data, val, bytes); + vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; + vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; + memcpy(vcpu->run->mmio.data, val, bytes); return X86EMUL_CONTINUE; } @@ -3431,6 +3506,7 @@ mmio: int emulator_write_emulated(unsigned long addr, const void *val, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { /* Crossing a page boundary? */ @@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr, int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, vcpu); + rc = emulator_write_emulated_onepage(addr, val, now, error_code, + vcpu); if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, vcpu); + return emulator_write_emulated_onepage(addr, val, bytes, error_code, + vcpu); } -EXPORT_SYMBOL_GPL(emulator_write_emulated); #define CMPXCHG_TYPE(t, ptr, old, new) \ (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) @@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, + unsigned int *error_code, struct kvm_vcpu *vcpu) { gpa_t gpa; @@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr, goto emul_write; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (is_error_page(page)) { + kvm_release_page_clean(page); + goto emul_write; + } kaddr = kmap_atomic(page, KM_USER0); kaddr += offset_in_page(gpa); @@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(addr, new, bytes, vcpu); + return emulator_write_emulated(addr, new, bytes, error_code, vcpu); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) return X86EMUL_CONTINUE; } -int emulate_clts(struct kvm_vcpu *vcpu) +int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - kvm_x86_ops->fpu_activate(vcpu); + if (!need_emulate_wbinvd(vcpu)) + return X86EMUL_CONTINUE; + + if (kvm_x86_ops->has_wbinvd_exit()) { + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + wbinvd_ipi, NULL, 1); + cpumask_clear(vcpu->arch.wbinvd_dirty_mask); + } + wbinvd(); return X86EMUL_CONTINUE; } +EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); -int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) +int emulate_clts(struct kvm_vcpu *vcpu) { - return kvm_get_dr(ctxt->vcpu, dr, dest); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + kvm_x86_ops->fpu_activate(vcpu); + return X86EMUL_CONTINUE; } -int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) +int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) { - unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - - return kvm_set_dr(ctxt->vcpu, dr, value & mask); + return _kvm_get_dr(vcpu, dr, dest); } -void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) +int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) { - u8 opcodes[4]; - unsigned long rip = kvm_rip_read(vcpu); - unsigned long rip_linear; - - if (!printk_ratelimit()) - return; - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); - - printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", - context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); + return __kvm_set_dr(vcpu, dr, value); } -EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); static u64 mk_cr_64(u64 curr_cr, u32 new_val) { @@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) return value; } -static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) { + int res = 0; + switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); break; case 2: vcpu->arch.cr2 = val; break; case 3: - kvm_set_cr3(vcpu, val); + res = kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: - kvm_set_cr8(vcpu, val & 0xfUL); + res = __kvm_set_cr8(vcpu, val & 0xfUL); break; default: vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + res = -1; } + + return res; } static int emulator_get_cpl(struct kvm_vcpu *vcpu) @@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) kvm_x86_ops->get_gdt(vcpu, dt); } +static unsigned long emulator_get_cached_segment_base(int seg, + struct kvm_vcpu *vcpu) +{ + return get_segment_base(vcpu, seg); +} + static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, struct kvm_vcpu *vcpu) { @@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg, kvm_set_segment(vcpu, &kvm_seg, seg); } -static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -{ - kvm_x86_ops->set_rflags(vcpu, rflags); -} - static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, @@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = { .set_cached_descriptor = emulator_set_cached_descriptor, .get_segment_selector = emulator_get_segment_selector, .set_segment_selector = emulator_set_segment_selector, + .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, - .set_rflags = emulator_set_rflags, + .get_dr = emulator_get_dr, + .set_dr = emulator_set_dr, + .set_msr = kvm_set_msr, + .get_msr = kvm_get_msr, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = ~0; } +static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. The same goes for mov ss + */ + if (!(int_shadow & mask)) + kvm_x86_ops->set_interrupt_shadow(vcpu, mask); +} + +static void inject_emulated_exception(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + if (ctxt->exception == PF_VECTOR) + kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); + else if (ctxt->error_code_valid) + kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); + else + kvm_queue_exception(vcpu, ctxt->exception); +} + +static int handle_emulation_failure(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.insn_emulation_fail; + trace_kvm_emulate_insn_failed(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + kvm_queue_exception(vcpu, UD_VECTOR); + return EMULATE_FAIL; +} + +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + + if (tdp_enabled) + return false; + + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-entetr the + * guest to let CPU execute the instruction. + */ + if (kvm_mmu_unprotect_page_virt(vcpu, gva)) + return true; + + gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + + if (gpa == UNMAPPED_GVA) + return true; /* let cpu generate fault */ + + if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) + return true; + + return false; +} + int emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, u16 error_code, int emulation_type) { - int r, shadow_mask; - struct decode_cache *c; - struct kvm_run *run = vcpu->run; + int r; + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, */ cache_all_regs(vcpu); - vcpu->mmio_is_write = 0; - if (!(emulation_type & EMULTYPE_NO_DECODE)) { int cs_db, cs_l; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); @@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + vcpu->arch.emulate_ctxt.interruptibility = 0; + vcpu->arch.emulate_ctxt.exception = -1; r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD * (namely VMMCALL, sysenter, sysexit, syscall)*/ - c = &vcpu->arch.emulate_ctxt.decode; if (emulation_type & EMULTYPE_TRAP_UD) { if (!c->twobyte) return EMULATE_FAIL; @@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_SKIP) + return EMULATE_FAIL; + return handle_emulation_failure(vcpu); } } @@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } + /* this is needed for vmware backdor interface to work since it + changes registers values during IO operation */ + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; - if (r == 0) - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); + if (r) { /* emulation failed */ + if (reexecute_instruction(vcpu, cr2)) + return EMULATE_DONE; - if (vcpu->arch.pio.count) { - if (!vcpu->arch.pio.in) - vcpu->arch.pio.count = 0; - return EMULATE_DO_MMIO; + return handle_emulation_failure(vcpu); } - if (r || vcpu->mmio_is_write) { - run->exit_reason = KVM_EXIT_MMIO; - run->mmio.phys_addr = vcpu->mmio_phys_addr; - memcpy(run->mmio.data, vcpu->mmio_data, 8); - run->mmio.len = vcpu->mmio_size; - run->mmio.is_write = vcpu->mmio_is_write; + toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + + if (vcpu->arch.emulate_ctxt.exception >= 0) { + inject_emulated_exception(vcpu); + return EMULATE_DONE; } - if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - goto done; - if (!vcpu->mmio_needed) { - ++vcpu->stat.insn_emulation_fail; - trace_kvm_emulate_insn_failed(vcpu); - kvm_report_emulation_failure(vcpu, "mmio"); - return EMULATE_FAIL; - } + if (vcpu->arch.pio.count) { + if (!vcpu->arch.pio.in) + vcpu->arch.pio.count = 0; return EMULATE_DO_MMIO; } - if (vcpu->mmio_is_write) { - vcpu->mmio_needed = 0; + if (vcpu->mmio_needed) { + if (vcpu->mmio_is_write) + vcpu->mmio_needed = 0; return EMULATE_DO_MMIO; } -done: - if (vcpu->arch.exception.pending) - vcpu->arch.emulate_ctxt.restart = false; - if (vcpu->arch.emulate_ctxt.restart) goto restart; @@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque) perf_register_guest_info_callbacks(&kvm_guest_cbs); + if (cpu_has_xsave) + host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + return 0; out: @@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, vcpu); + return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); } void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) @@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) } } +static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + !vcpu->guest_xcr0_loaded) { + /* kvm_set_xcr() also depends on this */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + vcpu->guest_xcr0_loaded = 1; + } +} + +static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_xcr0_loaded) { + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + vcpu->guest_xcr0_loaded = 0; + } +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; - if (vcpu->requests) - if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) - kvm_mmu_unload(vcpu); - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - goto out; - if (vcpu->requests) { - if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) + kvm_mmu_unload(vcpu); + if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) kvm_write_guest_time(vcpu); - if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_x86_ops->tlb_flush(vcpu); - if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, - &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } - if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; r = 0; goto out; } - if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { vcpu->fpu_active = 0; kvm_x86_ops->fpu_deactivate(vcpu); } } + r = kvm_mmu_reload(vcpu); + if (unlikely(r)) + goto out; + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); if (vcpu->fpu_active) kvm_load_guest_fpu(vcpu); + kvm_load_guest_xcr0(vcpu); - local_irq_disable(); + atomic_set(&vcpu->guest_mode, 1); + smp_wmb(); - clear_bit(KVM_REQ_KICK, &vcpu->requests); - smp_mb__after_clear_bit(); + local_irq_disable(); - if (vcpu->requests || need_resched() || signal_pending(current)) { - set_bit(KVM_REQ_KICK, &vcpu->requests); + if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + || need_resched() || signal_pending(current)) { + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); local_irq_enable(); preempt_enable(); r = 1; @@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - set_bit(KVM_REQ_KICK, &vcpu->requests); + atomic_set(&vcpu->guest_mode, 0); + smp_wmb(); local_irq_enable(); ++vcpu->stat.exits; @@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: @@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; sigset_t sigsaved; - vcpu_load(vcpu); - if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r == EMULATE_DO_MMIO) { + if (r != EMULATE_DONE) { r = 0; goto out; } @@ -4759,14 +4924,11 @@ out: if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); - vcpu_put(vcpu); return r; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_get_rflags(vcpu); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { - vcpu_load(vcpu); - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; - vcpu_put(vcpu); - return 0; } @@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, { struct desc_ptr dt; - vcpu_load(vcpu); - kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); mp_state->mp_state = vcpu->arch.mp_state; - vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); vcpu->arch.mp_state = mp_state->mp_state; - vcpu_put(vcpu); return 0; } int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int cs_db, cs_l, ret; cache_all_regs(vcpu); @@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, tss_selector, reason, has_error_code, @@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, if (ret) return EMULATE_FAIL; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); return EMULATE_DONE; } @@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits; struct desc_ptr dt; - vcpu_load(vcpu); - dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); @@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - vcpu_put(vcpu); - return 0; } @@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, unsigned long rflags; int i, r; - vcpu_load(vcpu); - if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; if (vcpu->arch.exception.pending) - goto unlock_out; + goto out; if (dbg->control & KVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); else @@ -5054,34 +5201,12 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, r = 0; -unlock_out: - vcpu_put(vcpu); +out: return r; } /* - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ -#else - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif -}; - -/* * Translate a guest virtual address to a guest physical address. */ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, @@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; int idx; - vcpu_load(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; tr->usermode = 0; - vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fpu->last_dp = fxsave->rdp; memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); - vcpu_put(vcpu); - return 0; } int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; - - vcpu_load(vcpu); + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fxsave->rdp = fpu->last_dp; memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); - vcpu_put(vcpu); - return 0; } -void fx_init(struct kvm_vcpu *vcpu) +int fx_init(struct kvm_vcpu *vcpu) { - unsigned after_mxcsr_mask; + int err; + + err = fpu_alloc(&vcpu->arch.guest_fpu); + if (err) + return err; + + fpu_finit(&vcpu->arch.guest_fpu); /* - * Touch the fpu the first time in non atomic context as if - * this is the first fpu instruction the exception handler - * will fire before the instruction returns and it'll have to - * allocate ram with GFP_KERNEL. + * Ensure guest xcr0 is valid for loading */ - if (!used_math()) - kvm_fx_save(&vcpu->arch.host_fx_image); - - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_finit(); - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); - preempt_enable(); + vcpu->arch.xcr0 = XSTATE_FP; vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); + + return 0; } EXPORT_SYMBOL_GPL(fx_init); +static void fx_free(struct kvm_vcpu *vcpu) +{ + fpu_free(&vcpu->arch.guest_fpu); +} + void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (vcpu->guest_fpu_loaded) return; + /* + * Restore all possible states in the guest, + * and assume host would use all available bits. + * Guest xcr0 would be loaded later. + */ + kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_restore(&vcpu->arch.guest_fx_image); + unlazy_fpu(current); + fpu_restore_checking(&vcpu->arch.guest_fpu); trace_kvm_fpu(1); } void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { + kvm_put_guest_xcr0(vcpu); + if (!vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 0; - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); + fpu_save_init(&vcpu->arch.guest_fpu); ++vcpu->stat.fpu_reload; - set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); trace_kvm_fpu(0); } @@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) vcpu->arch.time_page = NULL; } + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; - /* We do fxsave: this must be aligned. */ - BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); - vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); @@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) + goto fail_free_mce_banks; + return 0; +fail_free_mce_banks: + kfree(vcpu->arch.mce_banks); fail_free_lapic: kvm_free_lapic(vcpu); fail_mmu_destroy: @@ -5364,12 +5490,6 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); - kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); - if (!kvm->arch.aliases) { - kfree(kvm); - return ERR_PTR(-ENOMEM); - } - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_sync_events(struct kvm *kvm) { kvm_free_all_assigned_devices(kvm); + kvm_free_pit(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); - kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); @@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); cleanup_srcu_struct(&kvm->srcu); - kfree(kvm->arch.aliases); kfree(kvm); } @@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, int user_alloc) { int npages = memslot->npages; + int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; + + /* Prevent internal slot pages from being moved by fork()/COW. */ + if (memslot->id >= KVM_MEMORY_SLOTS) + map_flags = MAP_SHARED | MAP_ANONYMOUS; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, + map_flags, 0); up_write(¤t->mm->mmap_sem); @@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + if (atomic_xchg(&vcpu->guest_mode, 0)) smp_send_reschedule(cpu); put_cpu(); } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f4b54458285b..b7a404722d2b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_PG); } -static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm) -{ - return rcu_dereference_check(kvm->arch.aliases, - srcu_read_lock_held(&kvm->srcu) - || lockdep_is_held(&kvm->slots_lock)); -} - void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); |