14 files changed, 1059 insertions, 1072 deletions
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c02343594b4d..d3ec292f00f2 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,8 +7,8 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 ifeq ($(CONFIG_KVM_TRACE),y)
 common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 endif
-ifeq ($(CONFIG_DMAR),y)
-common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+ifeq ($(CONFIG_IOMMU_API),y)
+common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
 endif
 
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 59ebd37ad79e..e665d1c623ca 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm)
 
 static void __inject_pit_timer_intr(struct kvm *kvm)
 {
+	struct kvm_vcpu *vcpu;
+	int i;
+
 	mutex_lock(&kvm->lock);
 	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
 	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
 	mutex_unlock(&kvm->lock);
+
+	/*
+	 * Provides NMI watchdog support via Virtual Wire mode.
+	 * The route is: PIT -> PIC -> LVT0 in NMI mode.
+	 *
+	 * Note: Our Virtual Wire implementation is simplified, only
+	 * propagating PIT interrupts to all VCPUs when they have set
+	 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+	 * VCPU0, and only if its LVT0 is in EXTINT mode.
+	 */
+	if (kvm->arch.vapics_in_nmi_mode > 0)
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			if (vcpu)
+				kvm_apic_nmi_wd_deliver(vcpu);
+		}
 }
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 17e41e165f1a..179dcb0103fd 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,10 +26,40 @@
  *   Port from Qemu.
  */
 #include <linux/mm.h>
+#include <linux/bitops.h>
 #include "irq.h"
 
 #include <linux/kvm_host.h>
 
+static void pic_lock(struct kvm_pic *s)
+{
+	spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+{
+	struct kvm *kvm = s->kvm;
+	unsigned acks = s->pending_acks;
+	bool wakeup = s->wakeup_needed;
+	struct kvm_vcpu *vcpu;
+
+	s->pending_acks = 0;
+	s->wakeup_needed = false;
+
+	spin_unlock(&s->lock);
+
+	while (acks) {
+		kvm_notify_acked_irq(kvm, __ffs(acks));
+		acks &= acks - 1;
+	}
+
+	if (wakeup) {
+		vcpu = s->kvm->vcpus[0];
+		if (vcpu)
+			kvm_vcpu_kick(vcpu);
+	}
+}
+
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
 	s->isr &= ~(1 << irq);
@@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s)
 
 void kvm_pic_update_irq(struct kvm_pic *s)
 {
+	pic_lock(s);
 	pic_update_irq(s);
+	pic_unlock(s);
 }
 
 void kvm_pic_set_irq(void *opaque, int irq, int level)
 {
 	struct kvm_pic *s = opaque;
 
+	pic_lock(s);
 	if (irq >= 0 && irq < PIC_NUM_PINS) {
 		pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
 		pic_update_irq(s);
 	}
+	pic_unlock(s);
 }
 
 /*
@@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 	int irq, irq2, intno;
 	struct kvm_pic *s = pic_irqchip(kvm);
 
+	pic_lock(s);
 	irq = pic_get_irq(&s->pics[0]);
 	if (irq >= 0) {
 		pic_intack(&s->pics[0], irq);
@@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 		intno = s->pics[0].irq_base + irq;
 	}
 	pic_update_irq(s);
+	pic_unlock(s);
 	kvm_notify_acked_irq(kvm, irq);
 
 	return intno;
@@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	int irq, irqbase;
+	int irq, irqbase, n;
 	struct kvm *kvm = s->pics_state->irq_request_opaque;
 	struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
 
@@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 
 	for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
 		if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
-			if (s->irr & (1 << irq) || s->isr & (1 << irq))
-				kvm_notify_acked_irq(kvm, irq+irqbase);
+			if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
+				n = irq + irqbase;
+				s->pics_state->pending_acks |= 1 << n;
+			}
 	}
 	s->last_irr = 0;
 	s->irr = 0;
@@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte write\n");
 		return;
 	}
+	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
@@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this,
 		elcr_ioport_write(&s->pics[addr & 1], addr, data);
 		break;
 	}
+	pic_unlock(s);
 }
 
 static void picdev_read(struct kvm_io_device *this,
@@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte read\n");
 		return;
 	}
+	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
@@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this,
 		break;
 	}
 	*(unsigned char *)val = data;
+	pic_unlock(s);
 }
 
 /*
@@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level)
 	s->output = level;
 	if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
 		s->pics[0].isr_ack &= ~(1 << irq);
-		kvm_vcpu_kick(vcpu);
+		s->wakeup_needed = true;
 	}
 }
 
@@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
 	if (!s)
 		return NULL;
+	spin_lock_init(&s->lock);
+	s->kvm = kvm;
 	s->pics[0].elcr_mask = 0xf8;
 	s->pics[1].elcr_mask = 0xde;
 	s->irq_request = pic_irq_request;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index f17c8f5bbf31..2bf32a03ceec 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -25,6 +25,7 @@
 #include <linux/mm_types.h>
 #include <linux/hrtimer.h>
 #include <linux/kvm_host.h>
+#include <linux/spinlock.h>
 
 #include "iodev.h"
 #include "ioapic.h"
@@ -59,6 +60,10 @@ struct kvm_kpic_state {
 };
 
 struct kvm_pic {
+	spinlock_t lock;
+	bool wakeup_needed;
+	unsigned pending_acks;
+	struct kvm *kvm;
 	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
 	irq_request_func *irq_request;
 	void *irq_request_opaque;
@@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 65ef0fc2c036..8e5ee99551f6 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -7,7 +7,7 @@
 #include <linux/kvm_host.h>
 #include <asm/msr.h>
 
-#include "svm.h"
+#include <asm/svm.h>
 
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0fc3cab48943..afac68c0815c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic)
 	return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
 }
 
+static inline int apic_lvt_nmi_mode(u32 lvt_val)
+{
+	return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
+}
+
 static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
 	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
 	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
@@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 
 	case APIC_DM_NMI:
 		kvm_inject_nmi(vcpu);
+		kvm_vcpu_kick(vcpu);
 		break;
 
 	case APIC_DM_INIT:
@@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		}
 		break;
 
+	case APIC_DM_EXTINT:
+		/*
+		 * Should only be called by kvm_apic_local_deliver() with LVT0,
+		 * before NMI watchdog was enabled. Already handled by
+		 * kvm_apic_accept_pic_intr().
+		 */
+		break;
+
 	default:
 		printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
 		       delivery_mode);
@@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic)
 					apic->timer.period)));
 }
 
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+	int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+
+	if (apic_lvt_nmi_mode(lvt0_val)) {
+		if (!nmi_wd_enabled) {
+			apic_debug("Receive NMI setting on APIC_LVT0 "
+				   "for cpu %d\n", apic->vcpu->vcpu_id);
+			apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
+		}
+	} else if (nmi_wd_enabled)
+		apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+}
+
 static void apic_mmio_write(struct kvm_io_device *this,
 			    gpa_t address, int len, const void *data)
 {
@@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this,
 		apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
 		break;
 
+	case APIC_LVT0:
+		apic_manage_nmi_watchdog(apic, val);
 	case APIC_LVTT:
 	case APIC_LVTTHMR:
 	case APIC_LVTPC:
-	case APIC_LVT0:
 	case APIC_LVT1:
 	case APIC_LVTERR:
 		/* TODO: Check vector */
@@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int __inject_apic_timer_irq(struct kvm_lapic *apic)
+static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+{
+	u32 reg = apic_get_reg(apic, lvt_type);
+	int vector, mode, trig_mode;
+
+	if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+		vector = reg & APIC_VECTOR_MASK;
+		mode = reg & APIC_MODE_MASK;
+		trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
+		return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+	}
+	return 0;
+}
+
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
 {
-	int vector;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	vector = apic_lvt_vector(apic, APIC_LVTT);
-	return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
+	if (apic)
+		kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 
 static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
@@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
-		atomic_read(&apic->timer.pending) > 0) {
-		if (__inject_apic_timer_irq(apic))
+	if (apic && atomic_read(&apic->timer.pending) > 0) {
+		if (kvm_apic_local_deliver(apic, APIC_LVTT))
 			atomic_dec(&apic->timer.pending);
 	}
 }
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 410ddbc1aa2e..83f11c7474a1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -17,7 +17,6 @@
  *
  */
 
-#include "vmx.h"
 #include "mmu.h"
 
 #include <linux/kvm_host.h>
@@ -33,6 +32,7 @@
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
+#include <asm/vmx.h>
 
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
@@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mt_mask;
 
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 {
@@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
 EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask)
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
 {
 	shadow_user_mask = user_mask;
 	shadow_accessed_mask = accessed_mask;
 	shadow_dirty_mask = dirty_mask;
 	shadow_nx_mask = nx_mask;
 	shadow_x_mask = x_mask;
+	shadow_mt_mask = mt_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 {
 	int *write_count;
 
-	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	gfn = unalias_gfn(kvm, gfn);
+	write_count = slot_largepage_idx(gfn,
+					 gfn_to_memslot_unaliased(kvm, gfn));
 	*write_count += 1;
 }
 
@@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 {
 	int *write_count;
 
-	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	gfn = unalias_gfn(kvm, gfn);
+	write_count = slot_largepage_idx(gfn,
+					 gfn_to_memslot_unaliased(kvm, gfn));
 	*write_count -= 1;
 	WARN_ON(*write_count < 0);
 }
 
 static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
 {
-	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	struct kvm_memory_slot *slot;
 	int *largepage_idx;
 
+	gfn = unalias_gfn(kvm, gfn);
+	slot = gfn_to_memslot_unaliased(kvm, gfn);
 	if (slot) {
 		largepage_idx = slot_largepage_idx(gfn, slot);
 		return *largepage_idx;
@@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 	return NULL;
 }
 
-static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	unsigned long *rmapp;
 	u64 *spte;
@@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		spte = rmap_next(kvm, rmapp, spte);
 	}
 
-	if (write_protected)
-		kvm_flush_remote_tlbs(kvm);
+	return write_protected;
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&sp->oos_link);
 	ASSERT(is_empty_shadow_page(sp->spt));
-	sp->slot_bitmap = 0;
+	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
+	sp->global = 1;
 	sp->parent_pte = parent_pte;
 	--vcpu->kvm->arch.n_free_mmu_pages;
 	return sp;
@@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte)
 	struct kvm_mmu_page *sp = page_header(__pa(spte));
 
 	index = spte - sp->spt;
-	__set_bit(index, sp->unsync_child_bitmap);
-	sp->unsync_children = 1;
+	if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
+		sp->unsync_children++;
+	WARN_ON(!sp->unsync_children);
 }
 
 static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
@@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
 
 static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	sp->unsync_children = 1;
 	kvm_mmu_update_parents_unsync(sp);
 	return 1;
 }
@@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 }
 
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+	struct mmu_page_and_offset {
+		struct kvm_mmu_page *sp;
+		unsigned int idx;
+	} page[KVM_PAGE_ARRAY_NR];
+	unsigned int nr;
+};
+
 #define for_each_unsync_children(bitmap, idx)		\
 	for (idx = find_first_bit(bitmap, 512);		\
 	     idx < 512;					\
 	     idx = find_next_bit(bitmap, 512, idx+1))
 
-static int mmu_unsync_walk(struct kvm_mmu_page *sp,
-			   struct kvm_unsync_walk *walker)
+int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+		   int idx)
 {
-	int i, ret;
+	int i;
 
-	if (!sp->unsync_children)
-		return 0;
+	if (sp->unsync)
+		for (i=0; i < pvec->nr; i++)
+			if (pvec->page[i].sp == sp)
+				return 0;
+
+	pvec->page[pvec->nr].sp = sp;
+	pvec->page[pvec->nr].idx = idx;
+	pvec->nr++;
+	return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+			   struct kvm_mmu_pages *pvec)
+{
+	int i, ret, nr_unsync_leaf = 0;
 
 	for_each_unsync_children(sp->unsync_child_bitmap, i) {
 		u64 ent = sp->spt[i];
 
-		if (is_shadow_present_pte(ent)) {
+		if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
 			struct kvm_mmu_page *child;
 			child = page_header(ent & PT64_BASE_ADDR_MASK);
 
 			if (child->unsync_children) {
-				ret = mmu_unsync_walk(child, walker);
-				if (ret)
+				if (mmu_pages_add(pvec, child, i))
+					return -ENOSPC;
+
+				ret = __mmu_unsync_walk(child, pvec);
+				if (!ret)
+					__clear_bit(i, sp->unsync_child_bitmap);
+				else if (ret > 0)
+					nr_unsync_leaf += ret;
+				else
 					return ret;
-				__clear_bit(i, sp->unsync_child_bitmap);
 			}
 
 			if (child->unsync) {
-				ret = walker->entry(child, walker);
-				__clear_bit(i, sp->unsync_child_bitmap);
-				if (ret)
-					return ret;
+				nr_unsync_leaf++;
+				if (mmu_pages_add(pvec, child, i))
+					return -ENOSPC;
 			}
 		}
 	}
@@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 	if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
 		sp->unsync_children = 0;
 
-	return 0;
+	return nr_unsync_leaf;
+}
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+			   struct kvm_mmu_pages *pvec)
+{
+	if (!sp->unsync_children)
+		return 0;
+
+	mmu_pages_add(pvec, sp, 0);
+	return __mmu_unsync_walk(sp, pvec);
 }
 
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
@@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	return NULL;
 }
 
+static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	list_del(&sp->oos_link);
+	--kvm->stat.mmu_unsync_global;
+}
+
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	WARN_ON(!sp->unsync);
 	sp->unsync = 0;
+	if (sp->global)
+		kvm_unlink_unsync_global(kvm, sp);
 	--kvm->stat.mmu_unsync;
 }
 
@@ -1037,7 +1092,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		return 1;
 	}
 
-	rmap_write_protect(vcpu->kvm, sp->gfn);
+	if (rmap_write_protect(vcpu->kvm, sp->gfn))
+		kvm_flush_remote_tlbs(vcpu->kvm);
 	kvm_unlink_unsync_page(vcpu->kvm, sp);
 	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
 		kvm_mmu_zap_page(vcpu->kvm, sp);
@@ -1048,30 +1104,89 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	return 0;
 }
 
-struct sync_walker {
-	struct kvm_vcpu *vcpu;
-	struct kvm_unsync_walk walker;
+struct mmu_page_path {
+	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
+	unsigned int idx[PT64_ROOT_LEVEL-1];
 };
 
-static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+#define for_each_sp(pvec, sp, parents, i)			\
+		for (i = mmu_pages_next(&pvec, &parents, -1),	\
+			sp = pvec.page[i].sp;			\
+			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
+			i = mmu_pages_next(&pvec, &parents, i))
+
+int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
+		   int i)
 {
-	struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
-						     walker);
-	struct kvm_vcpu *vcpu = sync_walk->vcpu;
+	int n;
 
-	kvm_sync_page(vcpu, sp);
-	return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+	for (n = i+1; n < pvec->nr; n++) {
+		struct kvm_mmu_page *sp = pvec->page[n].sp;
+
+		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+			parents->idx[0] = pvec->page[n].idx;
+			return n;
+		}
+
+		parents->parent[sp->role.level-2] = sp;
+		parents->idx[sp->role.level-1] = pvec->page[n].idx;
+	}
+
+	return n;
 }
 
-static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+void mmu_pages_clear_parents(struct mmu_page_path *parents)
 {
-	struct sync_walker walker = {
-		.walker = { .entry = mmu_sync_fn, },
-		.vcpu = vcpu,
-	};
+	struct kvm_mmu_page *sp;
+	unsigned int level = 0;
+
+	do {
+		unsigned int idx = parents->idx[level];
+
+		sp = parents->parent[level];
+		if (!sp)
+			return;
+
+		--sp->unsync_children;
+		WARN_ON((int)sp->unsync_children < 0);
+		__clear_bit(idx, sp->unsync_child_bitmap);
+		level++;
+	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
+}
+
+static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
+			       struct mmu_page_path *parents,
+			       struct kvm_mmu_pages *pvec)
+{
+	parents->parent[parent->role.level-1] = NULL;
+	pvec->nr = 0;
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu,
+			      struct kvm_mmu_page *parent)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+	struct mmu_page_path parents;
+	struct kvm_mmu_pages pages;
+
+	kvm_mmu_pages_init(parent, &parents, &pages);
+	while (mmu_unsync_walk(parent, &pages)) {
+		int protected = 0;
 
-	while (mmu_unsync_walk(sp, &walker.walker))
+		for_each_sp(pages, sp, parents, i)
+			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+
+		if (protected)
+			kvm_flush_remote_tlbs(vcpu->kvm);
+
+		for_each_sp(pages, sp, parents, i) {
+			kvm_sync_page(vcpu, sp);
+			mmu_pages_clear_parents(&parents);
+		}
 		cond_resched_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_pages_init(parent, &parents, &pages);
+	}
 }
 
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	sp->role = role;
 	hlist_add_head(&sp->hash_link, bucket);
 	if (!metaphysical) {
-		rmap_write_protect(vcpu->kvm, gfn);
+		if (rmap_write_protect(vcpu->kvm, gfn))
+			kvm_flush_remote_tlbs(vcpu->kvm);
 		account_shadowed(vcpu->kvm, gfn);
 	}
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker,
 	if (level == PT32E_ROOT_LEVEL) {
 		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
 		shadow_addr &= PT64_BASE_ADDR_MASK;
+		if (!shadow_addr)
+			return 1;
 		--level;
 	}
 
@@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 }
 
-struct zap_walker {
-	struct kvm_unsync_walk walker;
-	struct kvm *kvm;
-	int zapped;
-};
-
-static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+static int mmu_zap_unsync_children(struct kvm *kvm,
+				   struct kvm_mmu_page *parent)
 {
-	struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
-						     walker);
-	kvm_mmu_zap_page(zap_walk->kvm, sp);
-	zap_walk->zapped = 1;
-	return 0;
-}
+	int i, zapped = 0;
+	struct mmu_page_path parents;
+	struct kvm_mmu_pages pages;
 
-static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-	struct zap_walker walker = {
-		.walker = { .entry = mmu_zap_fn, },
-		.kvm = kvm,
-		.zapped = 0,
-	};
-
-	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
 		return 0;
-	mmu_unsync_walk(sp, &walker.walker);
-	return walker.zapped;
+
+	kvm_mmu_pages_init(parent, &parents, &pages);
+	while (mmu_unsync_walk(parent, &pages)) {
+		struct kvm_mmu_page *sp;
+
+		for_each_sp(pages, sp, parents, i) {
+			kvm_mmu_zap_page(kvm, sp);
+			mmu_pages_clear_parents(&parents);
+		}
+		zapped += pages.nr;
+		kvm_mmu_pages_init(parent, &parents, &pages);
+	}
+
+	return zapped;
 }
 
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
 	struct kvm_mmu_page *sp = page_header(__pa(pte));
 
-	__set_bit(slot, &sp->slot_bitmap);
+	__set_bit(slot, sp->slot_bitmap);
 }
 
 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
@@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	return page;
 }
 
+/*
+ * The function is based on mtrr_type_lookup() in
+ * arch/x86/kernel/cpu/mtrr/generic.c
+ */
+static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
+			 u64 start, u64 end)
+{
+	int i;
+	u64 base, mask;
+	u8 prev_match, curr_match;
+	int num_var_ranges = KVM_NR_VAR_MTRR;
+
+	if (!mtrr_state->enabled)
+		return 0xFF;
+
+	/* Make end inclusive end, instead of exclusive */
+	end--;
+
+	/* Look in fixed ranges. Just return the type as per start */
+	if (mtrr_state->have_fixed && (start < 0x100000)) {
+		int idx;
+
+		if (start < 0x80000) {
+			idx = 0;
+			idx += (start >> 16);
+			return mtrr_state->fixed_ranges[idx];
+		} else if (start < 0xC0000) {
+			idx = 1 * 8;
+			idx += ((start - 0x80000) >> 14);
+			return mtrr_state->fixed_ranges[idx];
+		} else if (start < 0x1000000) {
+			idx = 3 * 8;
+			idx += ((start - 0xC0000) >> 12);
+			return mtrr_state->fixed_ranges[idx];
+		}
+	}
+
+	/*
+	 * Look in variable ranges
+	 * Look of multiple ranges matching this address and pick type
+	 * as per MTRR precedence
+	 */
+	if (!(mtrr_state->enabled & 2))
+		return mtrr_state->def_type;
+
+	prev_match = 0xFF;
+	for (i = 0; i < num_var_ranges; ++i) {
+		unsigned short start_state, end_state;
+
+		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
+			continue;
+
+		base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
+		       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
+		mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
+		       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
+
+		start_state = ((start & mask) == (base & mask));
+		end_state = ((end & mask) == (base & mask));
+		if (start_state != end_state)
+			return 0xFE;
+
+		if ((start & mask) != (base & mask))
+			continue;
+
+		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
+		if (prev_match == 0xFF) {
+			prev_match = curr_match;
+			continue;
+		}
+
+		if (prev_match == MTRR_TYPE_UNCACHABLE ||
+		    curr_match == MTRR_TYPE_UNCACHABLE)
+			return MTRR_TYPE_UNCACHABLE;
+
+		if ((prev_match == MTRR_TYPE_WRBACK &&
+		     curr_match == MTRR_TYPE_WRTHROUGH) ||
+		    (prev_match == MTRR_TYPE_WRTHROUGH &&
+		     curr_match == MTRR_TYPE_WRBACK)) {
+			prev_match = MTRR_TYPE_WRTHROUGH;
+			curr_match = MTRR_TYPE_WRTHROUGH;
+		}
+
+		if (prev_match != curr_match)
+			return MTRR_TYPE_UNCACHABLE;
+	}
+
+	if (prev_match != 0xFF)
+		return prev_match;
+
+	return mtrr_state->def_type;
+}
+
+static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	u8 mtrr;
+
+	mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
+			     (gfn << PAGE_SHIFT) + PAGE_SIZE);
+	if (mtrr == 0xfe || mtrr == 0xff)
+		mtrr = MTRR_TYPE_WRBACK;
+	return mtrr;
+}
+
 static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	unsigned index;
@@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (s->role.word != sp->role.word)
 			return 1;
 	}
-	kvm_mmu_mark_parents_unsync(vcpu, sp);
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
+
+	if (sp->global) {
+		list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
+		++vcpu->kvm->stat.mmu_unsync_global;
+	} else
+		kvm_mmu_mark_parents_unsync(vcpu, sp);
+
 	mmu_convert_notrap(sp);
 	return 0;
 }
@@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		    unsigned pte_access, int user_fault,
 		    int write_fault, int dirty, int largepage,
-		    gfn_t gfn, pfn_t pfn, bool speculative,
+		    int global, gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync)
 {
 	u64 spte;
 	int ret = 0;
+	u64 mt_mask = shadow_mt_mask;
+	struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
+
+	if (!(vcpu->arch.cr4 & X86_CR4_PGE))
+		global = 0;
+	if (!global && sp->global) {
+		sp->global = 0;
+		if (sp->unsync) {
+			kvm_unlink_unsync_global(vcpu->kvm, sp);
+			kvm_mmu_mark_parents_unsync(vcpu, sp);
+		}
+	}
+
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		spte |= shadow_user_mask;
 	if (largepage)
 		spte |= PT_PAGE_SIZE_MASK;
+	if (mt_mask) {
+		mt_mask = get_memory_type(vcpu, gfn) <<
+			  kvm_x86_ops->get_mt_mask_shift();
+		spte |= mt_mask;
+	}
 
 	spte |= (u64)pfn << PAGE_SHIFT;
 
@@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 		spte |= PT_WRITABLE_MASK;
 
+		/*
+		 * Optimization: for pte sync, if spte was writable the hash
+		 * lookup is unnecessary (and expensive). Write protection
+		 * is responsibility of mmu_get_page / kvm_sync_page.
+		 * Same reasoning can be applied to dirty page accounting.
+		 */
+		if (!can_unsync && is_writeble_pte(*shadow_pte))
+			goto set_pte;
+
 		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);
@@ -1495,8 +1746,8 @@ set_pte:
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+			 int *ptwrite, int largepage, int global,
+			 gfn_t gfn, pfn_t pfn, bool speculative)
 {
 	int was_rmapped = 0;
 	int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, gfn, pfn, speculative, true)) {
+		      dirty, largepage, global, gfn, pfn, speculative, true)) {
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
 	    || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
 		mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
 			     0, walk->write, 1, &walk->pt_write,
-			     walk->largepage, gfn, walk->pfn, false);
+			     walk->largepage, 0, gfn, walk->pfn, false);
 		++vcpu->stat.pf_fixed;
 		return 1;
 	}
@@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_mmu_page *sp, *n;
+
+	list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
+		kvm_sync_page(vcpu, sp);
+}
+
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	spin_lock(&vcpu->kvm->mmu_lock);
@@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	mmu_sync_global(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	return vaddr;
@@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes)
+		       const u8 *new, int bytes,
+		       bool guest_initiated)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	struct kvm_mmu_page *sp;
@@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
 	kvm_mmu_audit(vcpu, "pre pte write");
-	if (gfn == vcpu->arch.last_pt_write_gfn
-	    && !last_updated_pte_accessed(vcpu)) {
-		++vcpu->arch.last_pt_write_count;
-		if (vcpu->arch.last_pt_write_count >= 3)
-			flooded = 1;
-	} else {
-		vcpu->arch.last_pt_write_gfn = gfn;
-		vcpu->arch.last_pt_write_count = 1;
-		vcpu->arch.last_pte_updated = NULL;
+	if (guest_initiated) {
+		if (gfn == vcpu->arch.last_pt_write_gfn
+		    && !last_updated_pte_accessed(vcpu)) {
+			++vcpu->arch.last_pt_write_count;
+			if (vcpu->arch.last_pt_write_count >= 3)
+				flooded = 1;
+		} else {
+			vcpu->arch.last_pt_write_gfn = gfn;
+			vcpu->arch.last_pt_write_count = 1;
+			vcpu->arch.last_pte_updated = NULL;
+		}
 	}
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
@@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
-	spin_lock(&vcpu->kvm->mmu_lock);
 	vcpu->arch.mmu.invlpg(vcpu, gva);
-	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_flush_tlb(vcpu);
 	++vcpu->stat.invlpg;
 }
@@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 		int i;
 		u64 *pt;
 
-		if (!test_bit(slot, &sp->slot_bitmap))
+		if (!test_bit(slot, sp->slot_bitmap))
 			continue;
 
 		pt = sp->spt;
@@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 		if (sp->role.metaphysical)
 			continue;
 
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
 		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+		slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
 		rmapp = &slot->rmap[gfn - slot->base_gfn];
 		if (*rmapp)
 			printk(KERN_ERR "%s: (%s) shadow page has writable"
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 84eee43bbe74..9fd78b6e17ad 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -82,6 +82,7 @@ struct shadow_walker {
 	int *ptwrite;
 	pfn_t pfn;
 	u64 *sptep;
+	gpa_t pte_gpa;
 };
 
 static gfn_t gpte_to_gfn(pt_element_t gpte)
@@ -222,7 +223,7 @@ walk:
 		if (ret)
 			goto walk;
 		pte |= PT_DIRTY_MASK;
-		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
 		walker->ptes[walker->level - 1] = pte;
 	}
 
@@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 		return;
 	kvm_get_pfn(pfn);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+		     gpte & PT_DIRTY_MASK, NULL, largepage,
+		     gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
 		     pfn, true);
 }
 
@@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
 		mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
 			     sw->user_fault, sw->write_fault,
 			     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
-			     sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
-			     false);
+			     sw->ptwrite, sw->largepage,
+			     gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
+			     gw->gfn, sw->pfn, false);
 		sw->sptep = sptep;
 		return 1;
 	}
@@ -466,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
 				      struct kvm_vcpu *vcpu, u64 addr,
 				      u64 *sptep, int level)
 {
+	struct shadow_walker *sw =
+		container_of(_sw, struct shadow_walker, walker);
 
-	if (level == PT_PAGE_TABLE_LEVEL) {
-		if (is_shadow_present_pte(*sptep))
+	/* FIXME: properly handle invlpg on large guest pages */
+	if (level == PT_PAGE_TABLE_LEVEL ||
+	    ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
+		struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+		sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
+		sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+		if (is_shadow_present_pte(*sptep)) {
 			rmap_remove(vcpu->kvm, sptep);
+			if (is_large_pte(*sptep))
+				--vcpu->kvm->stat.lpages;
+		}
 		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
 		return 1;
 	}
@@ -480,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
 
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
+	pt_element_t gpte;
 	struct shadow_walker walker = {
 		.walker = { .entry = FNAME(shadow_invlpg_entry), },
+		.pte_gpa = -1,
 	};
 
+	spin_lock(&vcpu->kvm->mmu_lock);
 	walk_shadow(&walker.walker, vcpu, gva);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	if (walker.pte_gpa == -1)
+		return;
+	if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
+				  sizeof(pt_element_t)))
+		return;
+	if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+		if (mmu_topup_memory_caches(vcpu))
+			return;
+		kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
+				  sizeof(pt_element_t), 0);
+	}
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -580,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		nr_present++;
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-			 is_dirty_pte(gpte), 0, gfn,
+			 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
 			 spte_to_pfn(sp->spt[i]), true, false);
 	}
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9c4ce657d963..1452851ae258 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,8 @@
 
 #include <asm/desc.h>
 
+#include <asm/virtext.h>
+
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
 MODULE_AUTHOR("Qumranet");
@@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 
 static int has_svm(void)
 {
-	uint32_t eax, ebx, ecx, edx;
-
-	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
-		printk(KERN_INFO "has_svm: not amd\n");
-		return 0;
-	}
+	const char *msg;
 
-	cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
-	if (eax < SVM_CPUID_FUNC) {
-		printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
+	if (!cpu_has_svm(&msg)) {
+		printk(KERN_INFO "has_svn: %s\n", msg);
 		return 0;
 	}
 
-	cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
-	if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
-		printk(KERN_DEBUG "has_svm: svm not available\n");
-		return 0;
-	}
 	return 1;
 }
 
 static void svm_hardware_disable(void *garbage)
 {
-	uint64_t efer;
-
-	wrmsrl(MSR_VM_HSAVE_PA, 0);
-	rdmsrl(MSR_EFER, efer);
-	wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+	cpu_svm_disable();
 }
 
 static void svm_hardware_enable(void *garbage)
@@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
 	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
 	var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+
+	/*
+	 * SVM always stores 0 for the 'G' bit in the CS selector in
+	 * the VMCB on a VMEXIT. This hurts cross-vendor migration:
+	 * Intel's VMENTRY has a check on the 'G' bit.
+	 */
+	if (seg == VCPU_SREG_CS)
+		var->g = s->limit > 0xfffff;
+
+	/*
+	 * Work around a bug where the busy flag in the tr selector
+	 * isn't exposed
+	 */
+	if (seg == VCPU_SREG_TR)
+		var->type |= 0x2;
+
 	var->unusable = !var->present;
 }
 
@@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	rep = (io_info & SVM_IOIO_REP_MASK) != 0;
 	down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
 
+	skip_emulated_instruction(&svm->vcpu);
 	return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
 }
 
@@ -1912,6 +1916,11 @@ static int get_npt_level(void)
 #endif
 }
 
+static int svm_get_mt_mask_shift(void)
+{
+	return 0;
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
@@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
+	.get_mt_mask_shift = svm_get_mt_mask_shift,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h
deleted file mode 100644
index 1b8afa78e869..000000000000
--- a/arch/x86/kvm/svm.h
+++ /dev/null
@@ -1,328 +0,0 @@
-#ifndef __SVM_H
-#define __SVM_H
-
-enum {
-	INTERCEPT_INTR,
-	INTERCEPT_NMI,
-	INTERCEPT_SMI,
-	INTERCEPT_INIT,
-	INTERCEPT_VINTR,
-	INTERCEPT_SELECTIVE_CR0,
-	INTERCEPT_STORE_IDTR,
-	INTERCEPT_STORE_GDTR,
-	INTERCEPT_STORE_LDTR,
-	INTERCEPT_STORE_TR,
-	INTERCEPT_LOAD_IDTR,
-	INTERCEPT_LOAD_GDTR,
-	INTERCEPT_LOAD_LDTR,
-	INTERCEPT_LOAD_TR,
-	INTERCEPT_RDTSC,
-	INTERCEPT_RDPMC,
-	INTERCEPT_PUSHF,
-	INTERCEPT_POPF,
-	INTERCEPT_CPUID,
-	INTERCEPT_RSM,
-	INTERCEPT_IRET,
-	INTERCEPT_INTn,
-	INTERCEPT_INVD,
-	INTERCEPT_PAUSE,
-	INTERCEPT_HLT,
-	INTERCEPT_INVLPG,
-	INTERCEPT_INVLPGA,
-	INTERCEPT_IOIO_PROT,
-	INTERCEPT_MSR_PROT,
-	INTERCEPT_TASK_SWITCH,
-	INTERCEPT_FERR_FREEZE,
-	INTERCEPT_SHUTDOWN,
-	INTERCEPT_VMRUN,
-	INTERCEPT_VMMCALL,
-	INTERCEPT_VMLOAD,
-	INTERCEPT_VMSAVE,
-	INTERCEPT_STGI,
-	INTERCEPT_CLGI,
-	INTERCEPT_SKINIT,
-	INTERCEPT_RDTSCP,
-	INTERCEPT_ICEBP,
-	INTERCEPT_WBINVD,
-	INTERCEPT_MONITOR,
-	INTERCEPT_MWAIT,
-	INTERCEPT_MWAIT_COND,
-};
-
-
-struct __attribute__ ((__packed__)) vmcb_control_area {
-	u16 intercept_cr_read;
-	u16 intercept_cr_write;
-	u16 intercept_dr_read;
-	u16 intercept_dr_write;
-	u32 intercept_exceptions;
-	u64 intercept;
-	u8 reserved_1[44];
-	u64 iopm_base_pa;
-	u64 msrpm_base_pa;
-	u64 tsc_offset;
-	u32 asid;
-	u8 tlb_ctl;
-	u8 reserved_2[3];
-	u32 int_ctl;
-	u32 int_vector;
-	u32 int_state;
-	u8 reserved_3[4];
-	u32 exit_code;
-	u32 exit_code_hi;
-	u64 exit_info_1;
-	u64 exit_info_2;
-	u32 exit_int_info;
-	u32 exit_int_info_err;
-	u64 nested_ctl;
-	u8 reserved_4[16];
-	u32 event_inj;
-	u32 event_inj_err;
-	u64 nested_cr3;
-	u64 lbr_ctl;
-	u8 reserved_5[832];
-};
-
-
-#define TLB_CONTROL_DO_NOTHING 0
-#define TLB_CONTROL_FLUSH_ALL_ASID 1
-
-#define V_TPR_MASK 0x0f
-
-#define V_IRQ_SHIFT 8
-#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
-
-#define V_INTR_PRIO_SHIFT 16
-#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
-
-#define V_IGN_TPR_SHIFT 20
-#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
-
-#define V_INTR_MASKING_SHIFT 24
-#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
-
-#define SVM_INTERRUPT_SHADOW_MASK 1
-
-#define SVM_IOIO_STR_SHIFT 2
-#define SVM_IOIO_REP_SHIFT 3
-#define SVM_IOIO_SIZE_SHIFT 4
-#define SVM_IOIO_ASIZE_SHIFT 7
-
-#define SVM_IOIO_TYPE_MASK 1
-#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
-#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
-#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
-#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
-
-struct __attribute__ ((__packed__)) vmcb_seg {
-	u16 selector;
-	u16 attrib;
-	u32 limit;
-	u64 base;
-};
-
-struct __attribute__ ((__packed__)) vmcb_save_area {
-	struct vmcb_seg es;
-	struct vmcb_seg cs;
-	struct vmcb_seg ss;
-	struct vmcb_seg ds;
-	struct vmcb_seg fs;
-	struct vmcb_seg gs;
-	struct vmcb_seg gdtr;
-	struct vmcb_seg ldtr;
-	struct vmcb_seg idtr;
-	struct vmcb_seg tr;
-	u8 reserved_1[43];
-	u8 cpl;
-	u8 reserved_2[4];
-	u64 efer;
-	u8 reserved_3[112];
-	u64 cr4;
-	u64 cr3;
-	u64 cr0;
-	u64 dr7;
-	u64 dr6;
-	u64 rflags;
-	u64 rip;
-	u8 reserved_4[88];
-	u64 rsp;
-	u8 reserved_5[24];
-	u64 rax;
-	u64 star;
-	u64 lstar;
-	u64 cstar;
-	u64 sfmask;
-	u64 kernel_gs_base;
-	u64 sysenter_cs;
-	u64 sysenter_esp;
-	u64 sysenter_eip;
-	u64 cr2;
-	u8 reserved_6[32];
-	u64 g_pat;
-	u64 dbgctl;
-	u64 br_from;
-	u64 br_to;
-	u64 last_excp_from;
-	u64 last_excp_to;
-};
-
-struct __attribute__ ((__packed__)) vmcb {
-	struct vmcb_control_area control;
-	struct vmcb_save_area save;
-};
-
-#define SVM_CPUID_FEATURE_SHIFT 2
-#define SVM_CPUID_FUNC 0x8000000a
-
-#define MSR_EFER_SVME_MASK (1ULL << 12)
-#define MSR_VM_CR       0xc0010114
-#define MSR_VM_HSAVE_PA 0xc0010117ULL
-
-#define SVM_VM_CR_SVM_DISABLE 4
-
-#define SVM_SELECTOR_S_SHIFT 4
-#define SVM_SELECTOR_DPL_SHIFT 5
-#define SVM_SELECTOR_P_SHIFT 7
-#define SVM_SELECTOR_AVL_SHIFT 8
-#define SVM_SELECTOR_L_SHIFT 9
-#define SVM_SELECTOR_DB_SHIFT 10
-#define SVM_SELECTOR_G_SHIFT 11
-
-#define SVM_SELECTOR_TYPE_MASK (0xf)
-#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
-#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
-#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
-#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
-#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
-#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
-#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
-
-#define SVM_SELECTOR_WRITE_MASK (1 << 1)
-#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
-#define SVM_SELECTOR_CODE_MASK (1 << 3)
-
-#define INTERCEPT_CR0_MASK 1
-#define INTERCEPT_CR3_MASK (1 << 3)
-#define INTERCEPT_CR4_MASK (1 << 4)
-#define INTERCEPT_CR8_MASK (1 << 8)
-
-#define INTERCEPT_DR0_MASK 1
-#define INTERCEPT_DR1_MASK (1 << 1)
-#define INTERCEPT_DR2_MASK (1 << 2)
-#define INTERCEPT_DR3_MASK (1 << 3)
-#define INTERCEPT_DR4_MASK (1 << 4)
-#define INTERCEPT_DR5_MASK (1 << 5)
-#define INTERCEPT_DR6_MASK (1 << 6)
-#define INTERCEPT_DR7_MASK (1 << 7)
-
-#define SVM_EVTINJ_VEC_MASK 0xff
-
-#define SVM_EVTINJ_TYPE_SHIFT 8
-#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
-
-#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
-#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
-
-#define SVM_EVTINJ_VALID (1 << 31)
-#define SVM_EVTINJ_VALID_ERR (1 << 11)
-
-#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
-
-#define	SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
-#define	SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
-#define	SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
-#define	SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
-
-#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
-#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
-
-#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
-#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
-
-#define	SVM_EXIT_READ_CR0 	0x000
-#define	SVM_EXIT_READ_CR3 	0x003
-#define	SVM_EXIT_READ_CR4 	0x004
-#define	SVM_EXIT_READ_CR8 	0x008
-#define	SVM_EXIT_WRITE_CR0 	0x010
-#define	SVM_EXIT_WRITE_CR3 	0x013
-#define	SVM_EXIT_WRITE_CR4 	0x014
-#define	SVM_EXIT_WRITE_CR8 	0x018
-#define	SVM_EXIT_READ_DR0 	0x020
-#define	SVM_EXIT_READ_DR1 	0x021
-#define	SVM_EXIT_READ_DR2 	0x022
-#define	SVM_EXIT_READ_DR3 	0x023
-#define	SVM_EXIT_READ_DR4 	0x024
-#define	SVM_EXIT_READ_DR5 	0x025
-#define	SVM_EXIT_READ_DR6 	0x026
-#define	SVM_EXIT_READ_DR7 	0x027
-#define	SVM_EXIT_WRITE_DR0 	0x030
-#define	SVM_EXIT_WRITE_DR1 	0x031
-#define	SVM_EXIT_WRITE_DR2 	0x032
-#define	SVM_EXIT_WRITE_DR3 	0x033
-#define	SVM_EXIT_WRITE_DR4 	0x034
-#define	SVM_EXIT_WRITE_DR5 	0x035
-#define	SVM_EXIT_WRITE_DR6 	0x036
-#define	SVM_EXIT_WRITE_DR7 	0x037
-#define SVM_EXIT_EXCP_BASE      0x040
-#define SVM_EXIT_INTR		0x060
-#define SVM_EXIT_NMI		0x061
-#define SVM_EXIT_SMI		0x062
-#define SVM_EXIT_INIT		0x063
-#define SVM_EXIT_VINTR		0x064
-#define SVM_EXIT_CR0_SEL_WRITE	0x065
-#define SVM_EXIT_IDTR_READ	0x066
-#define SVM_EXIT_GDTR_READ	0x067
-#define SVM_EXIT_LDTR_READ	0x068
-#define SVM_EXIT_TR_READ	0x069
-#define SVM_EXIT_IDTR_WRITE	0x06a
-#define SVM_EXIT_GDTR_WRITE	0x06b
-#define SVM_EXIT_LDTR_WRITE	0x06c
-#define SVM_EXIT_TR_WRITE	0x06d
-#define SVM_EXIT_RDTSC		0x06e
-#define SVM_EXIT_RDPMC		0x06f
-#define SVM_EXIT_PUSHF		0x070
-#define SVM_EXIT_POPF		0x071
-#define SVM_EXIT_CPUID		0x072
-#define SVM_EXIT_RSM		0x073
-#define SVM_EXIT_IRET		0x074
-#define SVM_EXIT_SWINT		0x075
-#define SVM_EXIT_INVD		0x076
-#define SVM_EXIT_PAUSE		0x077
-#define SVM_EXIT_HLT		0x078
-#define SVM_EXIT_INVLPG		0x079
-#define SVM_EXIT_INVLPGA	0x07a
-#define SVM_EXIT_IOIO		0x07b
-#define SVM_EXIT_MSR		0x07c
-#define SVM_EXIT_TASK_SWITCH	0x07d
-#define SVM_EXIT_FERR_FREEZE	0x07e
-#define SVM_EXIT_SHUTDOWN	0x07f
-#define SVM_EXIT_VMRUN		0x080
-#define SVM_EXIT_VMMCALL	0x081
-#define SVM_EXIT_VMLOAD		0x082
-#define SVM_EXIT_VMSAVE		0x083
-#define SVM_EXIT_STGI		0x084
-#define SVM_EXIT_CLGI		0x085
-#define SVM_EXIT_SKINIT		0x086
-#define SVM_EXIT_RDTSCP		0x087
-#define SVM_EXIT_ICEBP		0x088
-#define SVM_EXIT_WBINVD		0x089
-#define SVM_EXIT_MONITOR	0x08a
-#define SVM_EXIT_MWAIT		0x08b
-#define SVM_EXIT_MWAIT_COND	0x08c
-#define SVM_EXIT_NPF  		0x400
-
-#define SVM_EXIT_ERR		-1
-
-#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
-
-#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
-#define SVM_VMRUN  ".byte 0x0f, 0x01, 0xd8"
-#define SVM_VMSAVE ".byte 0x0f, 0x01, 0xdb"
-#define SVM_CLGI   ".byte 0x0f, 0x01, 0xdd"
-#define SVM_STGI   ".byte 0x0f, 0x01, 0xdc"
-#define SVM_INVLPGA ".byte 0x0f, 0x01, 0xdf"
-
-#endif
-
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a4018b01e1f9..6259d7467648 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -16,7 +16,6 @@
  */
 
 #include "irq.h"
-#include "vmx.h"
 #include "mmu.h"
 
 #include <linux/kvm_host.h>
@@ -31,6 +30,8 @@
 
 #include <asm/io.h>
 #include <asm/desc.h>
+#include <asm/vmx.h>
+#include <asm/virtext.h>
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
@@ -90,6 +91,11 @@ struct vcpu_vmx {
 	} rmode;
 	int vpid;
 	bool emulation_required;
+
+	/* Support for vnmi-less CPUs */
+	int soft_vnmi_blocked;
+	ktime_t entry_time;
+	s64 vnmi_blocked_time;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -122,7 +128,7 @@ static struct vmcs_config {
 	u32 vmentry_ctrl;
 } vmcs_config;
 
-struct vmx_capability {
+static struct vmx_capability {
 	u32 ept;
 	u32 vpid;
 } vmx_capability;
@@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
 
 		break;
+	case MSR_IA32_CR_PAT:
+		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+			vmcs_write64(GUEST_IA32_PAT, data);
+			vcpu->arch.pat = data;
+			break;
+		}
+		/* Otherwise falls through to kvm_set_msr_common */
 	default:
 		vmx_load_host_state(vmx);
 		msr = find_msr_entry(vmx, msr_index);
@@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu)
 
 static __init int cpu_has_kvm_support(void)
 {
-	unsigned long ecx = cpuid_ecx(1);
-	return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+	return cpu_has_vmx();
 }
 
 static __init int vmx_disabled_by_bios(void)
@@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void)
 		__vcpu_clear(vmx);
 }
 
-static void hardware_disable(void *garbage)
+
+/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
+ * tricks.
+ */
+static void kvm_cpu_vmxoff(void)
 {
-	vmclear_local_vcpus();
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
 	write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 
+static void hardware_disable(void *garbage)
+{
+	vmclear_local_vcpus();
+	kvm_cpu_vmxoff();
+}
+
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
 				      u32 msr, u32 *result)
 {
@@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
-	opt = 0;
+	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
 				&_vmexit_control) < 0)
 		return -EIO;
 
-	min = opt = 0;
+	min = 0;
+	opt = VM_ENTRY_LOAD_IA32_PAT;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
 				&_vmentry_control) < 0)
 		return -EIO;
@@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
  */
 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
-	u32 host_sysenter_cs;
+	u32 host_sysenter_cs, msr_low, msr_high;
 	u32 junk;
+	u64 host_pat;
 	unsigned long a;
 	struct descriptor_table dt;
 	int i;
@@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	rdmsrl(MSR_IA32_SYSENTER_EIP, a);
 	vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
 
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+		host_pat = msr_low | ((u64) msr_high << 32);
+		vmcs_write64(HOST_IA32_PAT, host_pat);
+	}
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+		host_pat = msr_low | ((u64) msr_high << 32);
+		/* Write the default value follow host pat */
+		vmcs_write64(GUEST_IA32_PAT, host_pat);
+		/* Keep arch.pat sync with GUEST_IA32_PAT */
+		vmx->vcpu.arch.pat = host_pat;
+	}
+
 	for (i = 0; i < NR_VMX_MSR; ++i) {
 		u32 index = vmx_msr_index[i];
 		u32 data_low, data_high;
@@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	vmx->vcpu.arch.rmode.active = 0;
 
+	vmx->soft_vnmi_blocked = 0;
+
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
 	kvm_set_cr8(&vmx->vcpu, 0);
 	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2335,6 +2374,29 @@ out:
 	return ret;
 }
 
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+	u32 cpu_based_vm_exec_control;
+
+	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+	u32 cpu_based_vm_exec_control;
+
+	if (!cpu_has_virtual_nmis()) {
+		enable_irq_window(vcpu);
+		return;
+	}
+
+	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
 static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!cpu_has_virtual_nmis()) {
+		/*
+		 * Tracking the NMI-blocked state in software is built upon
+		 * finding the next open IRQ window. This, in turn, depends on
+		 * well-behaving guests: They have to keep IRQs disabled at
+		 * least as long as the NMI handler runs. Otherwise we may
+		 * cause NMI nesting, maybe breaking the guest. But as this is
+		 * highly unlikely, we can live with the residual risk.
+		 */
+		vmx->soft_vnmi_blocked = 1;
+		vmx->vnmi_blocked_time = 0;
+	}
+
+	++vcpu->stat.nmi_injections;
+	if (vcpu->arch.rmode.active) {
+		vmx->rmode.irq.pending = true;
+		vmx->rmode.irq.vector = NMI_VECTOR;
+		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			     NMI_VECTOR | INTR_TYPE_SOFT_INTR |
+			     INTR_INFO_VALID_MASK);
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+		return;
+	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
 
+static void vmx_update_window_states(struct kvm_vcpu *vcpu)
+{
+	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+
+	vcpu->arch.nmi_window_open =
+		!(guest_intr & (GUEST_INTR_STATE_STI |
+				GUEST_INTR_STATE_MOV_SS |
+				GUEST_INTR_STATE_NMI));
+	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+		vcpu->arch.nmi_window_open = 0;
+
+	vcpu->arch.interrupt_window_open =
+		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+		 !(guest_intr & (GUEST_INTR_STATE_STI |
+				 GUEST_INTR_STATE_MOV_SS)));
+}
+
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 {
 	int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 	kvm_queue_interrupt(vcpu, irq);
 }
 
-
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 				       struct kvm_run *kvm_run)
 {
-	u32 cpu_based_vm_exec_control;
-
-	vcpu->arch.interrupt_window_open =
-		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
+	vmx_update_window_states(vcpu);
 
-	if (vcpu->arch.interrupt_window_open &&
-	    vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
-		kvm_do_inject_irq(vcpu);
+	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+		if (vcpu->arch.interrupt.pending) {
+			enable_nmi_window(vcpu);
+		} else if (vcpu->arch.nmi_window_open) {
+			vcpu->arch.nmi_pending = false;
+			vcpu->arch.nmi_injected = true;
+		} else {
+			enable_nmi_window(vcpu);
+			return;
+		}
+	}
+	if (vcpu->arch.nmi_injected) {
+		vmx_inject_nmi(vcpu);
+		if (vcpu->arch.nmi_pending)
+			enable_nmi_window(vcpu);
+		else if (vcpu->arch.irq_summary
+			 || kvm_run->request_interrupt_window)
+			enable_irq_window(vcpu);
+		return;
+	}
 
-	if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
-		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+	if (vcpu->arch.interrupt_window_open) {
+		if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
+			kvm_do_inject_irq(vcpu);
 
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+		if (vcpu->arch.interrupt.pending)
+			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+	}
 	if (!vcpu->arch.interrupt_window_open &&
 	    (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
-		/*
-		 * Interrupts blocked.  Wait for unblock.
-		 */
-		cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-	else
-		cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+		enable_irq_window(vcpu);
 }
 
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 {
 	int ret;
 	struct kvm_userspace_memory_region tss_mem = {
-		.slot = 8,
+		.slot = TSS_PRIVATE_MEMSLOT,
 		.guest_phys_addr = addr,
 		.memory_size = PAGE_SIZE * 3,
 		.flags = 0,
@@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
 	}
 
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
 		return 1;  /* already handled by vmx_vcpu_run() */
 
 	if (is_no_device(intr_info)) {
@@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	rep = (exit_qualification & 32) != 0;
 	port = exit_qualification >> 16;
 
+	skip_emulated_instruction(vcpu);
 	return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
 }
 
@@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 
 	KVMTRACE_0D(PEND_INTR, vcpu, handler);
+	++vcpu->stat.irq_window_exits;
 
 	/*
 	 * If the user space waits to inject interrupts, exit as soon as
@@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 	if (kvm_run->request_interrupt_window &&
 	    !vcpu->arch.irq_summary) {
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-		++vcpu->stat.irq_window_exits;
 		return 0;
 	}
 	return 1;
@@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long exit_qualification;
 	u16 tss_selector;
 	int reason;
@@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	reason = (u32)exit_qualification >> 30;
+	if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
+	    (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+	    (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
+	    == INTR_TYPE_NMI_INTR) {
+		vcpu->arch.nmi_injected = false;
+		if (cpu_has_virtual_nmis())
+			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+				      GUEST_INTR_STATE_NMI);
+	}
 	tss_selector = exit_qualification;
 
 	return kvm_task_switch(vcpu, tss_selector, reason);
@@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 	while (!guest_state_valid(vcpu)) {
 		err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
 
-		switch (err) {
-			case EMULATE_DONE:
-				break;
-			case EMULATE_DO_MMIO:
-				kvm_report_emulation_failure(vcpu, "mmio");
-				/* TODO: Handle MMIO */
-				return;
-			default:
-				kvm_report_emulation_failure(vcpu, "emulation failure");
-				return;
+		if (err == EMULATE_DO_MMIO)
+			break;
+
+		if (err != EMULATE_DONE) {
+			kvm_report_emulation_failure(vcpu, "emulation failure");
+			return;
 		}
 
 		if (signal_pending(current))
@@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 	local_irq_disable();
 	preempt_disable();
 
-	/* Guest state should be valid now, no more emulation should be needed */
-	vmx->emulation_required = 0;
+	/* Guest state should be valid now except if we need to
+	 * emulate an MMIO */
+	if (guest_state_valid(vcpu))
+		vmx->emulation_required = 0;
 }
 
 /*
@@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
 		    (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
 
+	/* If we need to emulate an MMIO from handle_invalid_guest_state
+	 * we just return 0 */
+	if (vmx->emulation_required && emulate_invalid_guest_state)
+		return 0;
+
 	/* Access CR3 don't cause VMExit in paging mode, so we need
 	 * to sync with guest real CR3. */
 	if (vm_need_ept() && is_paging(vcpu)) {
@@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
 			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
-			exit_reason != EXIT_REASON_EPT_VIOLATION))
-		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
-		       "exit reason is 0x%x\n", __func__, exit_reason);
+			exit_reason != EXIT_REASON_EPT_VIOLATION &&
+			exit_reason != EXIT_REASON_TASK_SWITCH))
+		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
+		       "(0x%x) and exit reason is 0x%x\n",
+		       __func__, vectoring_info, exit_reason);
+
+	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+		if (vcpu->arch.interrupt_window_open) {
+			vmx->soft_vnmi_blocked = 0;
+			vcpu->arch.nmi_window_open = 1;
+		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
+			   vcpu->arch.nmi_pending) {
+			/*
+			 * This CPU don't support us in finding the end of an
+			 * NMI-blocked window if the guest runs with IRQs
+			 * disabled. So we pull the trigger after 1 s of
+			 * futile waiting, but inform the user about this.
+			 */
+			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+			       "state on VCPU %d after 1 s timeout\n",
+			       __func__, vcpu->vcpu_id);
+			vmx->soft_vnmi_blocked = 0;
+			vmx->vcpu.arch.nmi_window_open = 1;
+		}
+	}
+
 	if (exit_reason < kvm_vmx_max_exit_handlers
 	    && kvm_vmx_exit_handlers[exit_reason])
 		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
 	vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-	u32 cpu_based_vm_exec_control;
-
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
-	u32 cpu_based_vm_exec_control;
-
-	if (!cpu_has_virtual_nmis())
-		return;
-
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
-{
-	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	return !(guest_intr & (GUEST_INTR_STATE_NMI |
-			       GUEST_INTR_STATE_MOV_SS |
-			       GUEST_INTR_STATE_STI));
-}
-
-static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
-{
-	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
-			       GUEST_INTR_STATE_STI)) &&
-		(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
-}
-
-static void enable_intr_window(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->arch.nmi_pending)
-		enable_nmi_window(vcpu);
-	else if (kvm_cpu_has_interrupt(vcpu))
-		enable_irq_window(vcpu);
-}
-
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		if (unblock_nmi && vector != DF_VECTOR)
 			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 				      GUEST_INTR_STATE_NMI);
-	}
+	} else if (unlikely(vmx->soft_vnmi_blocked))
+		vmx->vnmi_blocked_time +=
+			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 
 	idt_vectoring_info = vmx->idt_vectoring_info;
 	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3147,26 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
 	update_tpr_threshold(vcpu);
 
-	if (cpu_has_virtual_nmis()) {
-		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-			if (vcpu->arch.interrupt.pending) {
-				enable_nmi_window(vcpu);
-			} else if (vmx_nmi_enabled(vcpu)) {
-				vcpu->arch.nmi_pending = false;
-				vcpu->arch.nmi_injected = true;
-			} else {
-				enable_intr_window(vcpu);
-				return;
-			}
-		}
-		if (vcpu->arch.nmi_injected) {
-			vmx_inject_nmi(vcpu);
-			enable_intr_window(vcpu);
+	vmx_update_window_states(vcpu);
+
+	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+		if (vcpu->arch.interrupt.pending) {
+			enable_nmi_window(vcpu);
+		} else if (vcpu->arch.nmi_window_open) {
+			vcpu->arch.nmi_pending = false;
+			vcpu->arch.nmi_injected = true;
+		} else {
+			enable_nmi_window(vcpu);
 			return;
 		}
 	}
+	if (vcpu->arch.nmi_injected) {
+		vmx_inject_nmi(vcpu);
+		if (vcpu->arch.nmi_pending)
+			enable_nmi_window(vcpu);
+		else if (kvm_cpu_has_interrupt(vcpu))
+			enable_irq_window(vcpu);
+		return;
+	}
 	if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
-		if (vmx_irq_enabled(vcpu))
+		if (vcpu->arch.interrupt_window_open)
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
 		else
 			enable_irq_window(vcpu);
@@ -3174,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.interrupt.pending) {
 		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
 		kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+		if (kvm_cpu_has_interrupt(vcpu))
+			enable_irq_window(vcpu);
 	}
 }
 
@@ -3213,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info;
 
+	/* Record the guest's net vcpu time for enforced NMI injections. */
+	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+		vmx->entry_time = ktime_get();
+
 	/* Handle invalid guest state instead of entering VMX */
 	if (vmx->emulation_required && emulate_invalid_guest_state) {
 		handle_invalid_guest_state(vcpu, kvm_run);
@@ -3327,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
-	vcpu->arch.interrupt_window_open =
-		(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-		 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
+	vmx_update_window_states(vcpu);
 
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
@@ -3337,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
 	/* We need to handle NMIs before interrupts are enabled */
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
 	    (intr_info & INTR_INFO_VALID_MASK)) {
 		KVMTRACE_0D(NMI, vcpu, handler);
 		asm("int $2");
@@ -3455,6 +3571,11 @@ static int get_ept_level(void)
 	return VMX_EPT_DEFAULT_GAW + 1;
 }
 
+static int vmx_get_mt_mask_shift(void)
+{
+	return VMX_EPT_MT_EPTE_SHIFT;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -3510,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
+	.get_mt_mask_shift = vmx_get_mt_mask_shift,
 };
 
 static int __init vmx_init(void)
@@ -3566,10 +3688,10 @@ static int __init vmx_init(void)
 		bypass_guest_pf = 0;
 		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
 			VMX_EPT_WRITABLE_MASK |
-			VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
 			VMX_EPT_IGMT_BIT);
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
-				VMX_EPT_EXECUTABLE_MASK);
+				VMX_EPT_EXECUTABLE_MASK,
+				VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
 		kvm_enable_tdp();
 	} else
 		kvm_disable_tdp();
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
deleted file mode 100644
index ec5edc339da6..000000000000
--- a/arch/x86/kvm/vmx.h
+++ /dev/null
@@ -1,359 +0,0 @@
-#ifndef VMX_H
-#define VMX_H
-
-/*
- * vmx.h: VMX Architecture related definitions
- * Copyright (c) 2004, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * A few random additions are:
- * Copyright (C) 2006 Qumranet
- *    Avi Kivity <avi@qumranet.com>
- *    Yaniv Kamay <yaniv@qumranet.com>
- *
- */
-
-/*
- * Definitions of Primary Processor-Based VM-Execution Controls.
- */
-#define CPU_BASED_VIRTUAL_INTR_PENDING          0x00000004
-#define CPU_BASED_USE_TSC_OFFSETING             0x00000008
-#define CPU_BASED_HLT_EXITING                   0x00000080
-#define CPU_BASED_INVLPG_EXITING                0x00000200
-#define CPU_BASED_MWAIT_EXITING                 0x00000400
-#define CPU_BASED_RDPMC_EXITING                 0x00000800
-#define CPU_BASED_RDTSC_EXITING                 0x00001000
-#define CPU_BASED_CR3_LOAD_EXITING		0x00008000
-#define CPU_BASED_CR3_STORE_EXITING		0x00010000
-#define CPU_BASED_CR8_LOAD_EXITING              0x00080000
-#define CPU_BASED_CR8_STORE_EXITING             0x00100000
-#define CPU_BASED_TPR_SHADOW                    0x00200000
-#define CPU_BASED_VIRTUAL_NMI_PENDING		0x00400000
-#define CPU_BASED_MOV_DR_EXITING                0x00800000
-#define CPU_BASED_UNCOND_IO_EXITING             0x01000000
-#define CPU_BASED_USE_IO_BITMAPS                0x02000000
-#define CPU_BASED_USE_MSR_BITMAPS               0x10000000
-#define CPU_BASED_MONITOR_EXITING               0x20000000
-#define CPU_BASED_PAUSE_EXITING                 0x40000000
-#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS   0x80000000
-/*
- * Definitions of Secondary Processor-Based VM-Execution Controls.
- */
-#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
-#define SECONDARY_EXEC_ENABLE_EPT               0x00000002
-#define SECONDARY_EXEC_ENABLE_VPID              0x00000020
-#define SECONDARY_EXEC_WBINVD_EXITING		0x00000040
-
-
-#define PIN_BASED_EXT_INTR_MASK                 0x00000001
-#define PIN_BASED_NMI_EXITING                   0x00000008
-#define PIN_BASED_VIRTUAL_NMIS                  0x00000020
-
-#define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
-#define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
-
-#define VM_ENTRY_IA32E_MODE                     0x00000200
-#define VM_ENTRY_SMM                            0x00000400
-#define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
-
-/* VMCS Encodings */
-enum vmcs_field {
-	VIRTUAL_PROCESSOR_ID            = 0x00000000,
-	GUEST_ES_SELECTOR               = 0x00000800,
-	GUEST_CS_SELECTOR               = 0x00000802,
-	GUEST_SS_SELECTOR               = 0x00000804,
-	GUEST_DS_SELECTOR               = 0x00000806,
-	GUEST_FS_SELECTOR               = 0x00000808,
-	GUEST_GS_SELECTOR               = 0x0000080a,
-	GUEST_LDTR_SELECTOR             = 0x0000080c,
-	GUEST_TR_SELECTOR               = 0x0000080e,
-	HOST_ES_SELECTOR                = 0x00000c00,
-	HOST_CS_SELECTOR                = 0x00000c02,
-	HOST_SS_SELECTOR                = 0x00000c04,
-	HOST_DS_SELECTOR                = 0x00000c06,
-	HOST_FS_SELECTOR                = 0x00000c08,
-	HOST_GS_SELECTOR                = 0x00000c0a,
-	HOST_TR_SELECTOR                = 0x00000c0c,
-	IO_BITMAP_A                     = 0x00002000,
-	IO_BITMAP_A_HIGH                = 0x00002001,
-	IO_BITMAP_B                     = 0x00002002,
-	IO_BITMAP_B_HIGH                = 0x00002003,
-	MSR_BITMAP                      = 0x00002004,
-	MSR_BITMAP_HIGH                 = 0x00002005,
-	VM_EXIT_MSR_STORE_ADDR          = 0x00002006,
-	VM_EXIT_MSR_STORE_ADDR_HIGH     = 0x00002007,
-	VM_EXIT_MSR_LOAD_ADDR           = 0x00002008,
-	VM_EXIT_MSR_LOAD_ADDR_HIGH      = 0x00002009,
-	VM_ENTRY_MSR_LOAD_ADDR          = 0x0000200a,
-	VM_ENTRY_MSR_LOAD_ADDR_HIGH     = 0x0000200b,
-	TSC_OFFSET                      = 0x00002010,
-	TSC_OFFSET_HIGH                 = 0x00002011,
-	VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
-	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
-	APIC_ACCESS_ADDR		= 0x00002014,
-	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
-	EPT_POINTER                     = 0x0000201a,
-	EPT_POINTER_HIGH                = 0x0000201b,
-	GUEST_PHYSICAL_ADDRESS          = 0x00002400,
-	GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
-	VMCS_LINK_POINTER               = 0x00002800,
-	VMCS_LINK_POINTER_HIGH          = 0x00002801,
-	GUEST_IA32_DEBUGCTL             = 0x00002802,
-	GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
-	GUEST_PDPTR0                    = 0x0000280a,
-	GUEST_PDPTR0_HIGH               = 0x0000280b,
-	GUEST_PDPTR1                    = 0x0000280c,
-	GUEST_PDPTR1_HIGH               = 0x0000280d,
-	GUEST_PDPTR2                    = 0x0000280e,
-	GUEST_PDPTR2_HIGH               = 0x0000280f,
-	GUEST_PDPTR3                    = 0x00002810,
-	GUEST_PDPTR3_HIGH               = 0x00002811,
-	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
-	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
-	EXCEPTION_BITMAP                = 0x00004004,
-	PAGE_FAULT_ERROR_CODE_MASK      = 0x00004006,
-	PAGE_FAULT_ERROR_CODE_MATCH     = 0x00004008,
-	CR3_TARGET_COUNT                = 0x0000400a,
-	VM_EXIT_CONTROLS                = 0x0000400c,
-	VM_EXIT_MSR_STORE_COUNT         = 0x0000400e,
-	VM_EXIT_MSR_LOAD_COUNT          = 0x00004010,
-	VM_ENTRY_CONTROLS               = 0x00004012,
-	VM_ENTRY_MSR_LOAD_COUNT         = 0x00004014,
-	VM_ENTRY_INTR_INFO_FIELD        = 0x00004016,
-	VM_ENTRY_EXCEPTION_ERROR_CODE   = 0x00004018,
-	VM_ENTRY_INSTRUCTION_LEN        = 0x0000401a,
-	TPR_THRESHOLD                   = 0x0000401c,
-	SECONDARY_VM_EXEC_CONTROL       = 0x0000401e,
-	VM_INSTRUCTION_ERROR            = 0x00004400,
-	VM_EXIT_REASON                  = 0x00004402,
-	VM_EXIT_INTR_INFO               = 0x00004404,
-	VM_EXIT_INTR_ERROR_CODE         = 0x00004406,
-	IDT_VECTORING_INFO_FIELD        = 0x00004408,
-	IDT_VECTORING_ERROR_CODE        = 0x0000440a,
-	VM_EXIT_INSTRUCTION_LEN         = 0x0000440c,
-	VMX_INSTRUCTION_INFO            = 0x0000440e,
-	GUEST_ES_LIMIT                  = 0x00004800,
-	GUEST_CS_LIMIT                  = 0x00004802,
-	GUEST_SS_LIMIT                  = 0x00004804,
-	GUEST_DS_LIMIT                  = 0x00004806,
-	GUEST_FS_LIMIT                  = 0x00004808,
-	GUEST_GS_LIMIT                  = 0x0000480a,
-	GUEST_LDTR_LIMIT                = 0x0000480c,
-	GUEST_TR_LIMIT                  = 0x0000480e,
-	GUEST_GDTR_LIMIT                = 0x00004810,
-	GUEST_IDTR_LIMIT                = 0x00004812,
-	GUEST_ES_AR_BYTES               = 0x00004814,
-	GUEST_CS_AR_BYTES               = 0x00004816,
-	GUEST_SS_AR_BYTES               = 0x00004818,
-	GUEST_DS_AR_BYTES               = 0x0000481a,
-	GUEST_FS_AR_BYTES               = 0x0000481c,
-	GUEST_GS_AR_BYTES               = 0x0000481e,
-	GUEST_LDTR_AR_BYTES             = 0x00004820,
-	GUEST_TR_AR_BYTES               = 0x00004822,
-	GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
-	GUEST_ACTIVITY_STATE            = 0X00004826,
-	GUEST_SYSENTER_CS               = 0x0000482A,
-	HOST_IA32_SYSENTER_CS           = 0x00004c00,
-	CR0_GUEST_HOST_MASK             = 0x00006000,
-	CR4_GUEST_HOST_MASK             = 0x00006002,
-	CR0_READ_SHADOW                 = 0x00006004,
-	CR4_READ_SHADOW                 = 0x00006006,
-	CR3_TARGET_VALUE0               = 0x00006008,
-	CR3_TARGET_VALUE1               = 0x0000600a,
-	CR3_TARGET_VALUE2               = 0x0000600c,
-	CR3_TARGET_VALUE3               = 0x0000600e,
-	EXIT_QUALIFICATION              = 0x00006400,
-	GUEST_LINEAR_ADDRESS            = 0x0000640a,
-	GUEST_CR0                       = 0x00006800,
-	GUEST_CR3                       = 0x00006802,
-	GUEST_CR4                       = 0x00006804,
-	GUEST_ES_BASE                   = 0x00006806,
-	GUEST_CS_BASE                   = 0x00006808,
-	GUEST_SS_BASE                   = 0x0000680a,
-	GUEST_DS_BASE                   = 0x0000680c,
-	GUEST_FS_BASE                   = 0x0000680e,
-	GUEST_GS_BASE                   = 0x00006810,
-	GUEST_LDTR_BASE                 = 0x00006812,
-	GUEST_TR_BASE                   = 0x00006814,
-	GUEST_GDTR_BASE                 = 0x00006816,
-	GUEST_IDTR_BASE                 = 0x00006818,
-	GUEST_DR7                       = 0x0000681a,
-	GUEST_RSP                       = 0x0000681c,
-	GUEST_RIP                       = 0x0000681e,
-	GUEST_RFLAGS                    = 0x00006820,
-	GUEST_PENDING_DBG_EXCEPTIONS    = 0x00006822,
-	GUEST_SYSENTER_ESP              = 0x00006824,
-	GUEST_SYSENTER_EIP              = 0x00006826,
-	HOST_CR0                        = 0x00006c00,
-	HOST_CR3                        = 0x00006c02,
-	HOST_CR4                        = 0x00006c04,
-	HOST_FS_BASE                    = 0x00006c06,
-	HOST_GS_BASE                    = 0x00006c08,
-	HOST_TR_BASE                    = 0x00006c0a,
-	HOST_GDTR_BASE                  = 0x00006c0c,
-	HOST_IDTR_BASE                  = 0x00006c0e,
-	HOST_IA32_SYSENTER_ESP          = 0x00006c10,
-	HOST_IA32_SYSENTER_EIP          = 0x00006c12,
-	HOST_RSP                        = 0x00006c14,
-	HOST_RIP                        = 0x00006c16,
-};
-
-#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000
-
-#define EXIT_REASON_EXCEPTION_NMI       0
-#define EXIT_REASON_EXTERNAL_INTERRUPT  1
-#define EXIT_REASON_TRIPLE_FAULT        2
-
-#define EXIT_REASON_PENDING_INTERRUPT   7
-#define EXIT_REASON_NMI_WINDOW		8
-#define EXIT_REASON_TASK_SWITCH         9
-#define EXIT_REASON_CPUID               10
-#define EXIT_REASON_HLT                 12
-#define EXIT_REASON_INVLPG              14
-#define EXIT_REASON_RDPMC               15
-#define EXIT_REASON_RDTSC               16
-#define EXIT_REASON_VMCALL              18
-#define EXIT_REASON_VMCLEAR             19
-#define EXIT_REASON_VMLAUNCH            20
-#define EXIT_REASON_VMPTRLD             21
-#define EXIT_REASON_VMPTRST             22
-#define EXIT_REASON_VMREAD              23
-#define EXIT_REASON_VMRESUME            24
-#define EXIT_REASON_VMWRITE             25
-#define EXIT_REASON_VMOFF               26
-#define EXIT_REASON_VMON                27
-#define EXIT_REASON_CR_ACCESS           28
-#define EXIT_REASON_DR_ACCESS           29
-#define EXIT_REASON_IO_INSTRUCTION      30
-#define EXIT_REASON_MSR_READ            31
-#define EXIT_REASON_MSR_WRITE           32
-#define EXIT_REASON_MWAIT_INSTRUCTION   36
-#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
-#define EXIT_REASON_APIC_ACCESS         44
-#define EXIT_REASON_EPT_VIOLATION       48
-#define EXIT_REASON_EPT_MISCONFIG       49
-#define EXIT_REASON_WBINVD		54
-
-/*
- * Interruption-information format
- */
-#define INTR_INFO_VECTOR_MASK           0xff            /* 7:0 */
-#define INTR_INFO_INTR_TYPE_MASK        0x700           /* 10:8 */
-#define INTR_INFO_DELIVER_CODE_MASK     0x800           /* 11 */
-#define INTR_INFO_UNBLOCK_NMI		0x1000		/* 12 */
-#define INTR_INFO_VALID_MASK            0x80000000      /* 31 */
-#define INTR_INFO_RESVD_BITS_MASK       0x7ffff000
-
-#define VECTORING_INFO_VECTOR_MASK           	INTR_INFO_VECTOR_MASK
-#define VECTORING_INFO_TYPE_MASK        	INTR_INFO_INTR_TYPE_MASK
-#define VECTORING_INFO_DELIVER_CODE_MASK    	INTR_INFO_DELIVER_CODE_MASK
-#define VECTORING_INFO_VALID_MASK       	INTR_INFO_VALID_MASK
-
-#define INTR_TYPE_EXT_INTR              (0 << 8) /* external interrupt */
-#define INTR_TYPE_NMI_INTR		(2 << 8) /* NMI */
-#define INTR_TYPE_EXCEPTION             (3 << 8) /* processor exception */
-#define INTR_TYPE_SOFT_INTR             (4 << 8) /* software interrupt */
-
-/* GUEST_INTERRUPTIBILITY_INFO flags. */
-#define GUEST_INTR_STATE_STI		0x00000001
-#define GUEST_INTR_STATE_MOV_SS		0x00000002
-#define GUEST_INTR_STATE_SMI		0x00000004
-#define GUEST_INTR_STATE_NMI		0x00000008
-
-/*
- * Exit Qualifications for MOV for Control Register Access
- */
-#define CONTROL_REG_ACCESS_NUM          0x7     /* 2:0, number of control reg.*/
-#define CONTROL_REG_ACCESS_TYPE         0x30    /* 5:4, access type */
-#define CONTROL_REG_ACCESS_REG          0xf00   /* 10:8, general purpose reg. */
-#define LMSW_SOURCE_DATA_SHIFT 16
-#define LMSW_SOURCE_DATA  (0xFFFF << LMSW_SOURCE_DATA_SHIFT) /* 16:31 lmsw source */
-#define REG_EAX                         (0 << 8)
-#define REG_ECX                         (1 << 8)
-#define REG_EDX                         (2 << 8)
-#define REG_EBX                         (3 << 8)
-#define REG_ESP                         (4 << 8)
-#define REG_EBP                         (5 << 8)
-#define REG_ESI                         (6 << 8)
-#define REG_EDI                         (7 << 8)
-#define REG_R8                         (8 << 8)
-#define REG_R9                         (9 << 8)
-#define REG_R10                        (10 << 8)
-#define REG_R11                        (11 << 8)
-#define REG_R12                        (12 << 8)
-#define REG_R13                        (13 << 8)
-#define REG_R14                        (14 << 8)
-#define REG_R15                        (15 << 8)
-
-/*
- * Exit Qualifications for MOV for Debug Register Access
- */
-#define DEBUG_REG_ACCESS_NUM            0x7     /* 2:0, number of debug reg. */
-#define DEBUG_REG_ACCESS_TYPE           0x10    /* 4, direction of access */
-#define TYPE_MOV_TO_DR                  (0 << 4)
-#define TYPE_MOV_FROM_DR                (1 << 4)
-#define DEBUG_REG_ACCESS_REG            0xf00   /* 11:8, general purpose reg. */
-
-
-/* segment AR */
-#define SEGMENT_AR_L_MASK (1 << 13)
-
-#define AR_TYPE_ACCESSES_MASK 1
-#define AR_TYPE_READABLE_MASK (1 << 1)
-#define AR_TYPE_WRITEABLE_MASK (1 << 2)
-#define AR_TYPE_CODE_MASK (1 << 3)
-#define AR_TYPE_MASK 0x0f
-#define AR_TYPE_BUSY_64_TSS 11
-#define AR_TYPE_BUSY_32_TSS 11
-#define AR_TYPE_BUSY_16_TSS 3
-#define AR_TYPE_LDT 2
-
-#define AR_UNUSABLE_MASK (1 << 16)
-#define AR_S_MASK (1 << 4)
-#define AR_P_MASK (1 << 7)
-#define AR_L_MASK (1 << 13)
-#define AR_DB_MASK (1 << 14)
-#define AR_G_MASK (1 << 15)
-#define AR_DPL_SHIFT 5
-#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
-
-#define AR_RESERVD_MASK 0xfffe0f00
-
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	10
-
-#define VMX_NR_VPIDS				(1 << 16)
-#define VMX_VPID_EXTENT_SINGLE_CONTEXT		1
-#define VMX_VPID_EXTENT_ALL_CONTEXT		2
-
-#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR		0
-#define VMX_EPT_EXTENT_CONTEXT			1
-#define VMX_EPT_EXTENT_GLOBAL			2
-#define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)
-#define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
-#define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
-#define VMX_EPT_DEFAULT_GAW			3
-#define VMX_EPT_MAX_GAW				0x4
-#define VMX_EPT_MT_EPTE_SHIFT			3
-#define VMX_EPT_GAW_EPTP_SHIFT			3
-#define VMX_EPT_DEFAULT_MT			0x6ull
-#define VMX_EPT_READABLE_MASK			0x1ull
-#define VMX_EPT_WRITABLE_MASK			0x2ull
-#define VMX_EPT_EXECUTABLE_MASK			0x4ull
-#define VMX_EPT_IGMT_BIT    			(1ull << 6)
-
-#define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
-
-#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1f8ff2f1fa2..cc17546a2406 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -34,11 +34,13 @@
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/highmem.h>
+#include <linux/iommu.h>
 #include <linux/intel-iommu.h>
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
+#include <asm/mtrr.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -86,6 +88,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "hypercalls", VCPU_STAT(hypercalls) },
 	{ "request_irq", VCPU_STAT(request_irq_exits) },
+	{ "request_nmi", VCPU_STAT(request_nmi_exits) },
 	{ "irq_exits", VCPU_STAT(irq_exits) },
 	{ "host_state_reload", VCPU_STAT(host_state_reload) },
 	{ "efer_reload", VCPU_STAT(efer_reload) },
@@ -93,6 +96,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "insn_emulation", VCPU_STAT(insn_emulation) },
 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 	{ "irq_injections", VCPU_STAT(irq_injections) },
+	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -101,6 +105,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
+	{ "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages) },
 	{ NULL }
@@ -312,6 +317,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 	vcpu->arch.cr0 = cr0;
 
+	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 	return;
 }
@@ -355,6 +361,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
+	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -449,7 +456,7 @@ static u32 msrs_to_save[] = {
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
 	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-	MSR_IA32_PERF_STATUS,
+	MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
 };
 
 static unsigned num_msrs_to_save;
@@ -648,10 +655,38 @@ static bool msr_mtrr_valid(unsigned msr)
 
 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
+	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
 	if (!msr_mtrr_valid(msr))
 		return 1;
 
-	vcpu->arch.mtrr[msr - 0x200] = data;
+	if (msr == MSR_MTRRdefType) {
+		vcpu->arch.mtrr_state.def_type = data;
+		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
+	} else if (msr == MSR_MTRRfix64K_00000)
+		p[0] = data;
+	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+		p[1 + msr - MSR_MTRRfix16K_80000] = data;
+	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
+	else if (msr == MSR_IA32_CR_PAT)
+		vcpu->arch.pat = data;
+	else {	/* Variable MTRRs */
+		int idx, is_mtrr_mask;
+		u64 *pt;
+
+		idx = (msr - 0x200) / 2;
+		is_mtrr_mask = msr - 0x200 - 2 * idx;
+		if (!is_mtrr_mask)
+			pt =
+			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+		else
+			pt =
+			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+		*pt = data;
+	}
+
+	kvm_mmu_reset_context(vcpu);
 	return 0;
 }
 
@@ -747,10 +782,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 
 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
+	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
 	if (!msr_mtrr_valid(msr))
 		return 1;
 
-	*pdata = vcpu->arch.mtrr[msr - 0x200];
+	if (msr == MSR_MTRRdefType)
+		*pdata = vcpu->arch.mtrr_state.def_type +
+			 (vcpu->arch.mtrr_state.enabled << 10);
+	else if (msr == MSR_MTRRfix64K_00000)
+		*pdata = p[0];
+	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
+	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
+	else if (msr == MSR_IA32_CR_PAT)
+		*pdata = vcpu->arch.pat;
+	else {	/* Variable MTRRs */
+		int idx, is_mtrr_mask;
+		u64 *pt;
+
+		idx = (msr - 0x200) / 2;
+		is_mtrr_mask = msr - 0x200 - 2 * idx;
+		if (!is_mtrr_mask)
+			pt =
+			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+		else
+			pt =
+			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+		*pdata = *pt;
+	}
+
 	return 0;
 }
 
@@ -903,7 +965,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_IRQCHIP:
 	case KVM_CAP_HLT:
 	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
-	case KVM_CAP_USER_MEMORY:
 	case KVM_CAP_SET_TSS_ADDR:
 	case KVM_CAP_EXT_CPUID:
 	case KVM_CAP_CLOCKSOURCE:
@@ -929,7 +990,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = !tdp_enabled;
 		break;
 	case KVM_CAP_IOMMU:
-		r = intel_iommu_found();
+		r = iommu_found();
 		break;
 	default:
 		r = 0;
@@ -1188,6 +1249,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		int t, times = entry->eax & 0xff;
 
 		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
 		for (t = 1; t < times && *nent < maxnent; ++t) {
 			do_cpuid_1_ent(&entry[t], function, 0);
 			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
@@ -1218,7 +1280,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* read more entries until level_type is zero */
 		for (i = 1; *nent < maxnent; ++i) {
-			level_type = entry[i - 1].ecx & 0xff;
+			level_type = entry[i - 1].ecx & 0xff00;
 			if (!level_type)
 				break;
 			do_cpuid_1_ent(&entry[i], function, i);
@@ -1318,6 +1380,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+	vcpu_load(vcpu);
+	kvm_inject_nmi(vcpu);
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
 					   struct kvm_tpr_access_ctl *tac)
 {
@@ -1377,6 +1448,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_NMI: {
+		r = kvm_vcpu_ioctl_nmi(vcpu);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	case KVM_SET_CPUID: {
 		struct kvm_cpuid __user *cpuid_arg = argp;
 		struct kvm_cpuid cpuid;
@@ -1968,7 +2046,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
 	if (ret < 0)
 		return 0;
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
 	return 1;
 }
 
@@ -2404,8 +2482,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	memcpy(vcpu->arch.pio_data, &val, 4);
 
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
-
 	pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
 	if (pio_dev) {
 		kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
@@ -2541,7 +2617,7 @@ int kvm_arch_init(void *opaque)
 	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
 	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-			PT_DIRTY_MASK, PT64_NX_MASK, 0);
+			PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
 	return 0;
 
 out:
@@ -2729,7 +2805,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
 
 	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
 	/* when no next entry is found, the current entry[i] is reselected */
-	for (j = i + 1; j == i; j = (j + 1) % nent) {
+	for (j = i + 1; ; j = (j + 1) % nent) {
 		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
 		if (ej->function == e->function) {
 			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
@@ -2973,7 +3049,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		pr_debug("vcpu %d received sipi with vector # %x\n",
 			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
 		kvm_lapic_reset(vcpu);
-		r = kvm_x86_ops->vcpu_reset(vcpu);
+		r = kvm_arch_vcpu_reset(vcpu);
 		if (r)
 			return r;
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3275,9 +3351,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
 	kvm_desct->padding = 0;
 }
 
-static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
-					   u16 selector,
-					   struct descriptor_table *dtable)
+static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
+					  u16 selector,
+					  struct descriptor_table *dtable)
 {
 	if (selector & 1 << 2) {
 		struct kvm_segment kvm_seg;
@@ -3302,7 +3378,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 	struct descriptor_table dtable;
 	u16 index = selector >> 3;
 
-	get_segment_descritptor_dtable(vcpu, selector, &dtable);
+	get_segment_descriptor_dtable(vcpu, selector, &dtable);
 
 	if (dtable.limit < index * 8 + 7) {
 		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
@@ -3321,7 +3397,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 	struct descriptor_table dtable;
 	u16 index = selector >> 3;
 
-	get_segment_descritptor_dtable(vcpu, selector, &dtable);
+	get_segment_descriptor_dtable(vcpu, selector, &dtable);
 
 	if (dtable.limit < index * 8 + 7)
 		return 1;
@@ -3900,6 +3976,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	/* We do fxsave: this must be aligned. */
 	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
 
+	vcpu->arch.mtrr_state.have_fixed = 1;
 	vcpu_load(vcpu);
 	r = kvm_arch_vcpu_reset(vcpu);
 	if (r == 0)
@@ -3925,6 +4002,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 {
+	vcpu->arch.nmi_pending = false;
+	vcpu->arch.nmi_injected = false;
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -4012,6 +4092,7 @@ struct  kvm *kvm_arch_create_vm(void)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4048,8 +4129,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
-	kvm_iommu_unmap_guest(kvm);
 	kvm_free_all_assigned_devices(kvm);
+	kvm_iommu_unmap_guest(kvm);
 	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
@@ -4127,7 +4208,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
-	       || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
+	       || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+	       || vcpu->arch.nmi_pending;
 }
 
 static void vcpu_kick_intr(void *info)
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ea051173b0da..d174db7a3370 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -58,6 +58,7 @@
 #define SrcMem32    (4<<4)	/* Memory operand (32-bit). */
 #define SrcImm      (5<<4)	/* Immediate operand. */
 #define SrcImmByte  (6<<4)	/* 8-bit sign-extended immediate operand. */
+#define SrcOne      (7<<4)	/* Implied '1' */
 #define SrcMask     (7<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<7)
@@ -70,17 +71,23 @@
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
+/* Source 2 operand type */
+#define Src2None    (0<<29)
+#define Src2CL      (1<<29)
+#define Src2ImmByte (2<<29)
+#define Src2One     (3<<29)
+#define Src2Mask    (7<<29)
 
 enum {
 	Group1_80, Group1_81, Group1_82, Group1_83,
 	Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
 };
 
-static u16 opcode_table[256] = {
+static u32 opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
+	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
 	/* 0x08 - 0x0F */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +202,7 @@ static u16 opcode_table[256] = {
 	ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
 };
 
-static u16 twobyte_table[256] = {
+static u32 twobyte_table[256] = {
 	/* 0x00 - 0x0F */
 	0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
 	ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
@@ -230,9 +237,14 @@ static u16 twobyte_table[256] = {
 	/* 0x90 - 0x9F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xA0 - 0xA7 */
-	0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+	0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+	DstMem | SrcReg | Src2ImmByte | ModRM,
+	DstMem | SrcReg | Src2CL | ModRM, 0, 0,
 	/* 0xA8 - 0xAF */
-	0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
+	0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+	DstMem | SrcReg | Src2ImmByte | ModRM,
+	DstMem | SrcReg | Src2CL | ModRM,
+	ModRM, 0,
 	/* 0xB0 - 0xB7 */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
 	    DstMem | SrcReg | ModRM | BitOp,
@@ -253,7 +265,7 @@ static u16 twobyte_table[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-static u16 group_table[] = {
+static u32 group_table[] = {
 	[Group1_80*8] =
 	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
 	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
@@ -297,9 +309,9 @@ static u16 group_table[] = {
 	SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
 };
 
-static u16 group2_table[] = {
+static u32 group2_table[] = {
 	[Group7*8] =
-	SrcNone | ModRM, 0, 0, 0,
+	SrcNone | ModRM, 0, 0, SrcNone | ModRM,
 	SrcNone | ModRM | DstMem | Mov, 0,
 	SrcMem16 | ModRM | Mov, 0,
 };
@@ -359,49 +371,48 @@ static u16 group2_table[] = {
 	"andl %"_msk",%"_LO32 _tmp"; "		\
 	"orl  %"_LO32 _tmp",%"_sav"; "
 
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix)	\
+	do {								\
+		__asm__ __volatile__ (					\
+			_PRE_EFLAGS("0", "4", "2")			\
+			_op _suffix " %"_x"3,%1; "			\
+			_POST_EFLAGS("0", "4", "2")			\
+			: "=m" (_eflags), "=m" ((_dst).val),		\
+			  "=&r" (_tmp)					\
+			: _y ((_src).val), "i" (EFLAGS_MASK));		\
+	} while (0)
+
+
 /* Raw emulation: instruction has two explicit operands. */
 #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
-	do { 								    \
-		unsigned long _tmp;					    \
-									    \
-		switch ((_dst).bytes) {					    \
-		case 2:							    \
-			__asm__ __volatile__ (				    \
-				_PRE_EFLAGS("0", "4", "2")		    \
-				_op"w %"_wx"3,%1; "			    \
-				_POST_EFLAGS("0", "4", "2")		    \
-				: "=m" (_eflags), "=m" ((_dst).val),        \
-				  "=&r" (_tmp)				    \
-				: _wy ((_src).val), "i" (EFLAGS_MASK));     \
-			break;						    \
-		case 4:							    \
-			__asm__ __volatile__ (				    \
-				_PRE_EFLAGS("0", "4", "2")		    \
-				_op"l %"_lx"3,%1; "			    \
-				_POST_EFLAGS("0", "4", "2")		    \
-				: "=m" (_eflags), "=m" ((_dst).val),	    \
-				  "=&r" (_tmp)				    \
-				: _ly ((_src).val), "i" (EFLAGS_MASK));     \
-			break;						    \
-		case 8:							    \
-			__emulate_2op_8byte(_op, _src, _dst,		    \
-					    _eflags, _qx, _qy);		    \
-			break;						    \
-		}							    \
+	do {								\
+		unsigned long _tmp;					\
+									\
+		switch ((_dst).bytes) {					\
+		case 2:							\
+			____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
+			break;						\
+		case 4:							\
+			____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
+			break;						\
+		case 8:							\
+			ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
+			break;						\
+		}							\
 	} while (0)
 
 #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
 	do {								     \
-		unsigned long __tmp;					     \
+		unsigned long _tmp;					     \
 		switch ((_dst).bytes) {				             \
 		case 1:							     \
-			__asm__ __volatile__ (				     \
-				_PRE_EFLAGS("0", "4", "2")		     \
-				_op"b %"_bx"3,%1; "			     \
-				_POST_EFLAGS("0", "4", "2")		     \
-				: "=m" (_eflags), "=m" ((_dst).val),	     \
-				  "=&r" (__tmp)				     \
-				: _by ((_src).val), "i" (EFLAGS_MASK));      \
+			____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b");  \
 			break;						     \
 		default:						     \
 			__emulate_2op_nobyte(_op, _src, _dst, _eflags,	     \
@@ -425,71 +436,68 @@ static u16 group2_table[] = {
 	__emulate_2op_nobyte(_op, _src, _dst, _eflags,			\
 			     "w", "r", _LO32, "r", "", "r")
 
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags)                                    \
-	do {								\
-		unsigned long _tmp;					\
-									\
-		switch ((_dst).bytes) {				        \
-		case 1:							\
-			__asm__ __volatile__ (				\
-				_PRE_EFLAGS("0", "3", "2")		\
-				_op"b %1; "				\
-				_POST_EFLAGS("0", "3", "2")		\
-				: "=m" (_eflags), "=m" ((_dst).val),	\
-				  "=&r" (_tmp)				\
-				: "i" (EFLAGS_MASK));			\
-			break;						\
-		case 2:							\
-			__asm__ __volatile__ (				\
-				_PRE_EFLAGS("0", "3", "2")		\
-				_op"w %1; "				\
-				_POST_EFLAGS("0", "3", "2")		\
-				: "=m" (_eflags), "=m" ((_dst).val),	\
-				  "=&r" (_tmp)				\
-				: "i" (EFLAGS_MASK));			\
-			break;						\
-		case 4:							\
-			__asm__ __volatile__ (				\
-				_PRE_EFLAGS("0", "3", "2")		\
-				_op"l %1; "				\
-				_POST_EFLAGS("0", "3", "2")		\
-				: "=m" (_eflags), "=m" ((_dst).val),	\
-				  "=&r" (_tmp)				\
-				: "i" (EFLAGS_MASK));			\
-			break;						\
-		case 8:							\
-			__emulate_1op_8byte(_op, _dst, _eflags);	\
-			break;						\
-		}							\
+/* Instruction has three operands and one operand is stored in ECX register */
+#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) 	\
+	do {									\
+		unsigned long _tmp;						\
+		_type _clv  = (_cl).val;  					\
+		_type _srcv = (_src).val;    					\
+		_type _dstv = (_dst).val;					\
+										\
+		__asm__ __volatile__ (						\
+			_PRE_EFLAGS("0", "5", "2")				\
+			_op _suffix " %4,%1 \n"					\
+			_POST_EFLAGS("0", "5", "2")				\
+			: "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp)		\
+			: "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK)		\
+			); 							\
+										\
+		(_cl).val  = (unsigned long) _clv;				\
+		(_src).val = (unsigned long) _srcv;				\
+		(_dst).val = (unsigned long) _dstv;				\
 	} while (0)
 
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
-	do {								  \
-		__asm__ __volatile__ (					  \
-			_PRE_EFLAGS("0", "4", "2")			  \
-			_op"q %"_qx"3,%1; "				  \
-			_POST_EFLAGS("0", "4", "2")			  \
-			: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-			: _qy ((_src).val), "i" (EFLAGS_MASK));		\
+#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags)				\
+	do {									\
+		switch ((_dst).bytes) {						\
+		case 2:								\
+			__emulate_2op_cl(_op, _cl, _src, _dst, _eflags,  	\
+						"w", unsigned short);         	\
+			break;							\
+		case 4: 							\
+			__emulate_2op_cl(_op, _cl, _src, _dst, _eflags,  	\
+						"l", unsigned int);           	\
+			break;							\
+		case 8:								\
+			ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags,	\
+						"q", unsigned long));  		\
+			break;							\
+		}								\
 	} while (0)
 
-#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
-	do {								  \
-		__asm__ __volatile__ (					  \
-			_PRE_EFLAGS("0", "3", "2")			  \
-			_op"q %1; "					  \
-			_POST_EFLAGS("0", "3", "2")			  \
-			: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-			: "i" (EFLAGS_MASK));				  \
+#define __emulate_1op(_op, _dst, _eflags, _suffix)			\
+	do {								\
+		unsigned long _tmp;					\
+									\
+		__asm__ __volatile__ (					\
+			_PRE_EFLAGS("0", "3", "2")			\
+			_op _suffix " %1; "				\
+			_POST_EFLAGS("0", "3", "2")			\
+			: "=m" (_eflags), "+m" ((_dst).val),		\
+			  "=&r" (_tmp)					\
+			: "i" (EFLAGS_MASK));				\
 	} while (0)
 
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
-#endif				/* __i386__ */
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags)                                    \
+	do {								\
+		switch ((_dst).bytes) {				        \
+		case 1:	__emulate_1op(_op, _dst, _eflags, "b"); break;	\
+		case 2:	__emulate_1op(_op, _dst, _eflags, "w"); break;	\
+		case 4:	__emulate_1op(_op, _dst, _eflags, "l"); break;	\
+		case 8:	ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
+		}							\
+	} while (0)
 
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _size, _eip)                                  \
@@ -1041,6 +1049,33 @@ done_prefixes:
 		c->src.bytes = 1;
 		c->src.val = insn_fetch(s8, 1, c->eip);
 		break;
+	case SrcOne:
+		c->src.bytes = 1;
+		c->src.val = 1;
+		break;
+	}
+
+	/*
+	 * Decode and fetch the second source operand: register, memory
+	 * or immediate.
+	 */
+	switch (c->d & Src2Mask) {
+	case Src2None:
+		break;
+	case Src2CL:
+		c->src2.bytes = 1;
+		c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+		break;
+	case Src2ImmByte:
+		c->src2.type = OP_IMM;
+		c->src2.ptr = (unsigned long *)c->eip;
+		c->src2.bytes = 1;
+		c->src2.val = insn_fetch(u8, 1, c->eip);
+		break;
+	case Src2One:
+		c->src2.bytes = 1;
+		c->src2.val = 1;
+		break;
 	}
 
 	/* Decode and fetch the destination operand: register or memory. */
@@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
 					       c->regs[VCPU_REGS_RSP]);
 }
 
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
-				struct x86_emulate_ops *ops)
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+		       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	rc = ops->read_std(register_address(c, ss_base(ctxt),
-					    c->regs[VCPU_REGS_RSP]),
-			   &c->dst.val, c->dst.bytes, ctxt->vcpu);
+	rc = ops->read_emulated(register_address(c, ss_base(ctxt),
+						 c->regs[VCPU_REGS_RSP]),
+				&c->src.val, c->src.bytes, ctxt->vcpu);
 	if (rc != 0)
 		return rc;
 
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
+	register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
+	return rc;
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
 
+	c->src.bytes = c->dst.bytes;
+	rc = emulate_pop(ctxt, ops);
+	if (rc != 0)
+		return rc;
+	c->dst.val = c->src.val;
 	return 0;
 }
 
@@ -1415,24 +1463,15 @@ special_insn:
 		emulate_1op("dec", c->dst, ctxt->eflags);
 		break;
 	case 0x50 ... 0x57:  /* push reg */
-		c->dst.type  = OP_MEM;
-		c->dst.bytes = c->op_bytes;
-		c->dst.val = c->src.val;
-		register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-					   -c->op_bytes);
-		c->dst.ptr = (void *) register_address(
-			c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
+		emulate_push(ctxt);
 		break;
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
-		if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
-			c->regs[VCPU_REGS_RSP]), c->dst.ptr,
-			c->op_bytes, ctxt->vcpu)) != 0)
+		c->src.bytes = c->op_bytes;
+		rc = emulate_pop(ctxt, ops);
+		if (rc != 0)
 			goto done;
-
-		register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-					   c->op_bytes);
-		c->dst.type = OP_NONE;	/* Disable writeback. */
+		c->dst.val = c->src.val;
 		break;
 	case 0x63:		/* movsxd */
 		if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1591,7 +1630,9 @@ special_insn:
 		emulate_push(ctxt);
 		break;
 	case 0x9d: /* popf */
+		c->dst.type = OP_REG;
 		c->dst.ptr = (unsigned long *) &ctxt->eflags;
+		c->dst.bytes = c->op_bytes;
 		goto pop_instruction;
 	case 0xa0 ... 0xa1:	/* mov */
 		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
@@ -1689,7 +1730,9 @@ special_insn:
 		emulate_grp2(ctxt);
 		break;
 	case 0xc3: /* ret */
+		c->dst.type = OP_REG;
 		c->dst.ptr = &c->eip;
+		c->dst.bytes = c->op_bytes;
 		goto pop_instruction;
 	case 0xc6 ... 0xc7:	/* mov (sole member of Grp11) */
 	mov:
@@ -1778,7 +1821,7 @@ special_insn:
 			c->eip = saved_eip;
 			goto cannot_emulate;
 		}
-		return 0;
+		break;
 	case 0xf4:              /* hlt */
 		ctxt->vcpu->arch.halt_request = 1;
 		break;
@@ -1999,12 +2042,20 @@ twobyte_insn:
 		c->src.val &= (c->dst.bytes << 3) - 1;
 		emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
 		break;
+	case 0xa4: /* shld imm8, r, r/m */
+	case 0xa5: /* shld cl, r, r/m */
+		emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+		break;
 	case 0xab:
 	      bts:		/* bts */
 		/* only subword offset */
 		c->src.val &= (c->dst.bytes << 3) - 1;
 		emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
 		break;
+	case 0xac: /* shrd imm8, r, r/m */
+	case 0xad: /* shrd cl, r, r/m */
+		emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+		break;
 	case 0xae:              /* clflush */
 		break;
 	case 0xb0 ... 0xb1:	/* cmpxchg */