From b3ae2096974b12c3af2ad1a4e7716b084949867f Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Mon, 2 Jul 2012 17:56:33 +0900
Subject: KVM: Introduce kvm_unmap_hva_range() for
 kvm_mmu_notifier_invalidate_range_start()

When we tested KVM under memory pressure, with THP enabled on the host,
we noticed that MMU notifier took a long time to invalidate huge pages.

Since the invalidation was done with mmu_lock held, it not only wasted
the CPU but also made the host harder to respond.

This patch mitigates this by using kvm_handle_hva_range().

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Cc: Alexander Graf <agraf@suse.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/kvm_main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b3ce91c623e2..e2b1a159e5df 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -332,8 +332,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 	 * count is also read inside the mmu_lock critical section.
 	 */
 	kvm->mmu_notifier_count++;
-	for (; start < end; start += PAGE_SIZE)
-		need_tlb_flush |= kvm_unmap_hva(kvm, start);
+	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
 	need_tlb_flush |= kvm->tlbs_dirty;
 	/* we've to flush the tlb before the pages can be freed */
 	if (need_tlb_flush)
-- 
cgit v1.2.1


From 903816fa4d016e20ec71a1a97700cfcdda115580 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 17 Jul 2012 21:54:11 +0800
Subject: KVM: using get_fault_pfn to get the fault pfn

Using get_fault_pfn to cleanup the code

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/kvm_main.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e2b1a159e5df..0fbbf2d21603 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -103,8 +103,8 @@ static bool largepages_enabled = true;
 static struct page *hwpoison_page;
 static pfn_t hwpoison_pfn;
 
-struct page *fault_page;
-pfn_t fault_pfn;
+static struct page *fault_page;
+static pfn_t fault_pfn;
 
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
@@ -949,12 +949,6 @@ int is_hwpoison_pfn(pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
 
-int is_fault_pfn(pfn_t pfn)
-{
-	return pfn == fault_pfn;
-}
-EXPORT_SYMBOL_GPL(is_fault_pfn);
-
 int is_noslot_pfn(pfn_t pfn)
 {
 	return pfn == bad_pfn;
@@ -1038,11 +1032,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-static pfn_t get_fault_pfn(void)
+pfn_t get_fault_pfn(void)
 {
 	get_page(fault_page);
 	return fault_pfn;
 }
+EXPORT_SYMBOL_GPL(get_fault_pfn);
 
 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 	unsigned long start, int write, struct page **page)
-- 
cgit v1.2.1


From ca0565f5736e67af3172d188577b57e303dd082a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 17 Jul 2012 21:54:52 +0800
Subject: KVM: make bad_pfn static to kvm_main.c

bad_pfn is not used out of kvm_main.c, so mark it static, also move it near
hwpoison_pfn and fault_pfn

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/kvm_main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0fbbf2d21603..f955eee92aa9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -100,6 +100,9 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static bool largepages_enabled = true;
 
+struct page *bad_page;
+static pfn_t bad_pfn;
+
 static struct page *hwpoison_page;
 static pfn_t hwpoison_pfn;
 
@@ -2691,9 +2694,6 @@ static struct syscore_ops kvm_syscore_ops = {
 	.resume = kvm_resume,
 };
 
-struct page *bad_page;
-pfn_t bad_pfn;
-
 static inline
 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
 {
-- 
cgit v1.2.1


From d566104853361cc377c61f70e41c1ad3d44b86c6 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 17 Jul 2012 21:56:16 +0800
Subject: KVM: remove the unused parameter of gfn_to_pfn_memslot

The parameter, 'kvm', is not used in gfn_to_pfn_memslot, we can happily remove
it

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/iommu.c    | 10 +++++-----
 virt/kvm/kvm_main.c | 15 +++++++--------
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index e9fff9830bf0..c03f1fb26701 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
 static void kvm_iommu_put_pages(struct kvm *kvm,
 				gfn_t base_gfn, unsigned long npages);
 
-static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
-			   gfn_t gfn, unsigned long size)
+static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+			   unsigned long size)
 {
 	gfn_t end_gfn;
 	pfn_t pfn;
 
-	pfn     = gfn_to_pfn_memslot(kvm, slot, gfn);
+	pfn     = gfn_to_pfn_memslot(slot, gfn);
 	end_gfn = gfn + (size >> PAGE_SHIFT);
 	gfn    += 1;
 
@@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
 		return pfn;
 
 	while (gfn < end_gfn)
-		gfn_to_pfn_memslot(kvm, slot, gfn++);
+		gfn_to_pfn_memslot(slot, gfn++);
 
 	return pfn;
 }
@@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 		 * Pin all pages we are about to map in memory. This is
 		 * important because we unmap and unpin in 4kb steps later.
 		 */
-		pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+		pfn = kvm_pin_pages(slot, gfn, page_size);
 		if (is_error_pfn(pfn)) {
 			gfn += 1;
 			continue;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f955eee92aa9..68dda513cd72 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1062,8 +1062,8 @@ static inline int check_user_page_hwpoison(unsigned long addr)
 	return rc == -EHWPOISON;
 }
 
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
-			bool *async, bool write_fault, bool *writable)
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+			bool write_fault, bool *writable)
 {
 	struct page *page[1];
 	int npages = 0;
@@ -1143,9 +1143,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
 	return pfn;
 }
 
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+pfn_t hva_to_pfn_atomic(unsigned long addr)
 {
-	return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+	return hva_to_pfn(addr, true, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
 
@@ -1163,7 +1163,7 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
 		return page_to_pfn(bad_page);
 	}
 
-	return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+	return hva_to_pfn(addr, atomic, async, write_fault, writable);
 }
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1192,11 +1192,10 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
-			 struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
 	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-	return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+	return hva_to_pfn(addr, false, NULL, true, NULL);
 }
 
 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
-- 
cgit v1.2.1


From 93b6547e2219784b2df790353e083e0bdbebd2c2 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 19 Jul 2012 13:55:53 +0300
Subject: KVM: switch to symbolic name for irq_states size

Use PIC_NUM_PINS instead of hard-coded 16 for pic pins.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/irq_comm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index a6a0365475ed..22aae8fd146b 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -343,11 +343,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
 		switch (ue->u.irqchip.irqchip) {
 		case KVM_IRQCHIP_PIC_MASTER:
 			e->set = kvm_set_pic_irq;
-			max_pin = 16;
+			max_pin = PIC_NUM_PINS;
 			break;
 		case KVM_IRQCHIP_PIC_SLAVE:
 			e->set = kvm_set_pic_irq;
-			max_pin = 16;
+			max_pin = PIC_NUM_PINS;
 			delta = 8;
 			break;
 		case KVM_IRQCHIP_IOAPIC:
-- 
cgit v1.2.1


From f2a743473194a1ad44a85f8b63aeef9d63e5bf47 Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Date: Wed, 18 Jul 2012 19:07:32 +0530
Subject: KVM: Add config to support ple or cpu relax optimzation

Suggested-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com> # on s390x
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/Kconfig | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'virt')

diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 28694f4a9139..d01b24b72c61 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -21,3 +21,6 @@ config KVM_ASYNC_PF
 
 config HAVE_KVM_MSI
        bool
+
+config HAVE_KVM_CPU_RELAX_INTERCEPT
+       bool
-- 
cgit v1.2.1


From 4c088493c8d07e4e27bad53a99dcfdc14cdf45f8 Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Date: Wed, 18 Jul 2012 19:07:46 +0530
Subject: KVM: Note down when cpu relax intercepted or pause loop exited

Noting pause loop exited vcpu or cpu relax intercepted helps in
filtering right candidate to yield. Wrong selection of vcpu;
i.e., a vcpu that just did a pl-exit or cpu relax intercepted may
contribute to performance degradation.

Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com> # on s390x
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 68dda513cd72..0892b75eeedd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -239,6 +239,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	}
 	vcpu->run = page_address(page);
 
+	kvm_vcpu_set_in_spin_loop(vcpu, false);
+	kvm_vcpu_set_dy_eligible(vcpu, false);
+
 	r = kvm_arch_vcpu_init(vcpu);
 	if (r < 0)
 		goto fail_free_run;
@@ -1585,6 +1588,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 	int pass;
 	int i;
 
+	kvm_vcpu_set_in_spin_loop(me, true);
 	/*
 	 * We boost the priority of a VCPU that is runnable but not
 	 * currently running, because it got preempted by something
@@ -1610,6 +1614,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 			}
 		}
 	}
+	kvm_vcpu_set_in_spin_loop(me, false);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 
-- 
cgit v1.2.1


From 06e48c510aa37f6e791602e6420422ea7071fe94 Mon Sep 17 00:00:00 2001
From: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Date: Thu, 19 Jul 2012 15:17:52 +0530
Subject: KVM: Choose better candidate for directed yield

Currently, on a large vcpu guests, there is a high probability of
yielding to the same vcpu who had recently done a pause-loop exit or
cpu relax intercepted. Such a yield can lead to the vcpu spinning
again and hence degrade the performance.

The patchset keeps track of the pause loop exit/cpu relax interception
and gives chance to a vcpu which:
 (a) Has not done pause loop exit or cpu relax intercepted at all
     (probably he is preempted lock-holder)
 (b) Was skipped in last iteration because it did pause loop exit or
     cpu relax intercepted, and probably has become eligible now
     (next eligible lock holder)

Signed-off-by: Raghavendra K T <raghavendra.kt@linux.vnet.ibm.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com> # on s390x
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0892b75eeedd..1e10ebe1a370 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1579,6 +1579,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
 
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+/*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * Most eligible candidate to yield is decided by following heuristics:
+ *
+ *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
+ *  (preempted lock holder), indicated by @in_spin_loop.
+ *  Set at the beiginning and cleared at the end of interception/PLE handler.
+ *
+ *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
+ *  chance last time (mostly it has become eligible now since we have probably
+ *  yielded to lockholder in last iteration. This is done by toggling
+ *  @dy_eligible each time a VCPU checked for eligibility.)
+ *
+ *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
+ *  to preempted lock-holder could result in wrong VCPU selection and CPU
+ *  burning. Giving priority for a potential lock-holder increases lock
+ *  progress.
+ *
+ *  Since algorithm is based on heuristics, accessing another VCPU data without
+ *  locking does not harm. It may result in trying to yield to  same VCPU, fail
+ *  and continue with next VCPU and so on.
+ */
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+	bool eligible;
+
+	eligible = !vcpu->spin_loop.in_spin_loop ||
+			(vcpu->spin_loop.in_spin_loop &&
+			 vcpu->spin_loop.dy_eligible);
+
+	if (vcpu->spin_loop.in_spin_loop)
+		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+
+	return eligible;
+}
+#endif
 void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 {
 	struct kvm *kvm = me->kvm;
@@ -1607,6 +1644,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 				continue;
 			if (waitqueue_active(&vcpu->wq))
 				continue;
+			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+				continue;
 			if (kvm_vcpu_yield_to(vcpu)) {
 				kvm->last_boosted_vcpu = i;
 				yielded = 1;
@@ -1615,6 +1654,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
 		}
 	}
 	kvm_vcpu_set_in_spin_loop(me, false);
+
+	/* Ensure vcpu is not eligible during next spinloop */
+	kvm_vcpu_set_dy_eligible(me, false);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
 
-- 
cgit v1.2.1


From 2b4b5af8f8e7296bc27c52023ab6bb8f53db3a2b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Thu, 26 Jul 2012 11:58:17 +0800
Subject: KVM: use kvm_release_page_clean to release the page

In kvm_async_pf_wakeup_all, it uses bad_page to generate broadcast wakeup,
and uses put_page to release bad_page, the work depends on the fact that
bad_page is the normal page. But we will use the error code instead of
bad_page, so use kvm_release_page_clean to release the page which will
release the error code properly

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/async_pf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 74268b4c2ee1..ebae24b62c90 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -112,7 +112,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
 				   typeof(*work), link);
 		list_del(&work->link);
 		if (work->page)
-			put_page(work->page);
+			kvm_release_page_clean(work->page);
 		kmem_cache_free(async_pf_cache, work);
 	}
 	spin_unlock(&vcpu->async_pf.lock);
@@ -139,7 +139,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 		list_del(&work->queue);
 		vcpu->async_pf.queued--;
 		if (work->page)
-			put_page(work->page);
+			kvm_release_page_clean(work->page);
 		kmem_cache_free(async_pf_cache, work);
 	}
 }
-- 
cgit v1.2.1


From a2766325cf9f9e36d1225145f1ce1b066f001837 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Thu, 26 Jul 2012 11:58:59 +0800
Subject: KVM: remove dummy pages

Currently, kvm allocates some pages and use them as error indicators,
it wastes memory and is not good for scalability

Base on Avi's suggestion, we use the error codes instead of these pages
to indicate the error conditions

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/async_pf.c |   3 +-
 virt/kvm/kvm_main.c | 121 +++++++++++++++++++++-------------------------------
 2 files changed, 50 insertions(+), 74 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index ebae24b62c90..79722782d9d7 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
 	if (!work)
 		return -ENOMEM;
 
-	work->page = bad_page;
-	get_page(bad_page);
+	work->page = get_bad_page();
 	INIT_LIST_HEAD(&work->queue); /* for list_del to work */
 
 	spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0014ee99dc7f..de89497fe4c7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -100,17 +100,11 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);
 
 static bool largepages_enabled = true;
 
-struct page *bad_page;
-static pfn_t bad_pfn;
-
-static struct page *hwpoison_page;
-static pfn_t hwpoison_pfn;
-
-static struct page *fault_page;
-static pfn_t fault_pfn;
-
-inline int kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_mmio_pfn(pfn_t pfn)
 {
+	if (is_error_pfn(pfn))
+		return false;
+
 	if (pfn_valid(pfn)) {
 		int reserved;
 		struct page *tail = pfn_to_page(pfn);
@@ -939,34 +933,55 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
 int is_error_page(struct page *page)
 {
-	return page == bad_page || page == hwpoison_page || page == fault_page;
+	return IS_ERR(page);
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
 int is_error_pfn(pfn_t pfn)
 {
-	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
+	return IS_ERR_VALUE(pfn);
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
+static pfn_t get_bad_pfn(void)
+{
+	return -ENOENT;
+}
+
+pfn_t get_fault_pfn(void)
+{
+	return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(get_fault_pfn);
+
+static pfn_t get_hwpoison_pfn(void)
+{
+	return -EHWPOISON;
+}
+
 int is_hwpoison_pfn(pfn_t pfn)
 {
-	return pfn == hwpoison_pfn;
+	return pfn == -EHWPOISON;
 }
 EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
 
 int is_noslot_pfn(pfn_t pfn)
 {
-	return pfn == bad_pfn;
+	return pfn == -ENOENT;
 }
 EXPORT_SYMBOL_GPL(is_noslot_pfn);
 
 int is_invalid_pfn(pfn_t pfn)
 {
-	return pfn == hwpoison_pfn || pfn == fault_pfn;
+	return !is_noslot_pfn(pfn) && is_error_pfn(pfn);
 }
 EXPORT_SYMBOL_GPL(is_invalid_pfn);
 
+struct page *get_bad_page(void)
+{
+	return ERR_PTR(-ENOENT);
+}
+
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;
@@ -1038,13 +1053,6 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
-pfn_t get_fault_pfn(void)
-{
-	get_page(fault_page);
-	return fault_pfn;
-}
-EXPORT_SYMBOL_GPL(get_fault_pfn);
-
 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 	unsigned long start, int write, struct page **page)
 {
@@ -1122,8 +1130,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 		if (npages == -EHWPOISON ||
 			(!async && check_user_page_hwpoison(addr))) {
 			up_read(&current->mm->mmap_sem);
-			get_page(hwpoison_page);
-			return page_to_pfn(hwpoison_page);
+			return get_hwpoison_pfn();
 		}
 
 		vma = find_vma_intersection(current->mm, addr, addr+1);
@@ -1161,10 +1168,8 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
 		*async = false;
 
 	addr = gfn_to_hva(kvm, gfn);
-	if (kvm_is_error_hva(addr)) {
-		get_page(bad_page);
-		return page_to_pfn(bad_page);
-	}
+	if (kvm_is_error_hva(addr))
+		return get_bad_pfn();
 
 	return hva_to_pfn(addr, atomic, async, write_fault, writable);
 }
@@ -1218,37 +1223,45 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
 }
 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
 
+static struct page *kvm_pfn_to_page(pfn_t pfn)
+{
+	WARN_ON(kvm_is_mmio_pfn(pfn));
+
+	if (is_error_pfn(pfn) || kvm_is_mmio_pfn(pfn))
+		return get_bad_page();
+
+	return pfn_to_page(pfn);
+}
+
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 {
 	pfn_t pfn;
 
 	pfn = gfn_to_pfn(kvm, gfn);
-	if (!kvm_is_mmio_pfn(pfn))
-		return pfn_to_page(pfn);
-
-	WARN_ON(kvm_is_mmio_pfn(pfn));
 
-	get_page(bad_page);
-	return bad_page;
+	return kvm_pfn_to_page(pfn);
 }
 
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_page_clean(struct page *page)
 {
-	kvm_release_pfn_clean(page_to_pfn(page));
+	if (!is_error_page(page))
+		kvm_release_pfn_clean(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (!kvm_is_mmio_pfn(pfn))
+	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
 
 void kvm_release_page_dirty(struct page *page)
 {
+	WARN_ON(is_error_page(page));
+
 	kvm_release_pfn_dirty(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
@@ -2771,33 +2784,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 	if (r)
 		goto out_fail;
 
-	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-	if (bad_page == NULL) {
-		r = -ENOMEM;
-		goto out;
-	}
-
-	bad_pfn = page_to_pfn(bad_page);
-
-	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-	if (hwpoison_page == NULL) {
-		r = -ENOMEM;
-		goto out_free_0;
-	}
-
-	hwpoison_pfn = page_to_pfn(hwpoison_page);
-
-	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
-	if (fault_page == NULL) {
-		r = -ENOMEM;
-		goto out_free_0;
-	}
-
-	fault_pfn = page_to_pfn(fault_page);
-
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out_free_0;
@@ -2872,12 +2858,6 @@ out_free_1:
 out_free_0a:
 	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
-	if (fault_page)
-		__free_page(fault_page);
-	if (hwpoison_page)
-		__free_page(hwpoison_page);
-	__free_page(bad_page);
-out:
 	kvm_arch_exit();
 out_fail:
 	return r;
@@ -2897,8 +2877,5 @@ void kvm_exit(void)
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
 	free_cpumask_var(cpus_hardware_enabled);
-	__free_page(fault_page);
-	__free_page(hwpoison_page);
-	__free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-- 
cgit v1.2.1


From 23d43cf998275bc97437931c0cdee1df2c1aa3ca Mon Sep 17 00:00:00 2001
From: Christoffer Dall <c.dall@virtualopensystems.com>
Date: Tue, 24 Jul 2012 08:51:20 -0400
Subject: KVM: Move KVM_IRQ_LINE to arch-generic code

Handle KVM_IRQ_LINE and KVM_IRQ_LINE_STATUS in the generic
kvm_vm_ioctl() function and call into kvm_vm_ioctl_irq_line().

This is even more relevant when KVM/ARM also uses this ioctl.

Signed-off-by: Christoffer Dall <c.dall@virtualopensystems.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index de89497fe4c7..bcf973ec98ff 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2148,6 +2148,29 @@ static long kvm_vm_ioctl(struct file *filp,
 		r = kvm_send_userspace_msi(kvm, &msi);
 		break;
 	}
+#endif
+#ifdef __KVM_HAVE_IRQ_LINE
+	case KVM_IRQ_LINE_STATUS:
+	case KVM_IRQ_LINE: {
+		struct kvm_irq_level irq_event;
+
+		r = -EFAULT;
+		if (copy_from_user(&irq_event, argp, sizeof irq_event))
+			goto out;
+
+		r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+		if (r)
+			goto out;
+
+		r = -EFAULT;
+		if (ioctl == KVM_IRQ_LINE_STATUS) {
+			if (copy_to_user(argp, &irq_event, sizeof irq_event))
+				goto out;
+		}
+
+		r = 0;
+		break;
+	}
 #endif
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
-- 
cgit v1.2.1


From d89cc617b954aff4030fce178f7d86f59aaf713d Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 1 Aug 2012 18:03:28 +0900
Subject: KVM: Push rmap into kvm_arch_memory_slot

Two reasons:
 - x86 can integrate rmap and rmap_pde and remove heuristics in
   __gfn_to_rmap().
 - Some architectures do not need rmap.

Since rmap is one of the most memory consuming stuff in KVM, ppc'd
better restrict the allocation to Book3S HV.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bcf973ec98ff..14ec567816ab 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -550,16 +550,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
 				  struct kvm_memory_slot *dont)
 {
-	if (!dont || free->rmap != dont->rmap)
-		vfree(free->rmap);
-
 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
 		kvm_destroy_dirty_bitmap(free);
 
 	kvm_arch_free_memslot(free, dont);
 
 	free->npages = 0;
-	free->rmap = NULL;
 }
 
 void kvm_free_physmem(struct kvm *kvm)
@@ -768,11 +764,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	if (npages && !old.npages) {
 		new.user_alloc = user_alloc;
 		new.userspace_addr = mem->userspace_addr;
-#ifndef CONFIG_S390
-		new.rmap = vzalloc(npages * sizeof(*new.rmap));
-		if (!new.rmap)
-			goto out_free;
-#endif /* not defined CONFIG_S390 */
+
 		if (kvm_arch_create_memslot(&new, npages))
 			goto out_free;
 	}
@@ -831,7 +823,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	/* actual memory is freed via old in kvm_free_physmem_slot below */
 	if (!npages) {
-		new.rmap = NULL;
 		new.dirty_bitmap = NULL;
 		memset(&new.arch, 0, sizeof(new.arch));
 	}
-- 
cgit v1.2.1


From 16b854c889673d988285719bc020543a730f9ac6 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 3 Aug 2012 15:36:52 +0800
Subject: KVM: iommu: fix releasing unmapped page

There are two bugs:
- the 'error page' is forgot to be released
  [ it is unneeded after commit a2766325cf9f9, for backport, we
    still do kvm_release_pfn_clean for the error pfn ]

- guest pages are always released regardless of the unmapped page
  (e,g, caused by hwpoison)

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/iommu.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'virt')

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index c03f1fb26701..6a67bea4019c 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -107,6 +107,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 		 */
 		pfn = kvm_pin_pages(slot, gfn, page_size);
 		if (is_error_pfn(pfn)) {
+			kvm_release_pfn_clean(pfn);
 			gfn += 1;
 			continue;
 		}
@@ -300,6 +301,12 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 
 		/* Get physical address */
 		phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
+
+		if (!phys) {
+			gfn++;
+			continue;
+		}
+
 		pfn  = phys >> PAGE_SHIFT;
 
 		/* Unmap address from IO address space */
-- 
cgit v1.2.1


From 6c8ee57be9350c5c2cafdd6a99d0462d528676e2 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 3 Aug 2012 15:37:54 +0800
Subject: KVM: introduce KVM_PFN_ERR_FAULT

After that, the exported and un-inline function, get_fault_pfn,
can be removed

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 14ec567816ab..ef0491645a10 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -939,12 +939,6 @@ static pfn_t get_bad_pfn(void)
 	return -ENOENT;
 }
 
-pfn_t get_fault_pfn(void)
-{
-	return -EFAULT;
-}
-EXPORT_SYMBOL_GPL(get_fault_pfn);
-
 static pfn_t get_hwpoison_pfn(void)
 {
 	return -EHWPOISON;
@@ -1115,7 +1109,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 		struct vm_area_struct *vma;
 
 		if (atomic)
-			return get_fault_pfn();
+			return KVM_PFN_ERR_FAULT;
 
 		down_read(&current->mm->mmap_sem);
 		if (npages == -EHWPOISON ||
@@ -1127,7 +1121,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 		vma = find_vma_intersection(current->mm, addr, addr+1);
 
 		if (vma == NULL)
-			pfn = get_fault_pfn();
+			pfn = KVM_PFN_ERR_FAULT;
 		else if ((vma->vm_flags & VM_PFNMAP)) {
 			pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
 				vma->vm_pgoff;
@@ -1135,7 +1129,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 		} else {
 			if (async && (vma->vm_flags & VM_WRITE))
 				*async = true;
-			pfn = get_fault_pfn();
+			pfn = KVM_PFN_ERR_FAULT;
 		}
 		up_read(&current->mm->mmap_sem);
 	} else
-- 
cgit v1.2.1


From e6c1502b3f933ace20c711ce34ab696f5a67086d Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 3 Aug 2012 15:38:36 +0800
Subject: KVM: introduce KVM_PFN_ERR_HWPOISON

Then, get_hwpoison_pfn and is_hwpoison_pfn can be removed

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ef0491645a10..7fce2d5787ae 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -939,17 +939,6 @@ static pfn_t get_bad_pfn(void)
 	return -ENOENT;
 }
 
-static pfn_t get_hwpoison_pfn(void)
-{
-	return -EHWPOISON;
-}
-
-int is_hwpoison_pfn(pfn_t pfn)
-{
-	return pfn == -EHWPOISON;
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-
 int is_noslot_pfn(pfn_t pfn)
 {
 	return pfn == -ENOENT;
@@ -1115,7 +1104,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 		if (npages == -EHWPOISON ||
 			(!async && check_user_page_hwpoison(addr))) {
 			up_read(&current->mm->mmap_sem);
-			return get_hwpoison_pfn();
+			return KVM_PFN_ERR_HWPOISON;
 		}
 
 		vma = find_vma_intersection(current->mm, addr, addr+1);
-- 
cgit v1.2.1


From 950e95097b1c6573ef5e21061ccb56964278c45b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 3 Aug 2012 15:39:19 +0800
Subject: KVM: introduce KVM_PFN_ERR_BAD

Then, remove get_bad_pfn

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7fce2d5787ae..988b2339d846 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -934,11 +934,6 @@ int is_error_pfn(pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
-static pfn_t get_bad_pfn(void)
-{
-	return -ENOENT;
-}
-
 int is_noslot_pfn(pfn_t pfn)
 {
 	return pfn == -ENOENT;
@@ -1143,7 +1138,7 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
 
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr))
-		return get_bad_pfn();
+		return KVM_PFN_ERR_BAD;
 
 	return hva_to_pfn(addr, atomic, async, write_fault, writable);
 }
-- 
cgit v1.2.1


From 83f09228d068911ac8797ae8d6febef886698936 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 3 Aug 2012 15:39:59 +0800
Subject: KVM: inline is_*_pfn functions

These functions are exported and can not inline, move them
to kvm_host.h to eliminate the overload of function call

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 988b2339d846..eb73e5f13678 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -928,24 +928,6 @@ int is_error_page(struct page *page)
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
-int is_error_pfn(pfn_t pfn)
-{
-	return IS_ERR_VALUE(pfn);
-}
-EXPORT_SYMBOL_GPL(is_error_pfn);
-
-int is_noslot_pfn(pfn_t pfn)
-{
-	return pfn == -ENOENT;
-}
-EXPORT_SYMBOL_GPL(is_noslot_pfn);
-
-int is_invalid_pfn(pfn_t pfn)
-{
-	return !is_noslot_pfn(pfn) && is_error_pfn(pfn);
-}
-EXPORT_SYMBOL_GPL(is_invalid_pfn);
-
 struct page *get_bad_page(void)
 {
 	return ERR_PTR(-ENOENT);
-- 
cgit v1.2.1


From 6cede2e6794be6b0649f62d3681e0c4aff5a9270 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 3 Aug 2012 15:41:22 +0800
Subject: KVM: introduce KVM_ERR_PTR_BAD_PAGE

It is used to eliminate the overload of function call and cleanup
the code

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/async_pf.c |  2 +-
 virt/kvm/kvm_main.c | 13 +------------
 2 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 79722782d9d7..56f553391896 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -203,7 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu)
 	if (!work)
 		return -ENOMEM;
 
-	work->page = get_bad_page();
+	work->page = KVM_ERR_PTR_BAD_PAGE;
 	INIT_LIST_HEAD(&work->queue); /* for list_del to work */
 
 	spin_lock(&vcpu->async_pf.lock);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index eb73e5f13678..93d3c6e063c8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -922,17 +922,6 @@ void kvm_disable_largepages(void)
 }
 EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
-int is_error_page(struct page *page)
-{
-	return IS_ERR(page);
-}
-EXPORT_SYMBOL_GPL(is_error_page);
-
-struct page *get_bad_page(void)
-{
-	return ERR_PTR(-ENOENT);
-}
-
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;
@@ -1179,7 +1168,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
 	WARN_ON(kvm_is_mmio_pfn(pfn));
 
 	if (is_error_pfn(pfn) || kvm_is_mmio_pfn(pfn))
-		return get_bad_page();
+		return KVM_ERR_PTR_BAD_PAGE;
 
 	return pfn_to_page(pfn);
 }
-- 
cgit v1.2.1


From cb9aaa30b133574b646d9d4766ef08a843211393 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 3 Aug 2012 15:42:10 +0800
Subject: KVM: do not release the error pfn

After commit a2766325cf9f9, the error pfn is replaced by the
error code, it need not be released anymore

[ The patch has been compiling tested for powerpc ]

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/iommu.c    |  1 -
 virt/kvm/kvm_main.c | 14 ++++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 6a67bea4019c..037cb6730e68 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -107,7 +107,6 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 		 */
 		pfn = kvm_pin_pages(slot, gfn, page_size);
 		if (is_error_pfn(pfn)) {
-			kvm_release_pfn_clean(pfn);
 			gfn += 1;
 			continue;
 		}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 93d3c6e063c8..eafba99d1070 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -102,9 +102,6 @@ static bool largepages_enabled = true;
 
 bool kvm_is_mmio_pfn(pfn_t pfn)
 {
-	if (is_error_pfn(pfn))
-		return false;
-
 	if (pfn_valid(pfn)) {
 		int reserved;
 		struct page *tail = pfn_to_page(pfn);
@@ -1165,10 +1162,13 @@ EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
 
 static struct page *kvm_pfn_to_page(pfn_t pfn)
 {
-	WARN_ON(kvm_is_mmio_pfn(pfn));
+	if (is_error_pfn(pfn))
+		return KVM_ERR_PTR_BAD_PAGE;
 
-	if (is_error_pfn(pfn) || kvm_is_mmio_pfn(pfn))
+	if (kvm_is_mmio_pfn(pfn)) {
+		WARN_ON(1);
 		return KVM_ERR_PTR_BAD_PAGE;
+	}
 
 	return pfn_to_page(pfn);
 }
@@ -1193,7 +1193,9 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
+	WARN_ON(is_error_pfn(pfn));
+
+	if (!kvm_is_mmio_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
-- 
cgit v1.2.1


From 32cad84f44d186654492f1a50a1424c8906ccbd9 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Fri, 3 Aug 2012 15:42:52 +0800
Subject: KVM: do not release the error page

After commit a2766325cf9f9, the error page is replaced by the
error code, it need not be released anymore

[ The patch has been compiling tested for powerpc ]

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/async_pf.c | 4 ++--
 virt/kvm/kvm_main.c | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 56f553391896..ea475cd03511 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -111,7 +111,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
 			list_entry(vcpu->async_pf.done.next,
 				   typeof(*work), link);
 		list_del(&work->link);
-		if (work->page)
+		if (!is_error_page(work->page))
 			kvm_release_page_clean(work->page);
 		kmem_cache_free(async_pf_cache, work);
 	}
@@ -138,7 +138,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
 
 		list_del(&work->queue);
 		vcpu->async_pf.queued--;
-		if (work->page)
+		if (!is_error_page(work->page))
 			kvm_release_page_clean(work->page);
 		kmem_cache_free(async_pf_cache, work);
 	}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index eafba99d1070..a2e85af847c1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1186,8 +1186,9 @@ EXPORT_SYMBOL_GPL(gfn_to_page);
 
 void kvm_release_page_clean(struct page *page)
 {
-	if (!is_error_page(page))
-		kvm_release_pfn_clean(page_to_pfn(page));
+	WARN_ON(is_error_page(page));
+
+	kvm_release_pfn_clean(page_to_pfn(page));
 }
 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
-- 
cgit v1.2.1


From 28a6fdabb3ea775d3d707afd9d2728b3ced2c34d Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 14 Aug 2012 19:20:28 +0300
Subject: KVM: x86: drop parameter validation in ioapic/pic

We validate irq pin number when routing is setup, so
code handling illegal irq # in pic and ioapic on each injection
is never called.
Drop it, replace with BUG_ON to catch out of bounds access bugs.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/ioapic.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ef61d529a6c4..cfb7e4d52dc2 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -197,28 +197,29 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
 	u32 old_irr;
 	u32 mask = 1 << irq;
 	union kvm_ioapic_redirect_entry entry;
-	int ret = 1;
+	int ret, irq_level;
+
+	BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
 
 	spin_lock(&ioapic->lock);
 	old_irr = ioapic->irr;
-	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
-		int irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
-						     irq_source_id, level);
-		entry = ioapic->redirtbl[irq];
-		irq_level ^= entry.fields.polarity;
-		if (!irq_level)
-			ioapic->irr &= ~mask;
-		else {
-			int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
-			ioapic->irr |= mask;
-			if ((edge && old_irr != ioapic->irr) ||
-			    (!edge && !entry.fields.remote_irr))
-				ret = ioapic_service(ioapic, irq);
-			else
-				ret = 0; /* report coalesced interrupt */
-		}
-		trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+	irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+					 irq_source_id, level);
+	entry = ioapic->redirtbl[irq];
+	irq_level ^= entry.fields.polarity;
+	if (!irq_level) {
+		ioapic->irr &= ~mask;
+		ret = 1;
+	} else {
+		int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+		ioapic->irr |= mask;
+		if ((edge && old_irr != ioapic->irr) ||
+		    (!edge && !entry.fields.remote_irr))
+			ret = ioapic_service(ioapic, irq);
+		else
+			ret = 0; /* report coalesced interrupt */
 	}
+	trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
 	spin_unlock(&ioapic->lock);
 
 	return ret;
-- 
cgit v1.2.1


From a50d64d6590f470b712758598a077ef4fd0fc40a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 21 Aug 2012 10:58:13 +0800
Subject: KVM: fix missing check for memslot flags

Check flags when memslot is registered from userspace as Avi's suggestion

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a2e85af847c1..7b94d70a323f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -678,6 +678,14 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
 	slots->generation++;
 }
 
+static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+{
+	if (mem->flags & ~KVM_MEM_LOG_DIRTY_PAGES)
+		return -EINVAL;
+
+	return 0;
+}
+
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
@@ -698,6 +706,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	struct kvm_memory_slot old, new;
 	struct kvm_memslots *slots, *old_memslots;
 
+	r = check_memory_region_flags(mem);
+	if (r)
+		goto out;
+
 	r = -EINVAL;
 	/* General sanity checks */
 	if (mem->memory_size & (PAGE_SIZE - 1))
-- 
cgit v1.2.1


From 037d92dc5d4691ae7cf44699c55ca83b1b441992 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 21 Aug 2012 10:59:12 +0800
Subject: KVM: introduce gfn_to_pfn_memslot_atomic

It can instead of hva_to_pfn_atomic

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 7b94d70a323f..543f9b7e5aa2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1102,12 +1102,6 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 	return pfn;
 }
 
-pfn_t hva_to_pfn_atomic(unsigned long addr)
-{
-	return hva_to_pfn(addr, true, NULL, true, NULL);
-}
-EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
-
 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
 			  bool write_fault, bool *writable)
 {
@@ -1155,6 +1149,14 @@ pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 	return hva_to_pfn(addr, false, NULL, true, NULL);
 }
 
+pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
+
+	return hva_to_pfn(addr, true, NULL, true, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
+
 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
 								  int nr_pages)
 {
-- 
cgit v1.2.1


From 86ab8cffb498077e926957f099b064db3751c1de Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 21 Aug 2012 10:59:53 +0800
Subject: KVM: introduce gfn_to_hva_read/kvm_read_hva/kvm_read_hva_atomic

This set of functions is only used to read data from host space, in the
later patch, we will only get a readonly hva in gfn_to_hva_read, and
the function name is a good hint to let gfn_to_hva_read to pair with
kvm_read_hva()/kvm_read_hva_atomic()

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 543f9b7e5aa2..6e3ea15397d4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1002,6 +1002,25 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_hva);
 
+/*
+ * The hva returned by this function is only allowed to be read.
+ * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
+ */
+static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
+{
+	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
+}
+
+static int kvm_read_hva(void *data, void __user *hva, int len)
+{
+	return __copy_from_user(data, hva, len);
+}
+
+static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
+{
+	return __copy_from_user_inatomic(data, hva, len);
+}
+
 int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
 	unsigned long start, int write, struct page **page)
 {
@@ -1274,10 +1293,10 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
 	int r;
 	unsigned long addr;
 
-	addr = gfn_to_hva(kvm, gfn);
+	addr = gfn_to_hva_read(kvm, gfn);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
-	r = __copy_from_user(data, (void __user *)addr + offset, len);
+	r = kvm_read_hva(data, (void __user *)addr + offset, len);
 	if (r)
 		return -EFAULT;
 	return 0;
@@ -1312,11 +1331,11 @@ int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	int offset = offset_in_page(gpa);
 
-	addr = gfn_to_hva(kvm, gfn);
+	addr = gfn_to_hva_read(kvm, gfn);
 	if (kvm_is_error_hva(addr))
 		return -EFAULT;
 	pagefault_disable();
-	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
+	r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
 	pagefault_enable();
 	if (r)
 		return -EFAULT;
-- 
cgit v1.2.1


From 2fc843117d64b31c20d36c6b625c0626c9a41a41 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 21 Aug 2012 11:00:22 +0800
Subject: KVM: reorganize hva_to_pfn

We do too many things in hva_to_pfn, this patch reorganize the code,
let it be better readable

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 159 ++++++++++++++++++++++++++++++++--------------------
 1 file changed, 97 insertions(+), 62 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6e3ea15397d4..aa4a38ad9d9a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1041,83 +1041,118 @@ static inline int check_user_page_hwpoison(unsigned long addr)
 	return rc == -EHWPOISON;
 }
 
-static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
-			bool write_fault, bool *writable)
+/*
+ * The atomic path to get the writable pfn which will be stored in @pfn,
+ * true indicates success, otherwise false is returned.
+ */
+static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
+			    bool write_fault, bool *writable, pfn_t *pfn)
 {
 	struct page *page[1];
-	int npages = 0;
-	pfn_t pfn;
+	int npages;
 
-	/* we can do it either atomically or asynchronously, not both */
-	BUG_ON(atomic && async);
+	if (!(async || atomic))
+		return false;
 
-	BUG_ON(!write_fault && !writable);
+	npages = __get_user_pages_fast(addr, 1, 1, page);
+	if (npages == 1) {
+		*pfn = page_to_pfn(page[0]);
 
-	if (writable)
-		*writable = true;
+		if (writable)
+			*writable = true;
+		return true;
+	}
+
+	return false;
+}
 
-	if (atomic || async)
-		npages = __get_user_pages_fast(addr, 1, 1, page);
+/*
+ * The slow path to get the pfn of the specified host virtual address,
+ * 1 indicates success, -errno is returned if error is detected.
+ */
+static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
+			   bool *writable, pfn_t *pfn)
+{
+	struct page *page[1];
+	int npages = 0;
 
-	if (unlikely(npages != 1) && !atomic) {
-		might_sleep();
+	might_sleep();
 
-		if (writable)
-			*writable = write_fault;
-
-		if (async) {
-			down_read(&current->mm->mmap_sem);
-			npages = get_user_page_nowait(current, current->mm,
-						     addr, write_fault, page);
-			up_read(&current->mm->mmap_sem);
-		} else
-			npages = get_user_pages_fast(addr, 1, write_fault,
-						     page);
-
-		/* map read fault as writable if possible */
-		if (unlikely(!write_fault) && npages == 1) {
-			struct page *wpage[1];
-
-			npages = __get_user_pages_fast(addr, 1, 1, wpage);
-			if (npages == 1) {
-				*writable = true;
-				put_page(page[0]);
-				page[0] = wpage[0];
-			}
-			npages = 1;
+	if (writable)
+		*writable = write_fault;
+
+	if (async) {
+		down_read(&current->mm->mmap_sem);
+		npages = get_user_page_nowait(current, current->mm,
+					      addr, write_fault, page);
+		up_read(&current->mm->mmap_sem);
+	} else
+		npages = get_user_pages_fast(addr, 1, write_fault,
+					     page);
+	if (npages != 1)
+		return npages;
+
+	/* map read fault as writable if possible */
+	if (unlikely(!write_fault)) {
+		struct page *wpage[1];
+
+		npages = __get_user_pages_fast(addr, 1, 1, wpage);
+		if (npages == 1) {
+			*writable = true;
+			put_page(page[0]);
+			page[0] = wpage[0];
 		}
+
+		npages = 1;
 	}
+	*pfn = page_to_pfn(page[0]);
+	return npages;
+}
+
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+			bool write_fault, bool *writable)
+{
+	struct vm_area_struct *vma;
+	pfn_t pfn = 0;
+	int npages;
 
-	if (unlikely(npages != 1)) {
-		struct vm_area_struct *vma;
+	/* we can do it either atomically or asynchronously, not both */
+	BUG_ON(atomic && async);
 
-		if (atomic)
-			return KVM_PFN_ERR_FAULT;
+	BUG_ON(!write_fault && !writable);
 
-		down_read(&current->mm->mmap_sem);
-		if (npages == -EHWPOISON ||
-			(!async && check_user_page_hwpoison(addr))) {
-			up_read(&current->mm->mmap_sem);
-			return KVM_PFN_ERR_HWPOISON;
-		}
+	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
+		return pfn;
 
-		vma = find_vma_intersection(current->mm, addr, addr+1);
-
-		if (vma == NULL)
-			pfn = KVM_PFN_ERR_FAULT;
-		else if ((vma->vm_flags & VM_PFNMAP)) {
-			pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
-				vma->vm_pgoff;
-			BUG_ON(!kvm_is_mmio_pfn(pfn));
-		} else {
-			if (async && (vma->vm_flags & VM_WRITE))
-				*async = true;
-			pfn = KVM_PFN_ERR_FAULT;
-		}
-		up_read(&current->mm->mmap_sem);
-	} else
-		pfn = page_to_pfn(page[0]);
+	if (atomic)
+		return KVM_PFN_ERR_FAULT;
 
+	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
+	if (npages == 1)
+		return pfn;
+
+	down_read(&current->mm->mmap_sem);
+	if (npages == -EHWPOISON ||
+	      (!async && check_user_page_hwpoison(addr))) {
+		pfn = KVM_PFN_ERR_HWPOISON;
+		goto exit;
+	}
+
+	vma = find_vma_intersection(current->mm, addr, addr + 1);
+
+	if (vma == NULL)
+		pfn = KVM_PFN_ERR_FAULT;
+	else if ((vma->vm_flags & VM_PFNMAP)) {
+		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+			vma->vm_pgoff;
+		BUG_ON(!kvm_is_mmio_pfn(pfn));
+	} else {
+		if (async && (vma->vm_flags & VM_WRITE))
+			*async = true;
+		pfn = KVM_PFN_ERR_FAULT;
+	}
+exit:
+	up_read(&current->mm->mmap_sem);
 	return pfn;
 }
 
-- 
cgit v1.2.1


From 12ce13fea925ab629f2281be2b9b01cf647a86cd Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 21 Aug 2012 11:00:49 +0800
Subject: KVM: use 'writable' as a hint to map writable pfn

In current code, we always map writable pfn for the read fault, in order
to support readonly memslot, we map writable pfn only if 'writable'
is not NULL

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aa4a38ad9d9a..c89d1b5129e4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1054,6 +1054,14 @@ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
 	if (!(async || atomic))
 		return false;
 
+	/*
+	 * Fast pin a writable pfn only if it is a write fault request
+	 * or the caller allows to map a writable pfn for a read fault
+	 * request.
+	 */
+	if (!(write_fault || writable))
+		return false;
+
 	npages = __get_user_pages_fast(addr, 1, 1, page);
 	if (npages == 1) {
 		*pfn = page_to_pfn(page[0]);
@@ -1093,7 +1101,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 		return npages;
 
 	/* map read fault as writable if possible */
-	if (unlikely(!write_fault)) {
+	if (unlikely(!write_fault) && writable) {
 		struct page *wpage[1];
 
 		npages = __get_user_pages_fast(addr, 1, 1, wpage);
@@ -1109,6 +1117,20 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 	return npages;
 }
 
+/*
+ * Pin guest page in memory and return its pfn.
+ * @addr: host virtual address which maps memory to the guest
+ * @atomic: whether this function can sleep
+ * @async: whether this function need to wait IO complete if the
+ *         host page is not in the memory
+ * @write_fault: whether we should get a writable host page
+ * @writable: whether it allows to map a writable host page for !@write_fault
+ *
+ * The function will map a writable host page for these two cases:
+ * 1): @write_fault = true
+ * 2): @write_fault = false && @writable, @writable will tell the caller
+ *     whether the mapping is writable.
+ */
 static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 			bool write_fault, bool *writable)
 {
-- 
cgit v1.2.1


From ca3a490c7de8472b514e2d635c052b1e0f8e1c9d Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 21 Aug 2012 11:01:50 +0800
Subject: KVM: introduce KVM_HVA_ERR_BAD

Then, remove bad_hva and inline kvm_is_error_hva

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c89d1b5129e4..e3e1658c491d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -931,17 +931,6 @@ void kvm_disable_largepages(void)
 }
 EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
-static inline unsigned long bad_hva(void)
-{
-	return PAGE_OFFSET;
-}
-
-int kvm_is_error_hva(unsigned long addr)
-{
-	return addr == bad_hva();
-}
-EXPORT_SYMBOL_GPL(kvm_is_error_hva);
-
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
 	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
@@ -988,7 +977,7 @@ static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
 				     gfn_t *nr_pages)
 {
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
-		return bad_hva();
+		return KVM_HVA_ERR_BAD;
 
 	if (nr_pages)
 		*nr_pages = slot->npages - (gfn - slot->base_gfn);
-- 
cgit v1.2.1


From 4d8b81abc47b83a1939e59df2fdb0e98dfe0eedd Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Date: Tue, 21 Aug 2012 11:02:51 +0800
Subject: KVM: introduce readonly memslot

In current code, if we map a readonly memory space from host to guest
and the page is not currently mapped in the host, we will get a fault
pfn and async is not allowed, then the vm will crash

We introduce readonly memory region to map ROM/ROMD to the guest, read access
is happy for readonly memslot, write access on readonly memslot will cause
KVM_EXIT_MMIO exit

Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 96 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 78 insertions(+), 18 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e3e1658c491d..3416f8a31f63 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -680,7 +680,13 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
 
 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
 {
-	if (mem->flags & ~KVM_MEM_LOG_DIRTY_PAGES)
+	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+
+#ifdef KVM_CAP_READONLY_MEM
+	valid_flags |= KVM_MEM_READONLY;
+#endif
+
+	if (mem->flags & ~valid_flags)
 		return -EINVAL;
 
 	return 0;
@@ -973,18 +979,45 @@ out:
 	return size;
 }
 
-static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
-				     gfn_t *nr_pages)
+static bool memslot_is_readonly(struct kvm_memory_slot *slot)
+{
+	return slot->flags & KVM_MEM_READONLY;
+}
+
+static unsigned long __gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+					  gfn_t gfn)
+{
+	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+}
+
+static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+				       gfn_t *nr_pages, bool write)
 {
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return KVM_HVA_ERR_BAD;
 
+	if (memslot_is_readonly(slot) && write)
+		return KVM_HVA_ERR_RO_BAD;
+
 	if (nr_pages)
 		*nr_pages = slot->npages - (gfn - slot->base_gfn);
 
-	return gfn_to_hva_memslot(slot, gfn);
+	return __gfn_to_hva_memslot(slot, gfn);
 }
 
+static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
+				     gfn_t *nr_pages)
+{
+	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
+}
+
+unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+				 gfn_t gfn)
+{
+	return gfn_to_hva_many(slot, gfn, NULL);
+}
+EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
+
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
@@ -997,7 +1030,7 @@ EXPORT_SYMBOL_GPL(gfn_to_hva);
  */
 static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
 {
-	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
+	return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
 }
 
 static int kvm_read_hva(void *data, void __user *hva, int len)
@@ -1106,6 +1139,17 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 	return npages;
 }
 
+static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
+{
+	if (unlikely(!(vma->vm_flags & VM_READ)))
+		return false;
+
+	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
+		return false;
+
+	return true;
+}
+
 /*
  * Pin guest page in memory and return its pfn.
  * @addr: host virtual address which maps memory to the guest
@@ -1130,8 +1174,6 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 	/* we can do it either atomically or asynchronously, not both */
 	BUG_ON(atomic && async);
 
-	BUG_ON(!write_fault && !writable);
-
 	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
 		return pfn;
 
@@ -1158,7 +1200,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 			vma->vm_pgoff;
 		BUG_ON(!kvm_is_mmio_pfn(pfn));
 	} else {
-		if (async && (vma->vm_flags & VM_WRITE))
+		if (async && vma_is_valid(vma, write_fault))
 			*async = true;
 		pfn = KVM_PFN_ERR_FAULT;
 	}
@@ -1167,19 +1209,40 @@ exit:
 	return pfn;
 }
 
+static pfn_t
+__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
+		     bool *async, bool write_fault, bool *writable)
+{
+	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);
+
+	if (addr == KVM_HVA_ERR_RO_BAD)
+		return KVM_PFN_ERR_RO_FAULT;
+
+	if (kvm_is_error_hva(addr))
+		return KVM_PFN_ERR_BAD;
+
+	/* Do not map writable pfn in the readonly memslot. */
+	if (writable && memslot_is_readonly(slot)) {
+		*writable = false;
+		writable = NULL;
+	}
+
+	return hva_to_pfn(addr, atomic, async, write_fault,
+			  writable);
+}
+
 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
 			  bool write_fault, bool *writable)
 {
-	unsigned long addr;
+	struct kvm_memory_slot *slot;
 
 	if (async)
 		*async = false;
 
-	addr = gfn_to_hva(kvm, gfn);
-	if (kvm_is_error_hva(addr))
-		return KVM_PFN_ERR_BAD;
+	slot = gfn_to_memslot(kvm, gfn);
 
-	return hva_to_pfn(addr, atomic, async, write_fault, writable);
+	return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
+				    writable);
 }
 
 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
@@ -1210,15 +1273,12 @@ EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
 
 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-	return hva_to_pfn(addr, false, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
 }
 
 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
 {
-	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
-
-	return hva_to_pfn(addr, true, NULL, true, NULL);
+	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
 }
 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
 
-- 
cgit v1.2.1


From 66a03505a7fcc70187319ef2318832f4d3c451a6 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Fri, 24 Aug 2012 16:50:28 +0800
Subject: KVM: PPC: book3s: fix build error caused by gfn_to_hva_memslot()

The build error was caused by that builtin functions are calling
the functions implemented in modules. This error was introduced by
commit 4d8b81abc4 ("KVM: introduce readonly memslot").

The patch fixes the build error by moving function __gfn_to_hva_memslot()
from kvm_main.c to kvm_host.h and making that "inline" so that the
builtin function (kvmppc_h_enter) can use that.

Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/kvm_main.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3416f8a31f63..6425906d7cec 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -984,12 +984,6 @@ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
 	return slot->flags & KVM_MEM_READONLY;
 }
 
-static unsigned long __gfn_to_hva_memslot(struct kvm_memory_slot *slot,
-					  gfn_t gfn)
-{
-	return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
-}
-
 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
 				       gfn_t *nr_pages, bool write)
 {
-- 
cgit v1.2.1


From 2df72e9bc4c505d8357012f2924589f3d16f9d44 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 24 Aug 2012 15:54:57 -0300
Subject: KVM: split kvm_arch_flush_shadow

Introducing kvm_arch_flush_shadow_memslot, to invalidate the
translations of a single memory slot.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6425906d7cec..a4bf05be5fea 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -408,7 +408,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 	int idx;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	kvm_arch_flush_shadow(kvm);
+	kvm_arch_flush_shadow_all(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
 }
 
@@ -582,7 +582,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 #else
-	kvm_arch_flush_shadow(kvm);
+	kvm_arch_flush_shadow_all(kvm);
 #endif
 	kvm_arch_destroy_vm(kvm);
 	kvm_free_physmem(kvm);
@@ -814,7 +814,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		 * 	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
 		 * 	- kvm_is_visible_gfn (mmu_check_roots)
 		 */
-		kvm_arch_flush_shadow(kvm);
+		kvm_arch_flush_shadow_memslot(kvm, slot);
 		kfree(old_memslots);
 	}
 
@@ -854,7 +854,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	 * mmio sptes.
 	 */
 	if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
-		kvm_arch_flush_shadow(kvm);
+		kvm_arch_flush_shadow_all(kvm);
 
 	kvm_free_physmem_slot(&old, &new);
 	kfree(old_memslots);
-- 
cgit v1.2.1


From 12d6e7538e2d418c08f082b1b44ffa5fb7270ed8 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 24 Aug 2012 15:54:58 -0300
Subject: KVM: perform an invalid memslot step for gpa base change

PPC must flush all translations before the new memory slot
is visible.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a4bf05be5fea..f41ea1262d51 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -791,7 +791,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		/* destroy any largepage mappings for dirty tracking */
 	}
 
-	if (!npages) {
+	if (!npages || base_gfn != old.base_gfn) {
 		struct kvm_memory_slot *slot;
 
 		r = -ENOMEM;
@@ -807,8 +807,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		old_memslots = kvm->memslots;
 		rcu_assign_pointer(kvm->memslots, slots);
 		synchronize_srcu_expedited(&kvm->srcu);
-		/* From this point no new shadow pages pointing to a deleted
-		 * memslot will be created.
+		/* From this point no new shadow pages pointing to a deleted,
+		 * or moved, memslot will be created.
 		 *
 		 * validation of sp->gfn happens in:
 		 * 	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
-- 
cgit v1.2.1


From 3b4dc3a031110753b9ba36432dbd21f989fcee56 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 28 Aug 2012 17:43:26 -0300
Subject: KVM: move postcommit flush to x86, as mmio sptes are x86 specific

Other arches do not need this.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

v2: fix incorrect deletion of mmio sptes on gpa move (noticed by Takuya)
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f41ea1262d51..4fe02d900810 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -849,13 +849,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
 
-	/*
-	 * If the new memory slot is created, we need to clear all
-	 * mmio sptes.
-	 */
-	if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
-		kvm_arch_flush_shadow_all(kvm);
-
 	kvm_free_physmem_slot(&old, &new);
 	kfree(old_memslots);
 
-- 
cgit v1.2.1


From 9fc77441e5e1bf80b794cc546d2243ee9f4afb75 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 16 Sep 2012 11:50:30 +0300
Subject: KVM: make processes waiting on vcpu mutex killable

vcpu mutex can be held for unlimited time so
taking it with mutex_lock on an ioctl is wrong:
one process could be passed a vcpu fd and
call this ioctl on the vcpu used by another process,
it will then be unkillable until the owner exits.

Call mutex_lock_killable instead and return status.
Note: mutex_lock_interruptible would be even nicer,
but I am not sure all users are prepared to handle EINTR
from these ioctls. They might misinterpret it as an error.

Cleanup paths expect a vcpu that can't be used by
any userspace so this will always succeed - catch bugs
by calling BUG_ON.

Catch callers that don't check return state by adding
__must_check.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/kvm_main.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4fe02d900810..cc3f6dc506e4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -131,11 +131,12 @@ bool kvm_is_mmio_pfn(pfn_t pfn)
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
-void vcpu_load(struct kvm_vcpu *vcpu)
+int vcpu_load(struct kvm_vcpu *vcpu)
 {
 	int cpu;
 
-	mutex_lock(&vcpu->mutex);
+	if (mutex_lock_killable(&vcpu->mutex))
+		return -EINTR;
 	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
 		/* The thread running this VCPU changed. */
 		struct pid *oldpid = vcpu->pid;
@@ -148,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu)
 	preempt_notifier_register(&vcpu->preempt_notifier);
 	kvm_arch_vcpu_load(vcpu, cpu);
 	put_cpu();
+	return 0;
 }
 
 void vcpu_put(struct kvm_vcpu *vcpu)
@@ -1891,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp,
 #endif
 
 
-	vcpu_load(vcpu);
+	r = vcpu_load(vcpu);
+	if (r)
+		return r;
 	switch (ioctl) {
 	case KVM_RUN:
 		r = -EINVAL;
-- 
cgit v1.2.1


From 1e08ec4a130e2745d96df169e67c58df98a07311 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 13 Sep 2012 17:19:24 +0300
Subject: KVM: optimize apic interrupt delivery

Most interrupt are delivered to only one vcpu. Use pre-build tables to
find interrupt destination instead of looping through all vcpus. In case
of logical mode loop only through vcpus in a logical cluster irq is sent
to.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/irq_comm.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'virt')

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 7118be0f2f2c..3ca89c451d6b 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -68,8 +68,13 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	struct kvm_vcpu *vcpu, *lowest = NULL;
 
 	if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
-			kvm_is_dm_lowest_prio(irq))
+			kvm_is_dm_lowest_prio(irq)) {
 		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+		irq->delivery_mode = APIC_DM_FIXED;
+	}
+
+	if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
+		return r;
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		if (!kvm_apic_present(vcpu))
-- 
cgit v1.2.1


From 7a84428af7ca6a847f058c9ff244a18a2664fd1b Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Fri, 21 Sep 2012 11:58:03 -0600
Subject: KVM: Add resampling irqfds for level triggered interrupts

To emulate level triggered interrupts, add a resample option to
KVM_IRQFD.  When specified, a new resamplefd is provided that notifies
the user when the irqchip has been resampled by the VM.  This may, for
instance, indicate an EOI.  Also in this mode, posting of an interrupt
through an irqfd only asserts the interrupt.  On resampling, the
interrupt is automatically de-asserted prior to user notification.
This enables level triggered interrupts to be posted and re-enabled
from vfio with no userspace intervention.

All resampling irqfds can make use of a single irq source ID, so we
reserve a new one for this interface.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/eventfd.c  | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 virt/kvm/irq_comm.c |   6 +++
 2 files changed, 152 insertions(+), 4 deletions(-)

(limited to 'virt')

diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 7d7e2aaffece..356965c9d107 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -43,6 +43,31 @@
  * --------------------------------------------------------------------
  */
 
+/*
+ * Resampling irqfds are a special variety of irqfds used to emulate
+ * level triggered interrupts.  The interrupt is asserted on eventfd
+ * trigger.  On acknowledgement through the irq ack notifier, the
+ * interrupt is de-asserted and userspace is notified through the
+ * resamplefd.  All resamplers on the same gsi are de-asserted
+ * together, so we don't need to track the state of each individual
+ * user.  We can also therefore share the same irq source ID.
+ */
+struct _irqfd_resampler {
+	struct kvm *kvm;
+	/*
+	 * List of resampling struct _irqfd objects sharing this gsi.
+	 * RCU list modified under kvm->irqfds.resampler_lock
+	 */
+	struct list_head list;
+	struct kvm_irq_ack_notifier notifier;
+	/*
+	 * Entry in list of kvm->irqfd.resampler_list.  Use for sharing
+	 * resamplers among irqfds on the same gsi.
+	 * Accessed and modified under kvm->irqfds.resampler_lock
+	 */
+	struct list_head link;
+};
+
 struct _irqfd {
 	/* Used for MSI fast-path */
 	struct kvm *kvm;
@@ -52,6 +77,12 @@ struct _irqfd {
 	/* Used for level IRQ fast-path */
 	int gsi;
 	struct work_struct inject;
+	/* The resampler used by this irqfd (resampler-only) */
+	struct _irqfd_resampler *resampler;
+	/* Eventfd notified on resample (resampler-only) */
+	struct eventfd_ctx *resamplefd;
+	/* Entry in list of irqfds for a resampler (resampler-only) */
+	struct list_head resampler_link;
 	/* Used for setup/shutdown */
 	struct eventfd_ctx *eventfd;
 	struct list_head list;
@@ -67,8 +98,58 @@ irqfd_inject(struct work_struct *work)
 	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
 	struct kvm *kvm = irqfd->kvm;
 
-	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
-	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+	if (!irqfd->resampler) {
+		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
+		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+	} else
+		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+			    irqfd->gsi, 1);
+}
+
+/*
+ * Since resampler irqfds share an IRQ source ID, we de-assert once
+ * then notify all of the resampler irqfds using this GSI.  We can't
+ * do multiple de-asserts or we risk racing with incoming re-asserts.
+ */
+static void
+irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
+{
+	struct _irqfd_resampler *resampler;
+	struct _irqfd *irqfd;
+
+	resampler = container_of(kian, struct _irqfd_resampler, notifier);
+
+	kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+		    resampler->notifier.gsi, 0);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link)
+		eventfd_signal(irqfd->resamplefd, 1);
+
+	rcu_read_unlock();
+}
+
+static void
+irqfd_resampler_shutdown(struct _irqfd *irqfd)
+{
+	struct _irqfd_resampler *resampler = irqfd->resampler;
+	struct kvm *kvm = resampler->kvm;
+
+	mutex_lock(&kvm->irqfds.resampler_lock);
+
+	list_del_rcu(&irqfd->resampler_link);
+	synchronize_rcu();
+
+	if (list_empty(&resampler->list)) {
+		list_del(&resampler->link);
+		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
+		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
+			    resampler->notifier.gsi, 0);
+		kfree(resampler);
+	}
+
+	mutex_unlock(&kvm->irqfds.resampler_lock);
 }
 
 /*
@@ -92,6 +173,11 @@ irqfd_shutdown(struct work_struct *work)
 	 */
 	flush_work_sync(&irqfd->inject);
 
+	if (irqfd->resampler) {
+		irqfd_resampler_shutdown(irqfd);
+		eventfd_ctx_put(irqfd->resamplefd);
+	}
+
 	/*
 	 * It is now safe to release the object's resources
 	 */
@@ -203,7 +289,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	struct kvm_irq_routing_table *irq_rt;
 	struct _irqfd *irqfd, *tmp;
 	struct file *file = NULL;
-	struct eventfd_ctx *eventfd = NULL;
+	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
 	int ret;
 	unsigned int events;
 
@@ -231,6 +317,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 
 	irqfd->eventfd = eventfd;
 
+	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
+		struct _irqfd_resampler *resampler;
+
+		resamplefd = eventfd_ctx_fdget(args->resamplefd);
+		if (IS_ERR(resamplefd)) {
+			ret = PTR_ERR(resamplefd);
+			goto fail;
+		}
+
+		irqfd->resamplefd = resamplefd;
+		INIT_LIST_HEAD(&irqfd->resampler_link);
+
+		mutex_lock(&kvm->irqfds.resampler_lock);
+
+		list_for_each_entry(resampler,
+				    &kvm->irqfds.resampler_list, list) {
+			if (resampler->notifier.gsi == irqfd->gsi) {
+				irqfd->resampler = resampler;
+				break;
+			}
+		}
+
+		if (!irqfd->resampler) {
+			resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+			if (!resampler) {
+				ret = -ENOMEM;
+				mutex_unlock(&kvm->irqfds.resampler_lock);
+				goto fail;
+			}
+
+			resampler->kvm = kvm;
+			INIT_LIST_HEAD(&resampler->list);
+			resampler->notifier.gsi = irqfd->gsi;
+			resampler->notifier.irq_acked = irqfd_resampler_ack;
+			INIT_LIST_HEAD(&resampler->link);
+
+			list_add(&resampler->link, &kvm->irqfds.resampler_list);
+			kvm_register_irq_ack_notifier(kvm,
+						      &resampler->notifier);
+			irqfd->resampler = resampler;
+		}
+
+		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
+		synchronize_rcu();
+
+		mutex_unlock(&kvm->irqfds.resampler_lock);
+	}
+
 	/*
 	 * Install our own custom wake-up handling so we are notified via
 	 * a callback whenever someone signals the underlying eventfd
@@ -276,6 +410,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	return 0;
 
 fail:
+	if (irqfd->resampler)
+		irqfd_resampler_shutdown(irqfd);
+
+	if (resamplefd && !IS_ERR(resamplefd))
+		eventfd_ctx_put(resamplefd);
+
 	if (eventfd && !IS_ERR(eventfd))
 		eventfd_ctx_put(eventfd);
 
@@ -291,6 +431,8 @@ kvm_eventfd_init(struct kvm *kvm)
 {
 	spin_lock_init(&kvm->irqfds.lock);
 	INIT_LIST_HEAD(&kvm->irqfds.items);
+	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
+	mutex_init(&kvm->irqfds.resampler_lock);
 	INIT_LIST_HEAD(&kvm->ioeventfds);
 }
 
@@ -340,7 +482,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
 int
 kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
 {
-	if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN)
+	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
 		return -EINVAL;
 
 	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 3ca89c451d6b..2eb58af7ee99 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -228,6 +228,9 @@ int kvm_request_irq_source_id(struct kvm *kvm)
 	}
 
 	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+	ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
 	set_bit(irq_source_id, bitmap);
 unlock:
 	mutex_unlock(&kvm->irq_lock);
@@ -238,6 +241,9 @@ unlock:
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 {
 	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+#ifdef CONFIG_X86
+	ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+#endif
 
 	mutex_lock(&kvm->irq_lock);
 	if (irq_source_id < 0 ||
-- 
cgit v1.2.1