summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kvm/book3s_xive.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/kvm/book3s_xive.c')
-rw-r--r--arch/powerpc/kvm/book3s_xive.c480
1 files changed, 362 insertions, 118 deletions
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index f78d002f0fe0..66858b7d3c6b 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2017 Benjamin Herrenschmidt, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) "xive-kvm: " fmt
@@ -70,8 +67,14 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
u64 pq;
- if (!tima)
+ /*
+ * Nothing to do if the platform doesn't have a XIVE
+ * or this vCPU doesn't have its own XIVE context
+ * (e.g. because it's not using an in-kernel interrupt controller).
+ */
+ if (!tima || !vcpu->arch.xive_cam_word)
return;
+
eieio();
__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
@@ -163,10 +166,14 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
*/
vcpu->arch.xive_esc_on = false;
+ /* This orders xive_esc_on = false vs. subsequent stale_p = true */
+ smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */
+
return IRQ_HANDLED;
}
-static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
+int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
+ bool single_escalation)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
struct xive_q *q = &xc->queues[prio];
@@ -185,7 +192,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
return -EIO;
}
- if (xc->xive->single_escalation)
+ if (single_escalation)
name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
vcpu->kvm->arch.lpid, xc->server_num);
else
@@ -217,7 +224,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
* interrupt, thus leaving it effectively masked after
* it fires once.
*/
- if (xc->xive->single_escalation) {
+ if (single_escalation) {
struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -270,14 +277,14 @@ static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio)
return rc;
}
-/* Called with kvm_lock held */
+/* Called with xive->lock held */
static int xive_check_provisioning(struct kvm *kvm, u8 prio)
{
struct kvmppc_xive *xive = kvm->arch.xive;
struct kvm_vcpu *vcpu;
int i, rc;
- lockdep_assert_held(&kvm->lock);
+ lockdep_assert_held(&xive->lock);
/* Already provisioned ? */
if (xive->qmap & (1 << prio))
@@ -291,7 +298,8 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio)
continue;
rc = xive_provision_queue(vcpu, prio);
if (rc == 0 && !xive->single_escalation)
- xive_attach_escalation(vcpu, prio);
+ kvmppc_xive_attach_escalation(vcpu, prio,
+ xive->single_escalation);
if (rc)
return rc;
}
@@ -342,7 +350,7 @@ static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio)
return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
}
-static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
+int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
{
struct kvm_vcpu *vcpu;
int i, rc;
@@ -380,11 +388,6 @@ static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
return -EBUSY;
}
-static u32 xive_vp(struct kvmppc_xive *xive, u32 server)
-{
- return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server);
-}
-
static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
struct kvmppc_xive_src_block *sb,
struct kvmppc_xive_irq_state *state)
@@ -430,8 +433,8 @@ static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
*/
if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
xive_native_configure_irq(hw_num,
- xive_vp(xive, state->act_server),
- MASKED, state->number);
+ kvmppc_xive_vp(xive, state->act_server),
+ MASKED, state->number);
/* set old_p so we can track if an H_EOI was done */
state->old_p = true;
state->old_q = false;
@@ -486,8 +489,8 @@ static void xive_finish_unmask(struct kvmppc_xive *xive,
*/
if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
xive_native_configure_irq(hw_num,
- xive_vp(xive, state->act_server),
- state->act_priority, state->number);
+ kvmppc_xive_vp(xive, state->act_server),
+ state->act_priority, state->number);
/* If an EOI is needed, do it here */
if (!state->old_p)
xive_vm_source_eoi(hw_num, xd);
@@ -535,7 +538,7 @@ static int xive_target_interrupt(struct kvm *kvm,
* priority. The count for that new target will have
* already been incremented.
*/
- rc = xive_select_target(kvm, &server, prio);
+ rc = kvmppc_xive_select_target(kvm, &server, prio);
/*
* We failed to find a target ? Not much we can do
@@ -563,7 +566,7 @@ static int xive_target_interrupt(struct kvm *kvm,
kvmppc_xive_select_irq(state, &hw_num, NULL);
return xive_native_configure_irq(hw_num,
- xive_vp(xive, server),
+ kvmppc_xive_vp(xive, server),
prio, state->number);
}
@@ -624,9 +627,12 @@ int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
irq, server, priority);
/* First, check provisioning of queues */
- if (priority != MASKED)
+ if (priority != MASKED) {
+ mutex_lock(&xive->lock);
rc = xive_check_provisioning(xive->kvm,
xive_prio_from_guest(priority));
+ mutex_unlock(&xive->lock);
+ }
if (rc) {
pr_devel(" provisioning failure %d !\n", rc);
return rc;
@@ -849,7 +855,8 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
/*
* We can't update the state of a "pushed" VCPU, but that
- * shouldn't happen.
+ * shouldn't happen because the vcpu->mutex makes running a
+ * vcpu mutually exclusive with doing one_reg get/set on it.
*/
if (WARN_ON(vcpu->arch.xive_pushed))
return -EIO;
@@ -940,6 +947,13 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
/* Turn the IPI hard off */
xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+ /*
+ * Reset ESB guest mapping. Needed when ESB pages are exposed
+ * to the guest in XIVE native mode
+ */
+ if (xive->ops && xive->ops->reset_mapped)
+ xive->ops->reset_mapped(kvm, guest_irq);
+
/* Grab info about irq */
state->pt_number = hw_irq;
state->pt_data = irq_data_get_irq_handler_data(host_data);
@@ -951,7 +965,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
* which is fine for a never started interrupt.
*/
xive_native_configure_irq(hw_irq,
- xive_vp(xive, state->act_server),
+ kvmppc_xive_vp(xive, state->act_server),
state->act_priority, state->number);
/*
@@ -1025,9 +1039,17 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
state->pt_number = 0;
state->pt_data = NULL;
+ /*
+ * Reset ESB guest mapping. Needed when ESB pages are exposed
+ * to the guest in XIVE native mode
+ */
+ if (xive->ops && xive->ops->reset_mapped) {
+ xive->ops->reset_mapped(kvm, guest_irq);
+ }
+
/* Reconfigure the IPI */
xive_native_configure_irq(state->ipi_number,
- xive_vp(xive, state->act_server),
+ kvmppc_xive_vp(xive, state->act_server),
state->act_priority, state->number);
/*
@@ -1049,7 +1071,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
}
EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
-static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
+void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
struct kvm *kvm = vcpu->kvm;
@@ -1083,14 +1105,60 @@ static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
arch_spin_unlock(&sb->lock);
}
}
+
+ /* Disable vcpu's escalation interrupt */
+ if (vcpu->arch.xive_esc_on) {
+ __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
+ XIVE_ESB_SET_PQ_01));
+ vcpu->arch.xive_esc_on = false;
+ }
+
+ /*
+ * Clear pointers to escalation interrupt ESB.
+ * This is safe because the vcpu->mutex is held, preventing
+ * any other CPU from concurrently executing a KVM_RUN ioctl.
+ */
+ vcpu->arch.xive_esc_vaddr = 0;
+ vcpu->arch.xive_esc_raddr = 0;
+}
+
+/*
+ * In single escalation mode, the escalation interrupt is marked so
+ * that EOI doesn't re-enable it, but just sets the stale_p flag to
+ * indicate that the P bit has already been dealt with. However, the
+ * assembly code that enters the guest sets PQ to 00 without clearing
+ * stale_p (because it has no easy way to address it). Hence we have
+ * to adjust stale_p before shutting down the interrupt.
+ */
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+ struct kvmppc_xive_vcpu *xc, int irq)
+{
+ struct irq_data *d = irq_get_irq_data(irq);
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ /*
+ * This slightly odd sequence gives the right result
+ * (i.e. stale_p set if xive_esc_on is false) even if
+ * we race with xive_esc_irq() and xive_irq_eoi().
+ */
+ xd->stale_p = false;
+ smp_mb(); /* paired with smb_wmb in xive_esc_irq */
+ if (!vcpu->arch.xive_esc_on)
+ xd->stale_p = true;
}
void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
- struct kvmppc_xive *xive = xc->xive;
+ struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
int i;
+ if (!kvmppc_xics_enabled(vcpu))
+ return;
+
+ if (!xc)
+ return;
+
pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
/* Ensure no interrupt is still routed to that VP */
@@ -1100,20 +1168,28 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
/* Mask the VP IPI */
xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);
- /* Disable the VP */
- xive_native_disable_vp(xc->vp_id);
-
- /* Free the queues & associated interrupts */
+ /* Free escalations */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
- struct xive_q *q = &xc->queues[i];
-
- /* Free the escalation irq */
if (xc->esc_virq[i]) {
+ if (xc->xive->single_escalation)
+ xive_cleanup_single_escalation(vcpu, xc,
+ xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
irq_dispose_mapping(xc->esc_virq[i]);
kfree(xc->esc_virq_names[i]);
}
- /* Free the queue */
+ }
+
+ /* Disable the VP */
+ xive_native_disable_vp(xc->vp_id);
+
+ /* Clear the cam word so guest entry won't try to push context */
+ vcpu->arch.xive_cam_word = 0;
+
+ /* Free the queues */
+ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+ struct xive_q *q = &xc->queues[i];
+
xive_native_disable_queue(xc->vp_id, q, i);
if (q->qpage) {
free_pages((unsigned long)q->qpage,
@@ -1129,6 +1205,49 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
}
/* Free the VP */
kfree(xc);
+
+ /* Cleanup the vcpu */
+ vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+ vcpu->arch.xive_vcpu = NULL;
+}
+
+static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu)
+{
+ /* We have a block of xive->nr_servers VPs. We just need to check
+ * raw vCPU ids are below the expected limit for this guest's
+ * core stride ; kvmppc_pack_vcpu_id() will pack them down to an
+ * index that can be safely used to compute a VP id that belongs
+ * to the VP block.
+ */
+ return cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode;
+}
+
+int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp)
+{
+ u32 vp_id;
+
+ if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) {
+ pr_devel("Out of bounds !\n");
+ return -EINVAL;
+ }
+
+ if (xive->vp_base == XIVE_INVALID_VP) {
+ xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers);
+ pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, xive->nr_servers);
+
+ if (xive->vp_base == XIVE_INVALID_VP)
+ return -ENOSPC;
+ }
+
+ vp_id = kvmppc_xive_vp(xive, cpu);
+ if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
+ pr_devel("Duplicate !\n");
+ return -EEXIST;
+ }
+
+ *vp = vp_id;
+
+ return 0;
}
int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
@@ -1137,6 +1256,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
struct kvmppc_xive *xive = dev->private;
struct kvmppc_xive_vcpu *xc;
int i, r = -EBUSY;
+ u32 vp_id;
pr_devel("connect_vcpu(cpu=%d)\n", cpu);
@@ -1146,27 +1266,27 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
}
if (xive->kvm != vcpu->kvm)
return -EPERM;
- if (vcpu->arch.irq_type)
+ if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
return -EBUSY;
- if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
- pr_devel("Duplicate !\n");
- return -EEXIST;
- }
- if (cpu >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
- pr_devel("Out of bounds !\n");
- return -EINVAL;
- }
- xc = kzalloc(sizeof(*xc), GFP_KERNEL);
- if (!xc)
- return -ENOMEM;
/* We need to synchronize with queue provisioning */
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&xive->lock);
+
+ r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id);
+ if (r)
+ goto bail;
+
+ xc = kzalloc(sizeof(*xc), GFP_KERNEL);
+ if (!xc) {
+ r = -ENOMEM;
+ goto bail;
+ }
+
vcpu->arch.xive_vcpu = xc;
xc->xive = xive;
xc->vcpu = vcpu;
xc->server_num = cpu;
- xc->vp_id = xive_vp(xive, cpu);
+ xc->vp_id = vp_id;
xc->mfrr = 0xff;
xc->valid = true;
@@ -1219,7 +1339,8 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
if (xive->qmap & (1 << i)) {
r = xive_provision_queue(vcpu, i);
if (r == 0 && !xive->single_escalation)
- xive_attach_escalation(vcpu, i);
+ kvmppc_xive_attach_escalation(
+ vcpu, i, xive->single_escalation);
if (r)
goto bail;
} else {
@@ -1234,7 +1355,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
}
/* If not done above, attach priority 0 escalation */
- r = xive_attach_escalation(vcpu, 0);
+ r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation);
if (r)
goto bail;
@@ -1244,7 +1365,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);
bail:
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&xive->lock);
if (r) {
kvmppc_xive_cleanup_vcpu(vcpu);
return r;
@@ -1485,16 +1606,15 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
return 0;
}
-static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
- int irq)
+struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
+ struct kvmppc_xive *xive, int irq)
{
- struct kvm *kvm = xive->kvm;
struct kvmppc_xive_src_block *sb;
int i, bid;
bid = irq >> KVMPPC_XICS_ICS_SHIFT;
- mutex_lock(&kvm->lock);
+ mutex_lock(&xive->lock);
/* block already exists - somebody else got here first */
if (xive->src_blocks[bid])
@@ -1509,6 +1629,7 @@ static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *x
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
+ sb->irq_state[i].eisn = 0;
sb->irq_state[i].guest_priority = MASKED;
sb->irq_state[i].saved_priority = MASKED;
sb->irq_state[i].act_priority = MASKED;
@@ -1520,7 +1641,7 @@ static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *x
xive->max_sbid = bid;
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&xive->lock);
return xive->src_blocks[bid];
}
@@ -1565,7 +1686,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
sb = kvmppc_xive_find_source(xive, irq, &idx);
if (!sb) {
pr_devel("No source, creating source block...\n");
- sb = xive_create_src_block(xive, irq);
+ sb = kvmppc_xive_create_src_block(xive, irq);
if (!sb) {
pr_devel("Failed to create block...\n");
return -ENOMEM;
@@ -1630,9 +1751,9 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
/* If we have a priority target the interrupt */
if (act_prio != MASKED) {
/* First, check provisioning of queues */
- mutex_lock(&xive->kvm->lock);
+ mutex_lock(&xive->lock);
rc = xive_check_provisioning(xive->kvm, act_prio);
- mutex_unlock(&xive->kvm->lock);
+ mutex_unlock(&xive->lock);
/* Target interrupt */
if (rc == 0)
@@ -1745,6 +1866,43 @@ int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
return 0;
}
+int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr)
+{
+ u32 __user *ubufp = (u32 __user *) addr;
+ u32 nr_servers;
+ int rc = 0;
+
+ if (get_user(nr_servers, ubufp))
+ return -EFAULT;
+
+ pr_devel("%s nr_servers=%u\n", __func__, nr_servers);
+
+ if (!nr_servers || nr_servers > KVM_MAX_VCPU_ID)
+ return -EINVAL;
+
+ mutex_lock(&xive->lock);
+ if (xive->vp_base != XIVE_INVALID_VP)
+ /* The VP block is allocated once and freed when the device
+ * is released. Better not allow to change its size since its
+ * used by connect_vcpu to validate vCPU ids are valid (eg,
+ * setting it back to a higher value could allow connect_vcpu
+ * to come up with a VP id that goes beyond the VP block, which
+ * is likely to cause a crash in OPAL).
+ */
+ rc = -EBUSY;
+ else if (nr_servers > KVM_MAX_VCPUS)
+ /* We don't need more servers. Higher vCPU ids get packed
+ * down below KVM_MAX_VCPUS by kvmppc_pack_vcpu_id().
+ */
+ xive->nr_servers = KVM_MAX_VCPUS;
+ else
+ xive->nr_servers = nr_servers;
+
+ mutex_unlock(&xive->lock);
+
+ return rc;
+}
+
static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
{
struct kvmppc_xive *xive = dev->private;
@@ -1753,6 +1911,11 @@ static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
switch (attr->group) {
case KVM_DEV_XICS_GRP_SOURCES:
return xive_set_source(xive, attr->attr, attr->addr);
+ case KVM_DEV_XICS_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_XICS_NR_SERVERS:
+ return kvmppc_xive_set_nr_servers(xive, attr->addr);
+ }
}
return -ENXIO;
}
@@ -1778,6 +1941,11 @@ static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
attr->attr < KVMPPC_XICS_NR_IRQS)
return 0;
break;
+ case KVM_DEV_XICS_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_XICS_NR_SERVERS:
+ return 0;
+ }
}
return -ENXIO;
}
@@ -1786,10 +1954,9 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
{
xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
xive_native_configure_irq(hw_num, 0, MASKED, 0);
- xive_cleanup_irq_data(xd);
}
-static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
+void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
{
int i;
@@ -1800,9 +1967,10 @@ static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
continue;
kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
+ xive_cleanup_irq_data(&state->ipi_data);
xive_native_free_irq(state->ipi_number);
- /* Pass-through, cleanup too */
+ /* Pass-through, cleanup too but keep IRQ hw data */
if (state->pt_number)
kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);
@@ -1810,16 +1978,53 @@ static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
}
}
-static void kvmppc_xive_free(struct kvm_device *dev)
+/*
+ * Called when device fd is closed. kvm->lock is held.
+ */
+static void kvmppc_xive_release(struct kvm_device *dev)
{
struct kvmppc_xive *xive = dev->private;
struct kvm *kvm = xive->kvm;
+ struct kvm_vcpu *vcpu;
int i;
+ pr_devel("Releasing xive device\n");
+
+ /*
+ * Since this is the device release function, we know that
+ * userspace does not have any open fd referring to the
+ * device. Therefore there can not be any of the device
+ * attribute set/get functions being executed concurrently,
+ * and similarly, the connect_vcpu and set/clr_mapped
+ * functions also cannot be being executed.
+ */
+
debugfs_remove(xive->dentry);
- if (kvm)
- kvm->arch.xive = NULL;
+ /*
+ * We should clean up the vCPU interrupt presenters first.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+ * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
+ * Holding the vcpu->mutex also means that the vcpu cannot
+ * be executing the KVM_RUN ioctl, and therefore it cannot
+ * be executing the XIVE push or pull code or accessing
+ * the XIVE MMIO regions.
+ */
+ mutex_lock(&vcpu->mutex);
+ kvmppc_xive_cleanup_vcpu(vcpu);
+ mutex_unlock(&vcpu->mutex);
+ }
+
+ /*
+ * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
+ * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
+ * against xive code getting called during vcpu execution or
+ * set/get one_reg operations.
+ */
+ kvm->arch.xive = NULL;
/* Mask and free interrupts */
for (i = 0; i <= xive->max_sbid; i++) {
@@ -1832,32 +2037,64 @@ static void kvmppc_xive_free(struct kvm_device *dev)
if (xive->vp_base != XIVE_INVALID_VP)
xive_native_free_vp_block(xive->vp_base);
+ /*
+ * A reference of the kvmppc_xive pointer is now kept under
+ * the xive_devices struct of the machine for reuse. It is
+ * freed when the VM is destroyed for now until we fix all the
+ * execution paths.
+ */
- kfree(xive);
kfree(dev);
}
+/*
+ * When the guest chooses the interrupt mode (XICS legacy or XIVE
+ * native), the VM will switch of KVM device. The previous device will
+ * be "released" before the new one is created.
+ *
+ * Until we are sure all execution paths are well protected, provide a
+ * fail safe (transitional) method for device destruction, in which
+ * the XIVE device pointer is recycled and not directly freed.
+ */
+struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
+{
+ struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ?
+ &kvm->arch.xive_devices.native :
+ &kvm->arch.xive_devices.xics_on_xive;
+ struct kvmppc_xive *xive = *kvm_xive_device;
+
+ if (!xive) {
+ xive = kzalloc(sizeof(*xive), GFP_KERNEL);
+ *kvm_xive_device = xive;
+ } else {
+ memset(xive, 0, sizeof(*xive));
+ }
+
+ return xive;
+}
+
+/*
+ * Create a XICS device with XIVE backend. kvm->lock is held.
+ */
static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
{
struct kvmppc_xive *xive;
struct kvm *kvm = dev->kvm;
- int ret = 0;
pr_devel("Creating xive for partition\n");
- xive = kzalloc(sizeof(*xive), GFP_KERNEL);
+ /* Already there ? */
+ if (kvm->arch.xive)
+ return -EEXIST;
+
+ xive = kvmppc_xive_get_device(kvm, type);
if (!xive)
return -ENOMEM;
dev->private = xive;
xive->dev = dev;
xive->kvm = kvm;
-
- /* Already there ? */
- if (kvm->arch.xive)
- ret = -EEXIST;
- else
- kvm->arch.xive = xive;
+ mutex_init(&xive->lock);
/* We use the default queue size set by the host */
xive->q_order = xive_native_default_eq_shift();
@@ -1866,23 +2103,56 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
else
xive->q_page_order = xive->q_order - PAGE_SHIFT;
- /* Allocate a bunch of VPs */
- xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
- pr_devel("VP_Base=%x\n", xive->vp_base);
-
- if (xive->vp_base == XIVE_INVALID_VP)
- ret = -ENOMEM;
+ /* VP allocation is delayed to the first call to connect_vcpu */
+ xive->vp_base = XIVE_INVALID_VP;
+ /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets
+ * on a POWER9 system.
+ */
+ xive->nr_servers = KVM_MAX_VCPUS;
xive->single_escalation = xive_native_has_single_escalation();
- if (ret) {
- kfree(xive);
- return ret;
- }
-
+ kvm->arch.xive = xive;
return 0;
}
+int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ unsigned int i;
+
+ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+ struct xive_q *q = &xc->queues[i];
+ u32 i0, i1, idx;
+
+ if (!q->qpage && !xc->esc_virq[i])
+ continue;
+
+ seq_printf(m, " [q%d]: ", i);
+
+ if (q->qpage) {
+ idx = q->idx;
+ i0 = be32_to_cpup(q->qpage + idx);
+ idx = (idx + 1) & q->msk;
+ i1 = be32_to_cpup(q->qpage + idx);
+ seq_printf(m, "T=%d %08x %08x...\n", q->toggle,
+ i0, i1);
+ }
+ if (xc->esc_virq[i]) {
+ struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
+ struct xive_irq_data *xd =
+ irq_data_get_irq_handler_data(d);
+ u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
+
+ seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
+ (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
+ (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
+ xc->esc_virq[i], pq, xd->eoi_page);
+ seq_puts(m, "\n");
+ }
+ }
+ return 0;
+}
static int xive_debug_show(struct seq_file *m, void *private)
{
@@ -1908,43 +2178,17 @@ static int xive_debug_show(struct seq_file *m, void *private)
kvm_for_each_vcpu(i, vcpu, kvm) {
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
- unsigned int i;
if (!xc)
continue;
- seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x"
+ seq_printf(m, "cpu server %#x VP:%#x CPPR:%#x HWCPPR:%#x"
" MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
- xc->server_num, xc->cppr, xc->hw_cppr,
+ xc->server_num, xc->vp_id, xc->cppr, xc->hw_cppr,
xc->mfrr, xc->pending,
xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
- for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
- struct xive_q *q = &xc->queues[i];
- u32 i0, i1, idx;
- if (!q->qpage && !xc->esc_virq[i])
- continue;
-
- seq_printf(m, " [q%d]: ", i);
-
- if (q->qpage) {
- idx = q->idx;
- i0 = be32_to_cpup(q->qpage + idx);
- idx = (idx + 1) & q->msk;
- i1 = be32_to_cpup(q->qpage + idx);
- seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1);
- }
- if (xc->esc_virq[i]) {
- struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
- u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
- seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
- (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
- (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
- xc->esc_virq[i], pq, xd->eoi_page);
- seq_printf(m, "\n");
- }
- }
+ kvmppc_xive_debug_show_queues(m, vcpu);
t_rm_h_xirr += xc->stat_rm_h_xirr;
t_rm_h_ipoll += xc->stat_rm_h_ipoll;
@@ -1999,7 +2243,7 @@ struct kvm_device_ops kvm_xive_ops = {
.name = "kvm-xive",
.create = kvmppc_xive_create,
.init = kvmppc_xive_init,
- .destroy = kvmppc_xive_free,
+ .release = kvmppc_xive_release,
.set_attr = xive_set_attr,
.get_attr = xive_get_attr,
.has_attr = xive_has_attr,
OpenPOWER on IntegriCloud