diff options
Diffstat (limited to 'arch/powerpc/kvm')
27 files changed, 1734 insertions, 474 deletions
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 4c67cc79de7c..2bfeaa13befb 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -71,6 +71,9 @@ kvm-hv-y += \ book3s_64_mmu_radix.o \ book3s_hv_nested.o +kvm-hv-$(CONFIG_PPC_UV) += \ + book3s_hv_uvmem.o + kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \ book3s_hv_tm.o diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 9524d92bc45d..d07a8e12fa15 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -36,8 +36,8 @@ #include "book3s.h" #include "trace.h" -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ /* #define EXIT_DEBUG */ @@ -69,32 +69,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "pthru_all", VCPU_STAT(pthru_all) }, { "pthru_host", VCPU_STAT(pthru_host) }, { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, - { "largepages_2M", VM_STAT(num_2M_pages) }, - { "largepages_1G", VM_STAT(num_1G_pages) }, + { "largepages_2M", VM_STAT(num_2M_pages, .mode = 0444) }, + { "largepages_1G", VM_STAT(num_1G_pages, .mode = 0444) }, { NULL } }; -void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) { - ulong pc = kvmppc_get_pc(vcpu); - ulong lr = kvmppc_get_lr(vcpu); - if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) - kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK); - if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) - kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK); - vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK; - } -} -EXPORT_SYMBOL_GPL(kvmppc_unfixup_split_real); - -static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) -{ - if (!is_kvmppc_hv_enabled(vcpu->kvm)) - return to_book3s(vcpu)->hior; - return 0; -} - static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, unsigned long pending_now, unsigned long old_pending) { @@ -134,11 +113,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) { - kvmppc_unfixup_split_real(vcpu); - kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); - kvmppc_set_srr1(vcpu, (kvmppc_get_msr(vcpu) & ~0x783f0000ul) | flags); - kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec); - vcpu->arch.mmu.reset_msr(vcpu); + vcpu->kvm->arch.kvm_ops->inject_interrupt(vcpu, vec, flags); } static int kvmppc_book3s_vec2irqprio(unsigned int vec) @@ -496,11 +471,6 @@ int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvmppc_load_last_inst); -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - return 0; -} - int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) { return 0; @@ -814,9 +784,9 @@ void kvmppc_decrementer_func(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu) { - return kvm->arch.kvm_ops->vcpu_create(kvm, id); + return vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) @@ -1083,9 +1053,11 @@ static int kvmppc_book3s_init(void) if (xics_on_xive()) { kvmppc_xive_init_module(); kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); - kvmppc_xive_native_init_module(); - kvm_register_device_ops(&kvm_xive_native_ops, - KVM_DEV_TYPE_XIVE); + if (kvmppc_xive_native_supported()) { + kvmppc_xive_native_init_module(); + kvm_register_device_ops(&kvm_xive_native_ops, + KVM_DEV_TYPE_XIVE); + } } else #endif kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 2ef1311a2a13..3a4613985949 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -32,4 +32,7 @@ extern void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val); static inline void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val) {} #endif +extern void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr); +extern void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags); + #endif diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index 18f244aad7aa..f21e73492ce3 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -90,11 +90,6 @@ static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, return (((u64)eaddr >> 12) & 0xffff) | (vsid << 16); } -static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu) -{ - kvmppc_set_msr(vcpu, 0); -} - static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu, u32 sre, gva_t eaddr, bool primary) @@ -406,7 +401,6 @@ void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu) mmu->mtsrin = kvmppc_mmu_book3s_32_mtsrin; mmu->mfsrin = kvmppc_mmu_book3s_32_mfsrin; mmu->xlate = kvmppc_mmu_book3s_32_xlate; - mmu->reset_msr = kvmppc_mmu_book3s_32_reset_msr; mmu->tlbie = kvmppc_mmu_book3s_32_tlbie; mmu->esid_to_vsid = kvmppc_mmu_book3s_32_esid_to_vsid; mmu->ea_to_vp = kvmppc_mmu_book3s_32_ea_to_vp; diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 5f63a5f7f24f..599133256a95 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -24,20 +24,6 @@ #define dprintk(X...) do { } while(0) #endif -static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) -{ - unsigned long msr = vcpu->arch.intr_msr; - unsigned long cur_msr = kvmppc_get_msr(vcpu); - - /* If transactional, change to suspend mode on IRQ delivery */ - if (MSR_TM_TRANSACTIONAL(cur_msr)) - msr |= MSR_TS_S; - else - msr |= cur_msr & MSR_TS_MASK; - - kvmppc_set_msr(vcpu, msr); -} - static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( struct kvm_vcpu *vcpu, gva_t eaddr) @@ -676,7 +662,6 @@ void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu) mmu->slbie = kvmppc_mmu_book3s_64_slbie; mmu->slbia = kvmppc_mmu_book3s_64_slbia; mmu->xlate = kvmppc_mmu_book3s_64_xlate; - mmu->reset_msr = kvmppc_mmu_book3s_64_reset_msr; mmu->tlbie = kvmppc_mmu_book3s_64_tlbie; mmu->esid_to_vsid = kvmppc_mmu_book3s_64_esid_to_vsid; mmu->ea_to_vp = kvmppc_mmu_book3s_64_ea_to_vp; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 9a75f0e1933b..6c372f5c61b6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -275,18 +275,6 @@ int kvmppc_mmu_hv_init(void) return 0; } -static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) -{ - unsigned long msr = vcpu->arch.intr_msr; - - /* If transactional, change to suspend mode on IRQ delivery */ - if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) - msr |= MSR_TS_S; - else - msr |= vcpu->arch.shregs.msr & MSR_TS_MASK; - kvmppc_set_msr(vcpu, msr); -} - static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret) @@ -296,7 +284,7 @@ static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, /* Protect linux PTE lookup from page table destruction */ rcu_read_lock_sched(); /* this disables preemption too */ ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, - current->mm->pgd, false, pte_idx_ret); + kvm->mm->pgd, false, pte_idx_ret); rcu_read_unlock_sched(); if (ret == H_TOO_HARD) { /* this can't happen */ @@ -508,6 +496,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, struct vm_area_struct *vma; unsigned long rcbits; long mmio_update; + struct mm_struct *mm; if (kvm_is_radix(kvm)) return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr); @@ -584,6 +573,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, is_ci = false; pfn = 0; page = NULL; + mm = kvm->mm; pte_size = PAGE_SIZE; writing = (dsisr & DSISR_ISSTORE) != 0; /* If writing != 0, then the HPTE must allow writing, if we get here */ @@ -592,8 +582,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages); if (npages < 1) { /* Check if it's an I/O mapping */ - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, hva); + down_read(&mm->mmap_sem); + vma = find_vma(mm, hva); if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && (vma->vm_flags & VM_PFNMAP)) { pfn = vma->vm_pgoff + @@ -602,7 +592,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot)))); write_ok = vma->vm_flags & VM_WRITE; } - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); if (!pfn) goto out_put; } else { @@ -621,8 +611,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * hugepage split and collapse. */ local_irq_save(flags); - ptep = find_current_mm_pte(current->mm->pgd, - hva, NULL, NULL); + ptep = find_current_mm_pte(mm->pgd, hva, NULL, NULL); if (ptep) { pte = kvmppc_read_update_linux_pte(ptep, 1); if (__pte_write(pte)) @@ -2000,7 +1989,7 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC); if (ret < 0) { kfree(ctx); - kvm_put_kvm(kvm); + kvm_put_kvm_no_destroy(kvm); return ret; } @@ -2161,7 +2150,6 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; - mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 2d415c36a61d..803940d79b73 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -19,6 +19,8 @@ #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/pte-walk.h> +#include <asm/ultravisor.h> +#include <asm/kvm_book3s_uvmem.h> /* * Supported radix tree geometry. @@ -61,12 +63,10 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, } isync(); - pagefault_disable(); if (is_load) - ret = raw_copy_from_user(to, from, n); + ret = probe_user_read(to, (const void __user *)from, n); else - ret = raw_copy_to_user(to, from, n); - pagefault_enable(); + ret = probe_user_write((void __user *)to, from, n); /* switch the pid first to avoid running host with unallocated pid */ if (quadrant == 1 && pid != old_pid) @@ -915,6 +915,9 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (!(dsisr & DSISR_PRTABLE_FAULT)) gpa |= ea & 0xfff; + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return kvmppc_send_page_to_uv(kvm, gfn); + /* Get the corresponding memslot */ memslot = gfn_to_memslot(kvm, gfn); @@ -972,6 +975,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) { + uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT); + return 0; + } + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, @@ -989,6 +997,9 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, int ref = 0; unsigned long old, *rmapp; + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return ref; + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) { old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, @@ -1013,6 +1024,9 @@ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned int shift; int ref = 0; + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return ref; + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) ref = 1; @@ -1030,6 +1044,9 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, int ret = 0; unsigned long old, *rmapp; + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return ret; + ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { ret = 1; @@ -1082,6 +1099,12 @@ void kvmppc_radix_flush_memslot(struct kvm *kvm, unsigned long gpa; unsigned int shift; + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START) + kvmppc_uvmem_drop_pages(memslot, kvm, true); + + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return; + gpa = memslot->base_gfn << PAGE_SHIFT; spin_lock(&kvm->mmu_lock); for (n = memslot->npages; n; --n) { diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index e99a14798ab0..ee6c103bb7d5 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -253,10 +253,11 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) } } + account_locked_vm(kvm->mm, + kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); + kvm_put_kvm(stt->kvm); - account_locked_vm(current->mm, - kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); call_rcu(&stt->rcu, release_spapr_tce_table); return 0; @@ -272,6 +273,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, { struct kvmppc_spapr_tce_table *stt = NULL; struct kvmppc_spapr_tce_table *siter; + struct mm_struct *mm = kvm->mm; unsigned long npages, size = args->size; int ret = -ENOMEM; @@ -280,7 +282,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EINVAL; npages = kvmppc_tce_pages(size); - ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true); + ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true); if (ret) return ret; @@ -317,7 +319,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, if (ret >= 0) list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); else - kvm_put_kvm(kvm); + kvm_put_kvm_no_destroy(kvm); mutex_unlock(&kvm->lock); @@ -326,7 +328,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kfree(stt); fail_acct: - account_locked_vm(current->mm, kvmppc_stt_pages(npages), false); + account_locked_vm(mm, kvmppc_stt_pages(npages), false); return ret; } @@ -416,7 +418,7 @@ static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, unsigned long hpa = 0; enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg(mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir); } static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, @@ -447,7 +449,8 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm, unsigned long hpa = 0; long ret; - if (WARN_ON_ONCE(iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir))) + if (WARN_ON_ONCE(iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, + &dir))) return H_TOO_HARD; if (dir == DMA_NONE) @@ -455,7 +458,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm, ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); if (ret != H_SUCCESS) - iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir); return ret; } @@ -501,7 +504,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (mm_iommu_mapped_inc(mem)) return H_TOO_HARD; - ret = iommu_tce_xchg(kvm->mm, tbl, entry, &hpa, &dir); + ret = iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir); if (WARN_ON_ONCE(ret)) { mm_iommu_mapped_dec(mem); return H_TOO_HARD; @@ -579,6 +582,8 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); + iommu_tce_kill(stit->tbl, entry, 1); + if (ret != H_SUCCESS) { kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); goto unlock_exit; @@ -656,12 +661,14 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, */ if (get_user(tce, tces + i)) { ret = H_TOO_HARD; - goto unlock_exit; + goto invalidate_exit; } tce = be64_to_cpu(tce); - if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) - return H_PARAMETER; + if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { + ret = H_PARAMETER; + goto invalidate_exit; + } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, @@ -671,13 +678,17 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) { kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); - goto unlock_exit; + goto invalidate_exit; } } kvmppc_tce_put(stt, entry + i, tce); } +invalidate_exit: + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) + iommu_tce_kill(stit->tbl, entry, npages); + unlock_exit: srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -716,7 +727,7 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - return ret; + goto invalidate_exit; WARN_ON_ONCE(1); kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); @@ -726,6 +737,10 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); - return H_SUCCESS; +invalidate_exit: + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) + iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages); + + return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index f50bbeedfc66..ab6eeb8e753e 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -218,13 +218,14 @@ static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt, return H_SUCCESS; } -static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, +static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm, + struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction) { long ret; - ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); + ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, true); if (!ret && ((*direction == DMA_FROM_DEVICE) || (*direction == DMA_BIDIRECTIONAL))) { @@ -240,13 +241,20 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, return ret; } +extern void iommu_tce_kill_rm(struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + if (tbl->it_ops->tce_kill) + tbl->it_ops->tce_kill(tbl, entry, pages, true); +} + static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, unsigned long entry) { unsigned long hpa = 0; enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); } static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, @@ -278,7 +286,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm, unsigned long hpa = 0; long ret; - if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir)) + if (iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir)) /* * real mode xchg can fail if struct page crosses * a page boundary @@ -290,7 +298,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm, ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); if (ret) - iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); + iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); return ret; } @@ -336,7 +344,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) return H_TOO_HARD; - ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); + ret = iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); if (ret) { mm_iommu_mapped_dec(mem); /* @@ -417,6 +425,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, stit->tbl, entry, ua, dir); + iommu_tce_kill_rm(stit->tbl, entry, 1); + if (ret != H_SUCCESS) { kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); return ret; @@ -556,8 +566,10 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); ua = 0; - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) - return H_PARAMETER; + if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { + ret = H_PARAMETER; + goto invalidate_exit; + } list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, @@ -567,13 +579,17 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) { kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); - goto unlock_exit; + goto invalidate_exit; } } kvmppc_rm_tce_put(stt, entry + i, tce); } +invalidate_exit: + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) + iommu_tce_kill_rm(stit->tbl, entry, npages); + unlock_exit: if (rmap) unlock_rmap(rmap); @@ -616,7 +632,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, continue; if (ret == H_TOO_HARD) - return ret; + goto invalidate_exit; WARN_ON_ONCE_RM(1); kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); @@ -626,7 +642,11 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); - return H_SUCCESS; +invalidate_exit: + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) + iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages); + + return ret; } /* This can be called in either virtual mode or real mode */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index cde3f5a4b3e4..2cefd071b848 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -72,6 +72,9 @@ #include <asm/xics.h> #include <asm/xive.h> #include <asm/hw_breakpoint.h> +#include <asm/kvm_host.h> +#include <asm/kvm_book3s_uvmem.h> +#include <asm/ultravisor.h> #include "book3s.h" @@ -133,7 +136,6 @@ static inline bool nesting_enabled(struct kvm *kvm) /* If set, the threads on each CPU core have to be in the same MMU mode */ static bool no_mixing_hpt_and_radix; -static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); /* @@ -338,18 +340,6 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); } -static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) -{ - /* - * Check for illegal transactional state bit combination - * and if we find it, force the TS field to a safe state. - */ - if ((msr & MSR_TS_MASK) == MSR_TS_MASK) - msr &= ~MSR_TS_MASK; - vcpu->arch.shregs.msr = msr; - kvmppc_end_cede(vcpu); -} - static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) { vcpu->arch.pvr = pvr; @@ -401,8 +391,11 @@ static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) spin_lock(&vc->lock); vc->arch_compat = arch_compat; - /* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */ - vc->pcr = host_pcr_bit - guest_pcr_bit; + /* + * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit + * Also set all reserved PCR bits + */ + vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK; spin_unlock(&vc->lock); return 0; @@ -789,6 +782,11 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, vcpu->arch.dawr = value1; vcpu->arch.dawrx = value2; return H_SUCCESS; + case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE: + /* KVM does not support mflags=2 (AIL=2) */ + if (mflags != 0 && mflags != 3) + return H_UNSUPPORTED_FLAG_START; + return H_TOO_HARD; default: return H_TOO_HARD; } @@ -1075,6 +1073,28 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6)); break; + case H_SVM_PAGE_IN: + ret = kvmppc_h_svm_page_in(vcpu->kvm, + kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + break; + case H_SVM_PAGE_OUT: + ret = kvmppc_h_svm_page_out(vcpu->kvm, + kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + break; + case H_SVM_INIT_START: + ret = kvmppc_h_svm_init_start(vcpu->kvm); + break; + case H_SVM_INIT_DONE: + ret = kvmppc_h_svm_init_done(vcpu->kvm); + break; + case H_SVM_INIT_ABORT: + ret = kvmppc_h_svm_init_abort(vcpu->kvm); + break; + default: return RESUME_HOST; } @@ -1678,7 +1698,14 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, *val = get_reg_val(id, vcpu->arch.pspb); break; case KVM_REG_PPC_DPDES: - *val = get_reg_val(id, vcpu->arch.vcore->dpdes); + /* + * On POWER9, where we are emulating msgsndp etc., + * we return 1 bit for each vcpu, which can come from + * either vcore->dpdes or doorbell_request. + * On POWER8, doorbell_request is 0. + */ + *val = get_reg_val(id, vcpu->arch.vcore->dpdes | + vcpu->arch.doorbell_request); break; case KVM_REG_PPC_VTB: *val = get_reg_val(id, vcpu->arch.vcore->vtb); @@ -2247,22 +2274,16 @@ static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) } #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ -static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu; int err; int core; struct kvmppc_vcore *vcore; + struct kvm *kvm; + unsigned int id; - err = -ENOMEM; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; - - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_vcpu; + kvm = vcpu->kvm; + id = vcpu->vcpu_id; vcpu->arch.shared = &vcpu->arch.shregs; #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE @@ -2344,7 +2365,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, mutex_unlock(&kvm->lock); if (!vcore) - goto free_vcpu; + return err; spin_lock(&vcore->lock); ++vcore->num_threads; @@ -2359,12 +2380,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, debugfs_vcpu_init(vcpu, id); - return vcpu; - -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); -out: - return ERR_PTR(err); + return 0; } static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode, @@ -2418,8 +2434,6 @@ static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); spin_unlock(&vcpu->arch.vpa_update_lock); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu); } static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) @@ -2444,15 +2458,6 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu) vcpu->arch.timer_running = 1; } -static void kvmppc_end_cede(struct kvm_vcpu *vcpu) -{ - vcpu->arch.ceded = 0; - if (vcpu->arch.timer_running) { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - vcpu->arch.timer_running = 0; - } -} - extern int __kvmppc_vcore_entry(void); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, @@ -2860,7 +2865,7 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) if (!spin_trylock(&pvc->lock)) continue; prepare_threads(pvc); - if (!pvc->n_runnable) { + if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) { list_del_init(&pvc->preempt_list); if (pvc->runner == NULL) { pvc->vcore_state = VCORE_INACTIVE; @@ -2881,15 +2886,20 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) spin_unlock(&lp->lock); } -static bool recheck_signals(struct core_info *cip) +static bool recheck_signals_and_mmu(struct core_info *cip) { int sub, i; struct kvm_vcpu *vcpu; + struct kvmppc_vcore *vc; - for (sub = 0; sub < cip->n_subcores; ++sub) - for_each_runnable_thread(i, vcpu, cip->vc[sub]) + for (sub = 0; sub < cip->n_subcores; ++sub) { + vc = cip->vc[sub]; + if (!vc->kvm->arch.mmu_ready) + return true; + for_each_runnable_thread(i, vcpu, vc) if (signal_pending(vcpu->arch.run_task)) return true; + } return false; } @@ -3119,7 +3129,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) local_irq_disable(); hard_irq_disable(); if (lazy_irq_pending() || need_resched() || - recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { + recheck_signals_and_mmu(&core_info)) { local_irq_enable(); vc->vcore_state = VCORE_INACTIVE; /* Unlock all except the primary vcore */ @@ -3398,7 +3408,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, } if (vc->pcr) - mtspr(SPRN_PCR, vc->pcr); + mtspr(SPRN_PCR, vc->pcr | PCR_MASK); mtspr(SPRN_DPDES, vc->dpdes); mtspr(SPRN_VTB, vc->vtb); @@ -3478,7 +3488,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, vc->vtb = mfspr(SPRN_VTB); mtspr(SPRN_DPDES, 0); if (vc->pcr) - mtspr(SPRN_PCR, 0); + mtspr(SPRN_PCR, PCR_MASK); if (vc->tb_offset_applied) { u64 new_tb = mftb() - vc->tb_offset_applied; @@ -4265,7 +4275,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) user_vrsave = mfspr(SPRN_VRSAVE); vcpu->arch.wqp = &vcpu->arch.vcore->wq; - vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.pgdir = kvm->mm->pgd; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { @@ -4496,6 +4506,29 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) && ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES)) kvmppc_radix_flush_memslot(kvm, old); + /* + * If UV hasn't yet called H_SVM_INIT_START, don't register memslots. + */ + if (!kvm->arch.secure_guest) + return; + + switch (change) { + case KVM_MR_CREATE: + if (kvmppc_uvmem_slot_init(kvm, new)) + return; + uv_register_mem_slot(kvm->arch.lpid, + new->base_gfn << PAGE_SHIFT, + new->npages * PAGE_SIZE, + 0, new->id); + break; + case KVM_MR_DELETE: + uv_unregister_mem_slot(kvm->arch.lpid, old->id); + kvmppc_uvmem_slot_free(kvm, old); + break; + default: + /* TODO: Handle KVM_MR_MOVE */ + break; + } } /* @@ -4597,14 +4630,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) /* Look up the VMA for the start of this memory slot */ hva = memslot->userspace_addr; - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, hva); + down_read(&kvm->mm->mmap_sem); + vma = find_vma(kvm->mm, hva); if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) goto up_out; psize = vma_kernel_pagesize(vma); - up_read(¤t->mm->mmap_sem); + up_read(&kvm->mm->mmap_sem); /* We can handle 4k, 64k or 16M pages in the VRMA */ if (psize >= 0x1000000) @@ -4637,7 +4670,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) return err; up_out: - up_read(¤t->mm->mmap_sem); + up_read(&kvm->mm->mmap_sem); goto out_srcu; } @@ -4769,6 +4802,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) char buf[32]; int ret; + mutex_init(&kvm->arch.uvmem_lock); + INIT_LIST_HEAD(&kvm->arch.uvmem_pfns); mutex_init(&kvm->arch.mmu_setup_lock); /* Allocate the guest's logical partition ID */ @@ -4938,8 +4973,11 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) if (nesting_enabled(kvm)) kvmhv_release_all_nested(kvm); kvm->arch.process_table = 0; + if (kvm->arch.secure_guest) + uv_svm_terminate(kvm->arch.lpid); kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0); } + kvmppc_free_lpid(kvm->arch.lpid); kvmppc_free_pimap(kvm); @@ -5379,6 +5417,94 @@ static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr, return rc; } +static void unpin_vpa_reset(struct kvm *kvm, struct kvmppc_vpa *vpa) +{ + unpin_vpa(kvm, vpa); + vpa->gpa = 0; + vpa->pinned_addr = NULL; + vpa->dirty = false; + vpa->update_pending = 0; +} + +/* + * IOCTL handler to turn off secure mode of guest + * + * - Release all device pages + * - Issue ucall to terminate the guest on the UV side + * - Unpin the VPA pages. + * - Reinit the partition scoped page tables + */ +static int kvmhv_svm_off(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int mmu_was_ready; + int srcu_idx; + int ret = 0; + int i; + + if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) + return ret; + + mutex_lock(&kvm->arch.mmu_setup_lock); + mmu_was_ready = kvm->arch.mmu_ready; + if (kvm->arch.mmu_ready) { + kvm->arch.mmu_ready = 0; + /* order mmu_ready vs. vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.mmu_ready = 1; + ret = -EBUSY; + goto out; + } + } + + srcu_idx = srcu_read_lock(&kvm->srcu); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + struct kvm_memory_slot *memslot; + struct kvm_memslots *slots = __kvm_memslots(kvm, i); + + if (!slots) + continue; + + kvm_for_each_memslot(memslot, slots) { + kvmppc_uvmem_drop_pages(memslot, kvm, true); + uv_unregister_mem_slot(kvm->arch.lpid, memslot->id); + } + } + srcu_read_unlock(&kvm->srcu, srcu_idx); + + ret = uv_svm_terminate(kvm->arch.lpid); + if (ret != U_SUCCESS) { + ret = -EINVAL; + goto out; + } + + /* + * When secure guest is reset, all the guest pages are sent + * to UV via UV_PAGE_IN before the non-boot vcpus get a + * chance to run and unpin their VPA pages. Unpinning of all + * VPA pages is done here explicitly so that VPA pages + * can be migrated to the secure side. + * + * This is required to for the secure SMP guest to reboot + * correctly. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + spin_lock(&vcpu->arch.vpa_update_lock); + unpin_vpa_reset(kvm, &vcpu->arch.dtl); + unpin_vpa_reset(kvm, &vcpu->arch.slb_shadow); + unpin_vpa_reset(kvm, &vcpu->arch.vpa); + spin_unlock(&vcpu->arch.vpa_update_lock); + } + + kvmppc_setup_partition_table(kvm); + kvm->arch.secure_guest = 0; + kvm->arch.mmu_ready = mmu_was_ready; +out: + mutex_unlock(&kvm->arch.mmu_setup_lock); + return ret; +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -5386,6 +5512,7 @@ static struct kvmppc_ops kvm_ops_hv = { .set_one_reg = kvmppc_set_one_reg_hv, .vcpu_load = kvmppc_core_vcpu_load_hv, .vcpu_put = kvmppc_core_vcpu_put_hv, + .inject_interrupt = kvmppc_inject_interrupt_hv, .set_msr = kvmppc_set_msr_hv, .vcpu_run = kvmppc_vcpu_run_hv, .vcpu_create = kvmppc_core_vcpu_create_hv, @@ -5421,6 +5548,7 @@ static struct kvmppc_ops kvm_ops_hv = { .enable_nested = kvmhv_enable_nested, .load_from_eaddr = kvmhv_load_from_eaddr, .store_to_eaddr = kvmhv_store_to_eaddr, + .svm_off = kvmhv_svm_off, }; static int kvm_init_subcore_bitmap(void) @@ -5462,6 +5590,12 @@ static int kvmppc_radix_possible(void) static int kvmppc_book3s_init_hv(void) { int r; + + if (!tlbie_capable) { + pr_err("KVM-HV: Host does not support TLBIE\n"); + return -ENODEV; + } + /* * FIXME!! Do we need to check on all cpus ? */ @@ -5523,11 +5657,16 @@ static int kvmppc_book3s_init_hv(void) no_mixing_hpt_and_radix = true; } + r = kvmppc_uvmem_init(); + if (r < 0) + pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r); + return r; } static void kvmppc_book3s_exit_hv(void) { + kvmppc_uvmem_free(); kvmppc_free_host_rm_ops(); if (kvmppc_radix_possible()) kvmppc_radix_exit(); diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 7c1909657b55..7cd3cf3d366b 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -755,6 +755,71 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) local_paca->kvm_hstate.kvm_split_mode = NULL; } +static void kvmppc_end_cede(struct kvm_vcpu *vcpu) +{ + vcpu->arch.ceded = 0; + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } +} + +void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) +{ + /* + * Check for illegal transactional state bit combination + * and if we find it, force the TS field to a safe state. + */ + if ((msr & MSR_TS_MASK) == MSR_TS_MASK) + msr &= ~MSR_TS_MASK; + vcpu->arch.shregs.msr = msr; + kvmppc_end_cede(vcpu); +} +EXPORT_SYMBOL_GPL(kvmppc_set_msr_hv); + +static void inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags) +{ + unsigned long msr, pc, new_msr, new_pc; + + msr = kvmppc_get_msr(vcpu); + pc = kvmppc_get_pc(vcpu); + new_msr = vcpu->arch.intr_msr; + new_pc = vec; + + /* If transactional, change to suspend mode on IRQ delivery */ + if (MSR_TM_TRANSACTIONAL(msr)) + new_msr |= MSR_TS_S; + else + new_msr |= msr & MSR_TS_MASK; + + /* + * Perform MSR and PC adjustment for LPCR[AIL]=3 if it is set and + * applicable. AIL=2 is not supported. + * + * AIL does not apply to SRESET, MCE, or HMI (which is never + * delivered to the guest), and does not apply if IR=0 or DR=0. + */ + if (vec != BOOK3S_INTERRUPT_SYSTEM_RESET && + vec != BOOK3S_INTERRUPT_MACHINE_CHECK && + (vcpu->arch.vcore->lpcr & LPCR_AIL) == LPCR_AIL_3 && + (msr & (MSR_IR|MSR_DR)) == (MSR_IR|MSR_DR) ) { + new_msr |= MSR_IR | MSR_DR; + new_pc += 0xC000000000004000ULL; + } + + kvmppc_set_srr0(vcpu, pc); + kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags); + kvmppc_set_pc(vcpu, new_pc); + vcpu->arch.shregs.msr = new_msr; +} + +void kvmppc_inject_interrupt_hv(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags) +{ + inject_interrupt(vcpu, vec, srr1_flags); + kvmppc_end_cede(vcpu); +} +EXPORT_SYMBOL_GPL(kvmppc_inject_interrupt_hv); + /* * Is there a PRIV_DOORBELL pending for the guest (on POWER9)? * Can we inject a Decrementer or a External interrupt? @@ -762,7 +827,6 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip) void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu) { int ext; - unsigned long vec = 0; unsigned long lpcr; /* Insert EXTERNAL bit into LPCR at the MER bit position */ @@ -774,26 +838,16 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu) if (vcpu->arch.shregs.msr & MSR_EE) { if (ext) { - vec = BOOK3S_INTERRUPT_EXTERNAL; + inject_interrupt(vcpu, BOOK3S_INTERRUPT_EXTERNAL, 0); } else { long int dec = mfspr(SPRN_DEC); if (!(lpcr & LPCR_LD)) dec = (int) dec; if (dec < 0) - vec = BOOK3S_INTERRUPT_DECREMENTER; + inject_interrupt(vcpu, + BOOK3S_INTERRUPT_DECREMENTER, 0); } } - if (vec) { - unsigned long msr, old_msr = vcpu->arch.shregs.msr; - - kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); - kvmppc_set_srr1(vcpu, old_msr); - kvmppc_set_pc(vcpu, vec); - msr = vcpu->arch.intr_msr; - if (MSR_TM_ACTIVE(old_msr)) - msr |= MSR_TS_S; - vcpu->arch.shregs.msr = msr; - } if (vcpu->arch.doorbell_request) { mtspr(SPRN_DPDES, 1); diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 735e0ac6f5b2..dc97e5be76f6 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -29,7 +29,7 @@ void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - hr->pcr = vc->pcr; + hr->pcr = vc->pcr | PCR_MASK; hr->dpdes = vc->dpdes; hr->hfscr = vcpu->arch.hfscr; hr->tb_offset = vc->tb_offset; @@ -65,7 +65,7 @@ static void byteswap_hv_regs(struct hv_guest_state *hr) hr->lpid = swab32(hr->lpid); hr->vcpu_token = swab32(hr->vcpu_token); hr->lpcr = swab64(hr->lpcr); - hr->pcr = swab64(hr->pcr); + hr->pcr = swab64(hr->pcr) | PCR_MASK; hr->amor = swab64(hr->amor); hr->dpdes = swab64(hr->dpdes); hr->hfscr = swab64(hr->hfscr); @@ -148,7 +148,7 @@ static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - vc->pcr = hr->pcr; + vc->pcr = hr->pcr | PCR_MASK; vc->dpdes = hr->dpdes; vcpu->arch.hfscr = hr->hfscr; vcpu->arch.dawr = hr->dawr0; @@ -398,7 +398,7 @@ static void kvmhv_flush_lpid(unsigned int lpid) long rc; if (!kvmhv_on_pseries()) { - radix__flush_tlb_lpid(lpid); + radix__flush_all_lpid(lpid); return; } @@ -411,7 +411,7 @@ static void kvmhv_flush_lpid(unsigned int lpid) void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1) { if (!kvmhv_on_pseries()) { - mmu_partition_table_set_entry(lpid, dw0, dw1); + mmu_partition_table_set_entry(lpid, dw0, dw1, true); return; } @@ -1186,7 +1186,7 @@ static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu, forward_to_l1: vcpu->arch.fault_dsisr = flags; if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { - vcpu->arch.shregs.msr &= ~0x783f0000ul; + vcpu->arch.shregs.msr &= SRR1_MSR_BITS; vcpu->arch.shregs.msr |= flags; } return RESUME_HOST; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 63e0ce91e29d..220305454c23 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -99,7 +99,7 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } else { rev->forw = rev->back = pte_index; *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | - pte_index | KVMPPC_RMAP_PRESENT; + pte_index | KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_HPT; } unlock_rmap(rmap); } @@ -433,6 +433,37 @@ static inline int is_mmio_hpte(unsigned long v, unsigned long r) (HPTE_R_KEY_HI | HPTE_R_KEY_LO)); } +static inline void fixup_tlbie_lpid(unsigned long rb_value, unsigned long lpid) +{ + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) { + /* Radix flush for a hash guest */ + + unsigned long rb,rs,prs,r,ric; + + rb = PPC_BIT(52); /* IS = 2 */ + rs = 0; /* lpid = 0 */ + prs = 0; /* partition scoped */ + r = 1; /* radix format */ + ric = 0; /* RIC_FLSUH_TLB */ + + /* + * Need the extra ptesync to make sure we don't + * re-order the tlbie + */ + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), + "i"(ric), "r"(rs) : "memory"); + } + + if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) { + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : : + "r" (rb_value), "r" (lpid)); + } +} + static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, long npages, int global, bool need_sync) { @@ -451,16 +482,7 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, "r" (rbvalues[i]), "r" (kvm->arch.lpid)); } - if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) { - /* - * Need the extra ptesync to make sure we don't - * re-order the tlbie - */ - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : : - "r" (rbvalues[0]), "r" (kvm->arch.lpid)); - } - + fixup_tlbie_lpid(rbvalues[i - 1], kvm->arch.lpid); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } else { if (need_sync) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 4d2ec77d806c..287d5911df0f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -58,7 +58,7 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) hcpu = hcore << threads_shift; kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); - kvmppc_set_host_ipi(hcpu, 1); + kvmppc_set_host_ipi(hcpu); smp_mb(); kvmhv_rm_send_ipi(hcpu); } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 337e64468d78..dbc2fecc37f0 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -11,6 +11,7 @@ */ #include <asm/ppc_asm.h> +#include <asm/code-patching-asm.h> #include <asm/kvm_asm.h> #include <asm/reg.h> #include <asm/mmu.h> @@ -29,6 +30,7 @@ #include <asm/asm-compat.h> #include <asm/feature-fixups.h> #include <asm/cpuidle.h> +#include <asm/ultravisor-api.h> /* Sign-extend HDEC if not on POWER9 */ #define EXTEND_HDEC(reg) \ @@ -643,8 +645,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* Load guest PCR value to select appropriate compat mode */ 37: ld r7, VCORE_PCR(r5) - cmpdi r7, 0 + LOAD_REG_IMMEDIATE(r6, PCR_MASK) + cmpld r7, r6 beq 38f + or r7, r7, r6 mtspr SPRN_PCR, r7 38: @@ -942,6 +946,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ld r11, VCPU_XIVE_SAVED_STATE(r4) li r9, TM_QW1_OS lwz r8, VCPU_XIVE_CAM_WORD(r4) + cmpwi r8, 0 + beq no_xive li r7, TM_QW1_OS + TM_WORD2 mfmsr r0 andi. r0, r0, MSR_DR /* in real mode? */ @@ -1083,16 +1089,10 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r5, VCPU_LR(r4) - ld r6, VCPU_CR(r4) mtlr r5 - mtcr r6 ld r1, VCPU_GPR(R1)(r4) - ld r2, VCPU_GPR(R2)(r4) - ld r3, VCPU_GPR(R3)(r4) ld r5, VCPU_GPR(R5)(r4) - ld r6, VCPU_GPR(R6)(r4) - ld r7, VCPU_GPR(R7)(r4) ld r8, VCPU_GPR(R8)(r4) ld r9, VCPU_GPR(R9)(r4) ld r10, VCPU_GPR(R10)(r4) @@ -1110,10 +1110,42 @@ BEGIN_FTR_SECTION mtspr SPRN_HDSISR, r0 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + ld r6, VCPU_KVM(r4) + lbz r7, KVM_SECURE_GUEST(r6) + cmpdi r7, 0 + ld r6, VCPU_GPR(R6)(r4) + ld r7, VCPU_GPR(R7)(r4) + bne ret_to_ultra + + ld r0, VCPU_CR(r4) + mtcr r0 + ld r0, VCPU_GPR(R0)(r4) + ld r2, VCPU_GPR(R2)(r4) + ld r3, VCPU_GPR(R3)(r4) ld r4, VCPU_GPR(R4)(r4) HRFI_TO_GUEST b . +/* + * Use UV_RETURN ultracall to return control back to the Ultravisor after + * processing an hypercall or interrupt that was forwarded (a.k.a. reflected) + * to the Hypervisor. + * + * All registers have already been loaded, except: + * R0 = hcall result + * R2 = SRR1, so UV can detect a synthesized interrupt (if any) + * R3 = UV_RETURN + */ +ret_to_ultra: + ld r0, VCPU_CR(r4) + mtcr r0 + + ld r0, VCPU_GPR(R3)(r4) + mfspr r2, SPRN_SRR1 + li r3, 0 + ori r3, r3, UV_RETURN + ld r4, VCPU_GPR(R4)(r4) + sc 2 /* * Enter the guest on a P9 or later system where we have exactly @@ -1456,6 +1488,13 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 1: #endif /* CONFIG_KVM_XICS */ + /* + * Possibly flush the link stack here, before we do a blr in + * guest_exit_short_path. + */ +1: nop + patch_site 1b patch__call_kvm_flush_link_stack + /* If we came in through the P9 short path, go back out to C now */ lwz r0, STACK_SLOT_SHORT_PATH(r1) cmpwi r0, 0 @@ -1762,6 +1801,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) tlbsync ptesync +BEGIN_FTR_SECTION /* Radix: Handle the case where the guest used an illegal PID */ LOAD_REG_ADDR(r4, mmu_base_pid) lwz r3, VCPU_GUEST_PID(r9) @@ -1791,6 +1831,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) addi r7,r7,0x1000 bdnz 1b ptesync +END_FTR_SECTION_IFSET(CPU_FTR_P9_RADIX_PREFETCH_BUG) 2: #endif /* CONFIG_PPC_RADIX_MMU */ @@ -1884,12 +1925,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Reset PCR */ ld r0, VCORE_PCR(r5) - cmpdi r0, 0 + LOAD_REG_IMMEDIATE(r6, PCR_MASK) + cmpld r0, r6 beq 18f - li r0, 0 - mtspr SPRN_PCR, r0 + mtspr SPRN_PCR, r6 18: /* Signal secondary CPUs to continue */ + li r0, 0 stb r0,VCORE_IN_GUEST(r5) 19: lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 @@ -1931,6 +1973,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mtlr r0 blr +.balign 32 +.global kvm_flush_link_stack +kvm_flush_link_stack: + /* Save LR into r0 */ + mflr r0 + + /* Flush the link stack. On Power8 it's up to 32 entries in size. */ + .rept 32 + bl .+4 + .endr + + /* And on Power9 it's up to 64. */ +BEGIN_FTR_SECTION + .rept 32 + bl .+4 + .endr +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + + /* Restore LR */ + mtlr r0 + blr + kvmppc_guest_external: /* External interrupt, first check for host_ipi. If this is * set, we know the host wants us out so let's do it now @@ -2831,29 +2895,39 @@ kvm_cede_prodded: kvm_cede_exit: ld r9, HSTATE_KVM_VCPU(r13) #ifdef CONFIG_KVM_XICS - /* Abort if we still have a pending escalation */ + /* are we using XIVE with single escalation? */ + ld r10, VCPU_XIVE_ESC_VADDR(r9) + cmpdi r10, 0 + beq 3f + li r6, XIVE_ESB_SET_PQ_00 + /* + * If we still have a pending escalation, abort the cede, + * and we must set PQ to 10 rather than 00 so that we don't + * potentially end up with two entries for the escalation + * interrupt in the XIVE interrupt queue. In that case + * we also don't want to set xive_esc_on to 1 here in + * case we race with xive_esc_irq(). + */ lbz r5, VCPU_XIVE_ESC_ON(r9) cmpwi r5, 0 - beq 1f + beq 4f li r0, 0 stb r0, VCPU_CEDED(r9) -1: /* Enable XIVE escalation */ - li r5, XIVE_ESB_SET_PQ_00 + li r6, XIVE_ESB_SET_PQ_10 + b 5f +4: li r0, 1 + stb r0, VCPU_XIVE_ESC_ON(r9) + /* make sure store to xive_esc_on is seen before xive_esc_irq runs */ + sync +5: /* Enable XIVE escalation */ mfmsr r0 andi. r0, r0, MSR_DR /* in real mode? */ beq 1f - ld r10, VCPU_XIVE_ESC_VADDR(r9) - cmpdi r10, 0 - beq 3f - ldx r0, r10, r5 + ldx r0, r10, r6 b 2f 1: ld r10, VCPU_XIVE_ESC_RADDR(r9) - cmpdi r10, 0 - beq 3f - ldcix r0, r10, r5 + ldcix r0, r10, r6 2: sync - li r0, 1 - stb r0, VCPU_XIVE_ESC_ON(r9) #endif /* CONFIG_KVM_XICS */ 3: b guest_exit_cont diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c new file mode 100644 index 000000000000..79b1202b1c62 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -0,0 +1,813 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Secure pages management: Migration of pages between normal and secure + * memory of KVM guests. + * + * Copyright 2018 Bharata B Rao, IBM Corp. <bharata@linux.ibm.com> + */ + +/* + * A pseries guest can be run as secure guest on Ultravisor-enabled + * POWER platforms. On such platforms, this driver will be used to manage + * the movement of guest pages between the normal memory managed by + * hypervisor (HV) and secure memory managed by Ultravisor (UV). + * + * The page-in or page-out requests from UV will come to HV as hcalls and + * HV will call back into UV via ultracalls to satisfy these page requests. + * + * Private ZONE_DEVICE memory equal to the amount of secure memory + * available in the platform for running secure guests is hotplugged. + * Whenever a page belonging to the guest becomes secure, a page from this + * private device memory is used to represent and track that secure page + * on the HV side. Some pages (like virtio buffers, VPA pages etc) are + * shared between UV and HV. However such pages aren't represented by + * device private memory and mappings to shared memory exist in both + * UV and HV page tables. + */ + +/* + * Notes on locking + * + * kvm->arch.uvmem_lock is a per-guest lock that prevents concurrent + * page-in and page-out requests for the same GPA. Concurrent accesses + * can either come via UV (guest vCPUs requesting for same page) + * or when HV and guest simultaneously access the same page. + * This mutex serializes the migration of page from HV(normal) to + * UV(secure) and vice versa. So the serialization points are around + * migrate_vma routines and page-in/out routines. + * + * Per-guest mutex comes with a cost though. Mainly it serializes the + * fault path as page-out can occur when HV faults on accessing secure + * guest pages. Currently UV issues page-in requests for all the guest + * PFNs one at a time during early boot (UV_ESM uvcall), so this is + * not a cause for concern. Also currently the number of page-outs caused + * by HV touching secure pages is very very low. If an when UV supports + * overcommitting, then we might see concurrent guest driven page-outs. + * + * Locking order + * + * 1. kvm->srcu - Protects KVM memslots + * 2. kvm->mm->mmap_sem - find_vma, migrate_vma_pages and helpers, ksm_madvise + * 3. kvm->arch.uvmem_lock - protects read/writes to uvmem slots thus acting + * as sync-points for page-in/out + */ + +/* + * Notes on page size + * + * Currently UV uses 2MB mappings internally, but will issue H_SVM_PAGE_IN + * and H_SVM_PAGE_OUT hcalls in PAGE_SIZE(64K) granularity. HV tracks + * secure GPAs at 64K page size and maintains one device PFN for each + * 64K secure GPA. UV_PAGE_IN and UV_PAGE_OUT calls by HV are also issued + * for 64K page at a time. + * + * HV faulting on secure pages: When HV touches any secure page, it + * faults and issues a UV_PAGE_OUT request with 64K page size. Currently + * UV splits and remaps the 2MB page if necessary and copies out the + * required 64K page contents. + * + * Shared pages: Whenever guest shares a secure page, UV will split and + * remap the 2MB page if required and issue H_SVM_PAGE_IN with 64K page size. + * + * HV invalidating a page: When a regular page belonging to secure + * guest gets unmapped, HV informs UV with UV_PAGE_INVAL of 64K + * page size. Using 64K page size is correct here because any non-secure + * page will essentially be of 64K page size. Splitting by UV during sharing + * and page-out ensures this. + * + * Page fault handling: When HV handles page fault of a page belonging + * to secure guest, it sends that to UV with a 64K UV_PAGE_IN request. + * Using 64K size is correct here too as UV would have split the 2MB page + * into 64k mappings and would have done page-outs earlier. + * + * In summary, the current secure pages handling code in HV assumes + * 64K page size and in fact fails any page-in/page-out requests of + * non-64K size upfront. If and when UV starts supporting multiple + * page-sizes, we need to break this assumption. + */ + +#include <linux/pagemap.h> +#include <linux/migrate.h> +#include <linux/kvm_host.h> +#include <linux/ksm.h> +#include <asm/ultravisor.h> +#include <asm/mman.h> +#include <asm/kvm_ppc.h> + +static struct dev_pagemap kvmppc_uvmem_pgmap; +static unsigned long *kvmppc_uvmem_bitmap; +static DEFINE_SPINLOCK(kvmppc_uvmem_bitmap_lock); + +#define KVMPPC_UVMEM_PFN (1UL << 63) + +struct kvmppc_uvmem_slot { + struct list_head list; + unsigned long nr_pfns; + unsigned long base_pfn; + unsigned long *pfns; +}; + +struct kvmppc_uvmem_page_pvt { + struct kvm *kvm; + unsigned long gpa; + bool skip_page_out; +}; + +int kvmppc_uvmem_slot_init(struct kvm *kvm, const struct kvm_memory_slot *slot) +{ + struct kvmppc_uvmem_slot *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + p->pfns = vzalloc(array_size(slot->npages, sizeof(*p->pfns))); + if (!p->pfns) { + kfree(p); + return -ENOMEM; + } + p->nr_pfns = slot->npages; + p->base_pfn = slot->base_gfn; + + mutex_lock(&kvm->arch.uvmem_lock); + list_add(&p->list, &kvm->arch.uvmem_pfns); + mutex_unlock(&kvm->arch.uvmem_lock); + + return 0; +} + +/* + * All device PFNs are already released by the time we come here. + */ +void kvmppc_uvmem_slot_free(struct kvm *kvm, const struct kvm_memory_slot *slot) +{ + struct kvmppc_uvmem_slot *p, *next; + + mutex_lock(&kvm->arch.uvmem_lock); + list_for_each_entry_safe(p, next, &kvm->arch.uvmem_pfns, list) { + if (p->base_pfn == slot->base_gfn) { + vfree(p->pfns); + list_del(&p->list); + kfree(p); + break; + } + } + mutex_unlock(&kvm->arch.uvmem_lock); +} + +static void kvmppc_uvmem_pfn_insert(unsigned long gfn, unsigned long uvmem_pfn, + struct kvm *kvm) +{ + struct kvmppc_uvmem_slot *p; + + list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) { + if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) { + unsigned long index = gfn - p->base_pfn; + + p->pfns[index] = uvmem_pfn | KVMPPC_UVMEM_PFN; + return; + } + } +} + +static void kvmppc_uvmem_pfn_remove(unsigned long gfn, struct kvm *kvm) +{ + struct kvmppc_uvmem_slot *p; + + list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) { + if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) { + p->pfns[gfn - p->base_pfn] = 0; + return; + } + } +} + +static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm, + unsigned long *uvmem_pfn) +{ + struct kvmppc_uvmem_slot *p; + + list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) { + if (gfn >= p->base_pfn && gfn < p->base_pfn + p->nr_pfns) { + unsigned long index = gfn - p->base_pfn; + + if (p->pfns[index] & KVMPPC_UVMEM_PFN) { + if (uvmem_pfn) + *uvmem_pfn = p->pfns[index] & + ~KVMPPC_UVMEM_PFN; + return true; + } else + return false; + } + } + return false; +} + +unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int ret = H_SUCCESS; + int srcu_idx; + + if (!kvmppc_uvmem_bitmap) + return H_UNSUPPORTED; + + /* Only radix guests can be secure guests */ + if (!kvm_is_radix(kvm)) + return H_UNSUPPORTED; + + srcu_idx = srcu_read_lock(&kvm->srcu); + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + if (kvmppc_uvmem_slot_init(kvm, memslot)) { + ret = H_PARAMETER; + goto out; + } + ret = uv_register_mem_slot(kvm->arch.lpid, + memslot->base_gfn << PAGE_SHIFT, + memslot->npages * PAGE_SIZE, + 0, memslot->id); + if (ret < 0) { + kvmppc_uvmem_slot_free(kvm, memslot); + ret = H_PARAMETER; + goto out; + } + } + kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_START; +out: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return ret; +} + +unsigned long kvmppc_h_svm_init_done(struct kvm *kvm) +{ + if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) + return H_UNSUPPORTED; + + kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_DONE; + pr_info("LPID %d went secure\n", kvm->arch.lpid); + return H_SUCCESS; +} + +/* + * Drop device pages that we maintain for the secure guest + * + * We first mark the pages to be skipped from UV_PAGE_OUT when there + * is HV side fault on these pages. Next we *get* these pages, forcing + * fault on them, do fault time migration to replace the device PTEs in + * QEMU page table with normal PTEs from newly allocated pages. + */ +void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *free, + struct kvm *kvm, bool skip_page_out) +{ + int i; + struct kvmppc_uvmem_page_pvt *pvt; + unsigned long pfn, uvmem_pfn; + unsigned long gfn = free->base_gfn; + + for (i = free->npages; i; --i, ++gfn) { + struct page *uvmem_page; + + mutex_lock(&kvm->arch.uvmem_lock); + if (!kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) { + mutex_unlock(&kvm->arch.uvmem_lock); + continue; + } + + uvmem_page = pfn_to_page(uvmem_pfn); + pvt = uvmem_page->zone_device_data; + pvt->skip_page_out = skip_page_out; + mutex_unlock(&kvm->arch.uvmem_lock); + + pfn = gfn_to_pfn(kvm, gfn); + if (is_error_noslot_pfn(pfn)) + continue; + kvm_release_pfn_clean(pfn); + } +} + +unsigned long kvmppc_h_svm_init_abort(struct kvm *kvm) +{ + int srcu_idx; + struct kvm_memory_slot *memslot; + + /* + * Expect to be called only after INIT_START and before INIT_DONE. + * If INIT_DONE was completed, use normal VM termination sequence. + */ + if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) + return H_UNSUPPORTED; + + if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) + return H_STATE; + + srcu_idx = srcu_read_lock(&kvm->srcu); + + kvm_for_each_memslot(memslot, kvm_memslots(kvm)) + kvmppc_uvmem_drop_pages(memslot, kvm, false); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + kvm->arch.secure_guest = 0; + uv_svm_terminate(kvm->arch.lpid); + + return H_PARAMETER; +} + +/* + * Get a free device PFN from the pool + * + * Called when a normal page is moved to secure memory (UV_PAGE_IN). Device + * PFN will be used to keep track of the secure page on HV side. + * + * Called with kvm->arch.uvmem_lock held + */ +static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) +{ + struct page *dpage = NULL; + unsigned long bit, uvmem_pfn; + struct kvmppc_uvmem_page_pvt *pvt; + unsigned long pfn_last, pfn_first; + + pfn_first = kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT; + pfn_last = pfn_first + + (resource_size(&kvmppc_uvmem_pgmap.res) >> PAGE_SHIFT); + + spin_lock(&kvmppc_uvmem_bitmap_lock); + bit = find_first_zero_bit(kvmppc_uvmem_bitmap, + pfn_last - pfn_first); + if (bit >= (pfn_last - pfn_first)) + goto out; + bitmap_set(kvmppc_uvmem_bitmap, bit, 1); + spin_unlock(&kvmppc_uvmem_bitmap_lock); + + pvt = kzalloc(sizeof(*pvt), GFP_KERNEL); + if (!pvt) + goto out_clear; + + uvmem_pfn = bit + pfn_first; + kvmppc_uvmem_pfn_insert(gpa >> PAGE_SHIFT, uvmem_pfn, kvm); + + pvt->gpa = gpa; + pvt->kvm = kvm; + + dpage = pfn_to_page(uvmem_pfn); + dpage->zone_device_data = pvt; + get_page(dpage); + lock_page(dpage); + return dpage; +out_clear: + spin_lock(&kvmppc_uvmem_bitmap_lock); + bitmap_clear(kvmppc_uvmem_bitmap, bit, 1); +out: + spin_unlock(&kvmppc_uvmem_bitmap_lock); + return NULL; +} + +/* + * Alloc a PFN from private device memory pool and copy page from normal + * memory to secure memory using UV_PAGE_IN uvcall. + */ +static int +kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long gpa, struct kvm *kvm, + unsigned long page_shift, bool *downgrade) +{ + unsigned long src_pfn, dst_pfn = 0; + struct migrate_vma mig; + struct page *spage; + unsigned long pfn; + struct page *dpage; + int ret = 0; + + memset(&mig, 0, sizeof(mig)); + mig.vma = vma; + mig.start = start; + mig.end = end; + mig.src = &src_pfn; + mig.dst = &dst_pfn; + + /* + * We come here with mmap_sem write lock held just for + * ksm_madvise(), otherwise we only need read mmap_sem. + * Hence downgrade to read lock once ksm_madvise() is done. + */ + ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, + MADV_UNMERGEABLE, &vma->vm_flags); + downgrade_write(&kvm->mm->mmap_sem); + *downgrade = true; + if (ret) + return ret; + + ret = migrate_vma_setup(&mig); + if (ret) + return ret; + + if (!(*mig.src & MIGRATE_PFN_MIGRATE)) { + ret = -1; + goto out_finalize; + } + + dpage = kvmppc_uvmem_get_page(gpa, kvm); + if (!dpage) { + ret = -1; + goto out_finalize; + } + + pfn = *mig.src >> MIGRATE_PFN_SHIFT; + spage = migrate_pfn_to_page(*mig.src); + if (spage) + uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0, + page_shift); + + *mig.dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; + migrate_vma_pages(&mig); +out_finalize: + migrate_vma_finalize(&mig); + return ret; +} + +/* + * Shares the page with HV, thus making it a normal page. + * + * - If the page is already secure, then provision a new page and share + * - If the page is a normal page, share the existing page + * + * In the former case, uses dev_pagemap_ops.migrate_to_ram handler + * to unmap the device page from QEMU's page tables. + */ +static unsigned long +kvmppc_share_page(struct kvm *kvm, unsigned long gpa, unsigned long page_shift) +{ + + int ret = H_PARAMETER; + struct page *uvmem_page; + struct kvmppc_uvmem_page_pvt *pvt; + unsigned long pfn; + unsigned long gfn = gpa >> page_shift; + int srcu_idx; + unsigned long uvmem_pfn; + + srcu_idx = srcu_read_lock(&kvm->srcu); + mutex_lock(&kvm->arch.uvmem_lock); + if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) { + uvmem_page = pfn_to_page(uvmem_pfn); + pvt = uvmem_page->zone_device_data; + pvt->skip_page_out = true; + } + +retry: + mutex_unlock(&kvm->arch.uvmem_lock); + pfn = gfn_to_pfn(kvm, gfn); + if (is_error_noslot_pfn(pfn)) + goto out; + + mutex_lock(&kvm->arch.uvmem_lock); + if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, &uvmem_pfn)) { + uvmem_page = pfn_to_page(uvmem_pfn); + pvt = uvmem_page->zone_device_data; + pvt->skip_page_out = true; + kvm_release_pfn_clean(pfn); + goto retry; + } + + if (!uv_page_in(kvm->arch.lpid, pfn << page_shift, gpa, 0, page_shift)) + ret = H_SUCCESS; + kvm_release_pfn_clean(pfn); + mutex_unlock(&kvm->arch.uvmem_lock); +out: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return ret; +} + +/* + * H_SVM_PAGE_IN: Move page from normal memory to secure memory. + * + * H_PAGE_IN_SHARED flag makes the page shared which means that the same + * memory in is visible from both UV and HV. + */ +unsigned long +kvmppc_h_svm_page_in(struct kvm *kvm, unsigned long gpa, + unsigned long flags, unsigned long page_shift) +{ + bool downgrade = false; + unsigned long start, end; + struct vm_area_struct *vma; + int srcu_idx; + unsigned long gfn = gpa >> page_shift; + int ret; + + if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) + return H_UNSUPPORTED; + + if (page_shift != PAGE_SHIFT) + return H_P3; + + if (flags & ~H_PAGE_IN_SHARED) + return H_P2; + + if (flags & H_PAGE_IN_SHARED) + return kvmppc_share_page(kvm, gpa, page_shift); + + ret = H_PARAMETER; + srcu_idx = srcu_read_lock(&kvm->srcu); + down_write(&kvm->mm->mmap_sem); + + start = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(start)) + goto out; + + mutex_lock(&kvm->arch.uvmem_lock); + /* Fail the page-in request of an already paged-in page */ + if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL)) + goto out_unlock; + + end = start + (1UL << page_shift); + vma = find_vma_intersection(kvm->mm, start, end); + if (!vma || vma->vm_start > start || vma->vm_end < end) + goto out_unlock; + + if (!kvmppc_svm_page_in(vma, start, end, gpa, kvm, page_shift, + &downgrade)) + ret = H_SUCCESS; +out_unlock: + mutex_unlock(&kvm->arch.uvmem_lock); +out: + if (downgrade) + up_read(&kvm->mm->mmap_sem); + else + up_write(&kvm->mm->mmap_sem); + srcu_read_unlock(&kvm->srcu, srcu_idx); + return ret; +} + +/* + * Provision a new page on HV side and copy over the contents + * from secure memory using UV_PAGE_OUT uvcall. + */ +static int +kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned long page_shift, + struct kvm *kvm, unsigned long gpa) +{ + unsigned long src_pfn, dst_pfn = 0; + struct migrate_vma mig; + struct page *dpage, *spage; + struct kvmppc_uvmem_page_pvt *pvt; + unsigned long pfn; + int ret = U_SUCCESS; + + memset(&mig, 0, sizeof(mig)); + mig.vma = vma; + mig.start = start; + mig.end = end; + mig.src = &src_pfn; + mig.dst = &dst_pfn; + + mutex_lock(&kvm->arch.uvmem_lock); + /* The requested page is already paged-out, nothing to do */ + if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL)) + goto out; + + ret = migrate_vma_setup(&mig); + if (ret) + goto out; + + spage = migrate_pfn_to_page(*mig.src); + if (!spage || !(*mig.src & MIGRATE_PFN_MIGRATE)) + goto out_finalize; + + if (!is_zone_device_page(spage)) + goto out_finalize; + + dpage = alloc_page_vma(GFP_HIGHUSER, vma, start); + if (!dpage) { + ret = -1; + goto out_finalize; + } + + lock_page(dpage); + pvt = spage->zone_device_data; + pfn = page_to_pfn(dpage); + + /* + * This function is used in two cases: + * - When HV touches a secure page, for which we do UV_PAGE_OUT + * - When a secure page is converted to shared page, we *get* + * the page to essentially unmap the device page. In this + * case we skip page-out. + */ + if (!pvt->skip_page_out) + ret = uv_page_out(kvm->arch.lpid, pfn << page_shift, + gpa, 0, page_shift); + + if (ret == U_SUCCESS) + *mig.dst = migrate_pfn(pfn) | MIGRATE_PFN_LOCKED; + else { + unlock_page(dpage); + __free_page(dpage); + goto out_finalize; + } + + migrate_vma_pages(&mig); +out_finalize: + migrate_vma_finalize(&mig); +out: + mutex_unlock(&kvm->arch.uvmem_lock); + return ret; +} + +/* + * Fault handler callback that gets called when HV touches any page that + * has been moved to secure memory, we ask UV to give back the page by + * issuing UV_PAGE_OUT uvcall. + * + * This eventually results in dropping of device PFN and the newly + * provisioned page/PFN gets populated in QEMU page tables. + */ +static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) +{ + struct kvmppc_uvmem_page_pvt *pvt = vmf->page->zone_device_data; + + if (kvmppc_svm_page_out(vmf->vma, vmf->address, + vmf->address + PAGE_SIZE, PAGE_SHIFT, + pvt->kvm, pvt->gpa)) + return VM_FAULT_SIGBUS; + else + return 0; +} + +/* + * Release the device PFN back to the pool + * + * Gets called when secure page becomes a normal page during H_SVM_PAGE_OUT. + * Gets called with kvm->arch.uvmem_lock held. + */ +static void kvmppc_uvmem_page_free(struct page *page) +{ + unsigned long pfn = page_to_pfn(page) - + (kvmppc_uvmem_pgmap.res.start >> PAGE_SHIFT); + struct kvmppc_uvmem_page_pvt *pvt; + + spin_lock(&kvmppc_uvmem_bitmap_lock); + bitmap_clear(kvmppc_uvmem_bitmap, pfn, 1); + spin_unlock(&kvmppc_uvmem_bitmap_lock); + + pvt = page->zone_device_data; + page->zone_device_data = NULL; + kvmppc_uvmem_pfn_remove(pvt->gpa >> PAGE_SHIFT, pvt->kvm); + kfree(pvt); +} + +static const struct dev_pagemap_ops kvmppc_uvmem_ops = { + .page_free = kvmppc_uvmem_page_free, + .migrate_to_ram = kvmppc_uvmem_migrate_to_ram, +}; + +/* + * H_SVM_PAGE_OUT: Move page from secure memory to normal memory. + */ +unsigned long +kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa, + unsigned long flags, unsigned long page_shift) +{ + unsigned long gfn = gpa >> page_shift; + unsigned long start, end; + struct vm_area_struct *vma; + int srcu_idx; + int ret; + + if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)) + return H_UNSUPPORTED; + + if (page_shift != PAGE_SHIFT) + return H_P3; + + if (flags) + return H_P2; + + ret = H_PARAMETER; + srcu_idx = srcu_read_lock(&kvm->srcu); + down_read(&kvm->mm->mmap_sem); + start = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(start)) + goto out; + + end = start + (1UL << page_shift); + vma = find_vma_intersection(kvm->mm, start, end); + if (!vma || vma->vm_start > start || vma->vm_end < end) + goto out; + + if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa)) + ret = H_SUCCESS; +out: + up_read(&kvm->mm->mmap_sem); + srcu_read_unlock(&kvm->srcu, srcu_idx); + return ret; +} + +int kvmppc_send_page_to_uv(struct kvm *kvm, unsigned long gfn) +{ + unsigned long pfn; + int ret = U_SUCCESS; + + pfn = gfn_to_pfn(kvm, gfn); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; + + mutex_lock(&kvm->arch.uvmem_lock); + if (kvmppc_gfn_is_uvmem_pfn(gfn, kvm, NULL)) + goto out; + + ret = uv_page_in(kvm->arch.lpid, pfn << PAGE_SHIFT, gfn << PAGE_SHIFT, + 0, PAGE_SHIFT); +out: + kvm_release_pfn_clean(pfn); + mutex_unlock(&kvm->arch.uvmem_lock); + return (ret == U_SUCCESS) ? RESUME_GUEST : -EFAULT; +} + +static u64 kvmppc_get_secmem_size(void) +{ + struct device_node *np; + int i, len; + const __be32 *prop; + u64 size = 0; + + np = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware"); + if (!np) + goto out; + + prop = of_get_property(np, "secure-memory-ranges", &len); + if (!prop) + goto out_put; + + for (i = 0; i < len / (sizeof(*prop) * 4); i++) + size += of_read_number(prop + (i * 4) + 2, 2); + +out_put: + of_node_put(np); +out: + return size; +} + +int kvmppc_uvmem_init(void) +{ + int ret = 0; + unsigned long size; + struct resource *res; + void *addr; + unsigned long pfn_last, pfn_first; + + size = kvmppc_get_secmem_size(); + if (!size) { + /* + * Don't fail the initialization of kvm-hv module if + * the platform doesn't export ibm,uv-firmware node. + * Let normal guests run on such PEF-disabled platform. + */ + pr_info("KVMPPC-UVMEM: No support for secure guests\n"); + goto out; + } + + res = request_free_mem_region(&iomem_resource, size, "kvmppc_uvmem"); + if (IS_ERR(res)) { + ret = PTR_ERR(res); + goto out; + } + + kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE; + kvmppc_uvmem_pgmap.res = *res; + kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops; + addr = memremap_pages(&kvmppc_uvmem_pgmap, NUMA_NO_NODE); + if (IS_ERR(addr)) { + ret = PTR_ERR(addr); + goto out_free_region; + } + + pfn_first = res->start >> PAGE_SHIFT; + pfn_last = pfn_first + (resource_size(res) >> PAGE_SHIFT); + kvmppc_uvmem_bitmap = kcalloc(BITS_TO_LONGS(pfn_last - pfn_first), + sizeof(unsigned long), GFP_KERNEL); + if (!kvmppc_uvmem_bitmap) { + ret = -ENOMEM; + goto out_unmap; + } + + pr_info("KVMPPC-UVMEM: Secure Memory size 0x%lx\n", size); + return ret; +out_unmap: + memunmap_pages(&kvmppc_uvmem_pgmap); +out_free_region: + release_mem_region(res->start, size); +out: + return ret; +} + +void kvmppc_uvmem_free(void) +{ + memunmap_pages(&kvmppc_uvmem_pgmap); + release_mem_region(kvmppc_uvmem_pgmap.res.start, + resource_size(&kvmppc_uvmem_pgmap.res)); + kfree(kvmppc_uvmem_bitmap); +} diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index cc65af8fe6f7..729a0f12a752 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -90,7 +90,43 @@ static void kvmppc_fixup_split_real(struct kvm_vcpu *vcpu) kvmppc_set_pc(vcpu, pc | SPLIT_HACK_OFFS); } -void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu); +static void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) { + ulong pc = kvmppc_get_pc(vcpu); + ulong lr = kvmppc_get_lr(vcpu); + if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) + kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK); + if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) + kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK); + vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK; + } +} + +static void kvmppc_inject_interrupt_pr(struct kvm_vcpu *vcpu, int vec, u64 srr1_flags) +{ + unsigned long msr, pc, new_msr, new_pc; + + kvmppc_unfixup_split_real(vcpu); + + msr = kvmppc_get_msr(vcpu); + pc = kvmppc_get_pc(vcpu); + new_msr = vcpu->arch.intr_msr; + new_pc = to_book3s(vcpu)->hior + vec; + +#ifdef CONFIG_PPC_BOOK3S_64 + /* If transactional, change to suspend mode on IRQ delivery */ + if (MSR_TM_TRANSACTIONAL(msr)) + new_msr |= MSR_TS_S; + else + new_msr |= msr & MSR_TS_MASK; +#endif + + kvmppc_set_srr0(vcpu, pc); + kvmppc_set_srr1(vcpu, (msr & SRR1_MSR_BITS) | srr1_flags); + kvmppc_set_pc(vcpu, new_pc); + kvmppc_set_msr(vcpu, new_msr); +} static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) { @@ -1708,21 +1744,17 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, return r; } -static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_pr(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu_book3s; - struct kvm_vcpu *vcpu; - int err = -ENOMEM; unsigned long p; + int err; - vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu) - goto out; + err = -ENOMEM; vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); if (!vcpu_book3s) - goto free_vcpu; + goto out; vcpu->arch.book3s = vcpu_book3s; #ifdef CONFIG_KVM_BOOK3S_32_HANDLER @@ -1732,14 +1764,9 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, goto free_vcpu3s; #endif - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_shadow_vcpu; - - err = -ENOMEM; p = __get_free_page(GFP_KERNEL|__GFP_ZERO); if (!p) - goto uninit_vcpu; + goto free_shadow_vcpu; vcpu->arch.shared = (void *)p; #ifdef CONFIG_PPC_BOOK3S_64 /* Always start the shared struct in native endian mode */ @@ -1761,6 +1788,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, #else /* default to book3s_32 (750) */ vcpu->arch.pvr = 0x84202; + vcpu->arch.intr_msr = 0; #endif kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr); vcpu->arch.slb_nr = 64; @@ -1769,22 +1797,20 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, err = kvmppc_mmu_init(vcpu); if (err < 0) - goto uninit_vcpu; + goto free_shared_page; - return vcpu; + return 0; -uninit_vcpu: - kvm_vcpu_uninit(vcpu); +free_shared_page: + free_page((unsigned long)vcpu->arch.shared); free_shadow_vcpu: #ifdef CONFIG_KVM_BOOK3S_32_HANDLER kfree(vcpu->arch.shadow_vcpu); free_vcpu3s: #endif vfree(vcpu_book3s); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu); out: - return ERR_PTR(err); + return err; } static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) @@ -1792,12 +1818,10 @@ static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); - kvm_vcpu_uninit(vcpu); #ifdef CONFIG_KVM_BOOK3S_32_HANDLER kfree(vcpu->arch.shadow_vcpu); #endif vfree(vcpu_book3s); - kmem_cache_free(kvm_vcpu_cache, vcpu); } static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) @@ -1993,6 +2017,7 @@ static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, { /* We should not get called */ BUG(); + return 0; } #endif /* CONFIG_PPC64 */ @@ -2058,6 +2083,7 @@ static struct kvmppc_ops kvm_ops_pr = { .set_one_reg = kvmppc_set_one_reg_pr, .vcpu_load = kvmppc_core_vcpu_load_pr, .vcpu_put = kvmppc_core_vcpu_put_pr, + .inject_interrupt = kvmppc_inject_interrupt_pr, .set_msr = kvmppc_set_msr_pr, .vcpu_run = kvmppc_vcpu_run_pr, .vcpu_create = kvmppc_core_vcpu_create_pr, diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index e3ba67095895..85215e79db42 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -67,8 +67,14 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; u64 pq; - if (!tima) + /* + * Nothing to do if the platform doesn't have a XIVE + * or this vCPU doesn't have its own XIVE context + * (e.g. because it's not using an in-kernel interrupt controller). + */ + if (!tima || !vcpu->arch.xive_cam_word) return; + eieio(); __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); @@ -160,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data) */ vcpu->arch.xive_esc_on = false; + /* This orders xive_esc_on = false vs. subsequent stale_p = true */ + smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */ + return IRQ_HANDLED; } @@ -475,7 +484,7 @@ static void xive_finish_unmask(struct kvmppc_xive *xive, kvmppc_xive_select_irq(state, &hw_num, &xd); /* - * See command in xive_lock_and_mask() concerning masking + * See comment in xive_lock_and_mask() concerning masking * via firmware. */ if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) { @@ -1113,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.xive_esc_raddr = 0; } +/* + * In single escalation mode, the escalation interrupt is marked so + * that EOI doesn't re-enable it, but just sets the stale_p flag to + * indicate that the P bit has already been dealt with. However, the + * assembly code that enters the guest sets PQ to 00 without clearing + * stale_p (because it has no easy way to address it). Hence we have + * to adjust stale_p before shutting down the interrupt. + */ +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, + struct kvmppc_xive_vcpu *xc, int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* + * This slightly odd sequence gives the right result + * (i.e. stale_p set if xive_esc_on is false) even if + * we race with xive_esc_irq() and xive_irq_eoi(). + */ + xd->stale_p = false; + smp_mb(); /* paired with smb_wmb in xive_esc_irq */ + if (!vcpu->arch.xive_esc_on) + xd->stale_p = true; +} + void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; @@ -1134,20 +1168,28 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Mask the VP IPI */ xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01); - /* Disable the VP */ - xive_native_disable_vp(xc->vp_id); - - /* Free the queues & associated interrupts */ + /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { - struct xive_q *q = &xc->queues[i]; - - /* Free the escalation irq */ if (xc->esc_virq[i]) { + if (xc->xive->single_escalation) + xive_cleanup_single_escalation(vcpu, xc, + xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); } - /* Free the queue */ + } + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Clear the cam word so guest entry won't try to push context */ + vcpu->arch.xive_cam_word = 0; + + /* Free the queues */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + xive_native_disable_queue(xc->vp_id, q, i); if (q->qpage) { free_pages((unsigned long)q->qpage, @@ -1169,12 +1211,52 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xive_vcpu = NULL; } +static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu) +{ + /* We have a block of xive->nr_servers VPs. We just need to check + * raw vCPU ids are below the expected limit for this guest's + * core stride ; kvmppc_pack_vcpu_id() will pack them down to an + * index that can be safely used to compute a VP id that belongs + * to the VP block. + */ + return cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode; +} + +int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp) +{ + u32 vp_id; + + if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) { + pr_devel("Out of bounds !\n"); + return -EINVAL; + } + + if (xive->vp_base == XIVE_INVALID_VP) { + xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers); + pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, xive->nr_servers); + + if (xive->vp_base == XIVE_INVALID_VP) + return -ENOSPC; + } + + vp_id = kvmppc_xive_vp(xive, cpu); + if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) { + pr_devel("Duplicate !\n"); + return -EEXIST; + } + + *vp = vp_id; + + return 0; +} + int kvmppc_xive_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu) { struct kvmppc_xive *xive = dev->private; struct kvmppc_xive_vcpu *xc; int i, r = -EBUSY; + u32 vp_id; pr_devel("connect_vcpu(cpu=%d)\n", cpu); @@ -1186,25 +1268,25 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, return -EPERM; if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; - if (kvmppc_xive_find_server(vcpu->kvm, cpu)) { - pr_devel("Duplicate !\n"); - return -EEXIST; - } - if (cpu >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) { - pr_devel("Out of bounds !\n"); - return -EINVAL; - } - xc = kzalloc(sizeof(*xc), GFP_KERNEL); - if (!xc) - return -ENOMEM; /* We need to synchronize with queue provisioning */ mutex_lock(&xive->lock); + + r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id); + if (r) + goto bail; + + xc = kzalloc(sizeof(*xc), GFP_KERNEL); + if (!xc) { + r = -ENOMEM; + goto bail; + } + vcpu->arch.xive_vcpu = xc; xc->xive = xive; xc->vcpu = vcpu; xc->server_num = cpu; - xc->vp_id = kvmppc_xive_vp(xive, cpu); + xc->vp_id = vp_id; xc->mfrr = 0xff; xc->valid = true; @@ -1784,6 +1866,43 @@ int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, return 0; } +int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr) +{ + u32 __user *ubufp = (u32 __user *) addr; + u32 nr_servers; + int rc = 0; + + if (get_user(nr_servers, ubufp)) + return -EFAULT; + + pr_devel("%s nr_servers=%u\n", __func__, nr_servers); + + if (!nr_servers || nr_servers > KVM_MAX_VCPU_ID) + return -EINVAL; + + mutex_lock(&xive->lock); + if (xive->vp_base != XIVE_INVALID_VP) + /* The VP block is allocated once and freed when the device + * is released. Better not allow to change its size since its + * used by connect_vcpu to validate vCPU ids are valid (eg, + * setting it back to a higher value could allow connect_vcpu + * to come up with a VP id that goes beyond the VP block, which + * is likely to cause a crash in OPAL). + */ + rc = -EBUSY; + else if (nr_servers > KVM_MAX_VCPUS) + /* We don't need more servers. Higher vCPU ids get packed + * down below KVM_MAX_VCPUS by kvmppc_pack_vcpu_id(). + */ + xive->nr_servers = KVM_MAX_VCPUS; + else + xive->nr_servers = nr_servers; + + mutex_unlock(&xive->lock); + + return rc; +} + static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { struct kvmppc_xive *xive = dev->private; @@ -1792,6 +1911,11 @@ static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) switch (attr->group) { case KVM_DEV_XICS_GRP_SOURCES: return xive_set_source(xive, attr->attr, attr->addr); + case KVM_DEV_XICS_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_XICS_NR_SERVERS: + return kvmppc_xive_set_nr_servers(xive, attr->addr); + } } return -ENXIO; } @@ -1817,6 +1941,11 @@ static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) attr->attr < KVMPPC_XICS_NR_IRQS) return 0; break; + case KVM_DEV_XICS_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_XICS_NR_SERVERS: + return 0; + } } return -ENXIO; } @@ -1951,10 +2080,13 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) { struct kvmppc_xive *xive; struct kvm *kvm = dev->kvm; - int ret = 0; pr_devel("Creating xive for partition\n"); + /* Already there ? */ + if (kvm->arch.xive) + return -EEXIST; + xive = kvmppc_xive_get_device(kvm, type); if (!xive) return -ENOMEM; @@ -1964,12 +2096,6 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) xive->kvm = kvm; mutex_init(&xive->lock); - /* Already there ? */ - if (kvm->arch.xive) - ret = -EEXIST; - else - kvm->arch.xive = xive; - /* We use the default queue size set by the host */ xive->q_order = xive_native_default_eq_shift(); if (xive->q_order < PAGE_SHIFT) @@ -1977,18 +2103,16 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) else xive->q_page_order = xive->q_order - PAGE_SHIFT; - /* Allocate a bunch of VPs */ - xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS); - pr_devel("VP_Base=%x\n", xive->vp_base); - - if (xive->vp_base == XIVE_INVALID_VP) - ret = -ENOMEM; + /* VP allocation is delayed to the first call to connect_vcpu */ + xive->vp_base = XIVE_INVALID_VP; + /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets + * on a POWER9 system. + */ + xive->nr_servers = KVM_MAX_VCPUS; xive->single_escalation = xive_native_has_single_escalation(); - if (ret) - return ret; - + kvm->arch.xive = xive; return 0; } @@ -2058,9 +2182,9 @@ static int xive_debug_show(struct seq_file *m, void *private) if (!xc) continue; - seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x" + seq_printf(m, "cpu server %#x VP:%#x CPPR:%#x HWCPPR:%#x" " MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n", - xc->server_num, xc->cppr, xc->hw_cppr, + xc->server_num, xc->vp_id, xc->cppr, xc->hw_cppr, xc->mfrr, xc->pending, xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 50494d0ee375..382e3a56e789 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -135,6 +135,9 @@ struct kvmppc_xive { /* Flags */ u8 single_escalation; + /* Number of entries in the VP block */ + u32 nr_servers; + struct kvmppc_xive_ops *ops; struct address_space *mapping; struct mutex mapping_lock; @@ -220,6 +223,18 @@ static inline u32 kvmppc_xive_vp(struct kvmppc_xive *xive, u32 server) return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server); } +static inline bool kvmppc_xive_vp_in_use(struct kvm *kvm, u32 vp_id) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.xive_vcpu && vp_id == vcpu->arch.xive_vcpu->vp_id) + return true; + } + return false; +} + /* * Mapping between guest priorities and host priorities * is as follow. @@ -282,6 +297,10 @@ int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio); int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, bool single_escalation); struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, + struct kvmppc_xive_vcpu *xc, int irq); +int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp); +int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr); #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index a998823f68a3..6ef0151ff70a 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -50,6 +50,24 @@ static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio) } } +static int kvmppc_xive_native_configure_queue(u32 vp_id, struct xive_q *q, + u8 prio, __be32 *qpage, + u32 order, bool can_escalate) +{ + int rc; + __be32 *qpage_prev = q->qpage; + + rc = xive_native_configure_queue(vp_id, q, prio, qpage, order, + can_escalate); + if (rc) + return rc; + + if (qpage_prev) + put_page(virt_to_page(qpage_prev)); + + return rc; +} + void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; @@ -67,20 +85,28 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) xc->valid = false; kvmppc_xive_disable_vcpu_interrupts(vcpu); - /* Disable the VP */ - xive_native_disable_vp(xc->vp_id); - - /* Free the queues & associated interrupts */ + /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { /* Free the escalation irq */ if (xc->esc_virq[i]) { + if (xc->xive->single_escalation) + xive_cleanup_single_escalation(vcpu, xc, + xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); xc->esc_virq[i] = 0; } + } + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); - /* Free the queue */ + /* Clear the cam word so guest entry won't try to push context */ + vcpu->arch.xive_cam_word = 0; + + /* Free the queues */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { kvmppc_xive_native_cleanup_queue(vcpu, i); } @@ -98,6 +124,7 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, struct kvmppc_xive *xive = dev->private; struct kvmppc_xive_vcpu *xc = NULL; int rc; + u32 vp_id; pr_devel("native_connect_vcpu(server=%d)\n", server_num); @@ -109,18 +136,12 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, return -EPERM; if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; - if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) { - pr_devel("Out of bounds !\n"); - return -EINVAL; - } mutex_lock(&xive->lock); - if (kvmppc_xive_find_server(vcpu->kvm, server_num)) { - pr_devel("Duplicate !\n"); - rc = -EEXIST; + rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id); + if (rc) goto bail; - } xc = kzalloc(sizeof(*xc), GFP_KERNEL); if (!xc) { @@ -133,7 +154,7 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, xc->vcpu = vcpu; xc->server_num = server_num; - xc->vp_id = kvmppc_xive_vp(xive, server_num); + xc->vp_id = vp_id; xc->valid = true; vcpu->arch.irq_type = KVMPPC_IRQ_XIVE; @@ -572,19 +593,14 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, q->guest_qaddr = 0; q->guest_qshift = 0; - rc = xive_native_configure_queue(xc->vp_id, q, priority, - NULL, 0, true); + rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority, + NULL, 0, true); if (rc) { pr_err("Failed to reset queue %d for VCPU %d: %d\n", priority, xc->server_num, rc); return rc; } - if (q->qpage) { - put_page(virt_to_page(q->qpage)); - q->qpage = NULL; - } - return 0; } @@ -614,17 +630,18 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, srcu_idx = srcu_read_lock(&kvm->srcu); gfn = gpa_to_gfn(kvm_eq.qaddr); - page = gfn_to_page(kvm, gfn); - if (is_error_page(page)) { + + page_size = kvm_host_page_size(vcpu, gfn); + if (1ull << kvm_eq.qshift > page_size) { srcu_read_unlock(&kvm->srcu, srcu_idx); - pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr); + pr_warn("Incompatible host page size %lx!\n", page_size); return -EINVAL; } - page_size = kvm_host_page_size(kvm, gfn); - if (1ull << kvm_eq.qshift > page_size) { + page = gfn_to_page(kvm, gfn); + if (is_error_page(page)) { srcu_read_unlock(&kvm->srcu, srcu_idx); - pr_warn("Incompatible host page size %lx!\n", page_size); + pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr); return -EINVAL; } @@ -643,8 +660,8 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, * OPAL level because the use of END ESBs is not supported by * Linux. */ - rc = xive_native_configure_queue(xc->vp_id, q, priority, - (__be32 *) qaddr, kvm_eq.qshift, true); + rc = kvmppc_xive_native_configure_queue(xc->vp_id, q, priority, + (__be32 *) qaddr, kvm_eq.qshift, true); if (rc) { pr_err("Failed to configure queue %d for VCPU %d: %d\n", priority, xc->server_num, rc); @@ -918,6 +935,8 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev, return kvmppc_xive_reset(xive); case KVM_DEV_XIVE_EQ_SYNC: return kvmppc_xive_native_eq_sync(xive); + case KVM_DEV_XIVE_NR_SERVERS: + return kvmppc_xive_set_nr_servers(xive, attr->addr); } break; case KVM_DEV_XIVE_GRP_SOURCE: @@ -957,6 +976,7 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev, switch (attr->attr) { case KVM_DEV_XIVE_RESET: case KVM_DEV_XIVE_EQ_SYNC: + case KVM_DEV_XIVE_NR_SERVERS: return 0; } break; @@ -1057,7 +1077,6 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) { struct kvmppc_xive *xive; struct kvm *kvm = dev->kvm; - int ret = 0; pr_devel("Creating xive native device\n"); @@ -1071,27 +1090,20 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) dev->private = xive; xive->dev = dev; xive->kvm = kvm; - kvm->arch.xive = xive; mutex_init(&xive->mapping_lock); mutex_init(&xive->lock); - /* - * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for - * a default. Getting the max number of CPUs the VM was - * configured with would improve our usage of the XIVE VP space. + /* VP allocation is delayed to the first call to connect_vcpu */ + xive->vp_base = XIVE_INVALID_VP; + /* KVM_MAX_VCPUS limits the number of VMs to roughly 64 per sockets + * on a POWER9 system. */ - xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS); - pr_devel("VP_Base=%x\n", xive->vp_base); - - if (xive->vp_base == XIVE_INVALID_VP) - ret = -ENXIO; + xive->nr_servers = KVM_MAX_VCPUS; xive->single_escalation = xive_native_has_single_escalation(); xive->ops = &kvmppc_xive_native_ops; - if (ret) - return ret; - + kvm->arch.xive = xive; return 0; } @@ -1171,6 +1183,11 @@ int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) return 0; } +bool kvmppc_xive_native_supported(void) +{ + return xive_native_has_queue_state_support(); +} + static int xive_native_debug_show(struct seq_file *m, void *private) { struct kvmppc_xive *xive = m->private; @@ -1189,8 +1206,8 @@ static int xive_native_debug_show(struct seq_file *m, void *private) if (!xc) continue; - seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n", - xc->server_num, + seq_printf(m, "cpu server %#x VP=%#x NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n", + xc->server_num, xc->vp_id, vcpu->arch.xive_saved_state.nsr, vcpu->arch.xive_saved_state.cppr, vcpu->arch.xive_saved_state.ipb, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index be9a45874194..7b27604adadf 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -775,7 +775,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) debug = current->thread.debug; current->thread.debug = vcpu->arch.dbg_reg; - vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.pgdir = vcpu->kvm->mm->pgd; kvmppc_fix_ee_before_entry(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); @@ -1377,36 +1377,6 @@ static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) update_timer_ints(vcpu); } -/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) -{ - int i; - int r; - - vcpu->arch.regs.nip = 0; - vcpu->arch.shared->pir = vcpu->vcpu_id; - kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ - kvmppc_set_msr(vcpu, 0); - -#ifndef CONFIG_KVM_BOOKE_HV - vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; - vcpu->arch.shadow_pid = 1; - vcpu->arch.shared->msr = 0; -#endif - - /* Eye-catching numbers so we know if the guest takes an interrupt - * before it's programmed its own IVPR/IVORs. */ - vcpu->arch.ivpr = 0x55550000; - for (i = 0; i < BOOKE_IRQPRIO_MAX; i++) - vcpu->arch.ivor[i] = 0x7700 | i * 4; - - kvmppc_init_timing_stats(vcpu); - - r = kvmppc_core_vcpu_setup(vcpu); - kvmppc_sanity_check(vcpu); - return r; -} - int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) { /* setup watchdog timer once */ @@ -2114,9 +2084,40 @@ int kvmppc_core_init_vm(struct kvm *kvm) return kvm->arch.kvm_ops->init_vm(kvm); } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu) { - return kvm->arch.kvm_ops->vcpu_create(kvm, id); + int i; + int r; + + r = vcpu->kvm->arch.kvm_ops->vcpu_create(vcpu); + if (r) + return r; + + /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ + vcpu->arch.regs.nip = 0; + vcpu->arch.shared->pir = vcpu->vcpu_id; + kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ + kvmppc_set_msr(vcpu, 0); + +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; + vcpu->arch.shadow_pid = 1; + vcpu->arch.shared->msr = 0; +#endif + + /* Eye-catching numbers so we know if the guest takes an interrupt + * before it's programmed its own IVPR/IVORs. */ + vcpu->arch.ivpr = 0x55550000; + for (i = 0; i < BOOKE_IRQPRIO_MAX; i++) + vcpu->arch.ivor[i] = 0x7700 | i * 4; + + kvmppc_init_timing_stats(vcpu); + + r = kvmppc_core_vcpu_setup(vcpu); + if (r) + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); + kvmppc_sanity_check(vcpu); + return r; } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index b5a848a55504..f2b4feaff6d2 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -433,28 +433,16 @@ static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id, return r; } -static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_e500(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500; - struct kvm_vcpu *vcpu; int err; - vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu_e500) { - err = -ENOMEM; - goto out; - } + BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); + vcpu_e500 = to_e500(vcpu); - vcpu = &vcpu_e500->vcpu; - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_vcpu; - - if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) { - err = -ENOMEM; - goto uninit_vcpu; - } + if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) + return -ENOMEM; err = kvmppc_e500_tlb_init(vcpu_e500); if (err) @@ -466,18 +454,13 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, goto uninit_tlb; } - return vcpu; + return 0; uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); uninit_id: kvmppc_e500_id_table_free(vcpu_e500); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); -out: - return ERR_PTR(err); + return err; } static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) @@ -487,8 +470,6 @@ static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.shared); kvmppc_e500_tlb_uninit(vcpu_e500); kvmppc_e500_id_table_free(vcpu_e500); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } static int kvmppc_core_init_vm_e500(struct kvm *kvm) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 321db0fdb9db..425d13806645 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -355,9 +355,9 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (tlbsel == 1) { struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); + down_read(&kvm->mm->mmap_sem); - vma = find_vma(current->mm, hva); + vma = find_vma(kvm->mm, hva); if (vma && hva >= vma->vm_start && (vma->vm_flags & VM_PFNMAP)) { /* @@ -441,7 +441,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); } - up_read(¤t->mm->mmap_sem); + up_read(&kvm->mm->mmap_sem); } if (likely(!pfnmap)) { diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 318e65c65999..e6b06cb2b92c 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -301,30 +301,20 @@ static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, return r; } -static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, - unsigned int id) +static int kvmppc_core_vcpu_create_e500mc(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500; - struct kvm_vcpu *vcpu; int err; - vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); - if (!vcpu_e500) { - err = -ENOMEM; - goto out; - } - vcpu = &vcpu_e500->vcpu; + BUILD_BUG_ON(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0); + vcpu_e500 = to_e500(vcpu); /* Invalid PIR value -- this LPID dosn't have valid state on any cpu */ vcpu->arch.oldpir = 0xffffffff; - err = kvm_vcpu_init(vcpu, kvm, id); - if (err) - goto free_vcpu; - err = kvmppc_e500_tlb_init(vcpu_e500); if (err) - goto uninit_vcpu; + return err; vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); if (!vcpu->arch.shared) { @@ -332,17 +322,11 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, goto uninit_tlb; } - return vcpu; + return 0; uninit_tlb: kvmppc_e500_tlb_uninit(vcpu_e500); -uninit_vcpu: - kvm_vcpu_uninit(vcpu); - -free_vcpu: - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); -out: - return ERR_PTR(err); + return err; } static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) @@ -351,8 +335,6 @@ static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.shared); kvmppc_e500_tlb_uninit(vcpu_e500); - kvm_vcpu_uninit(vcpu); - kmem_cache_free(kvm_vcpu_cache, vcpu_e500); } static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index bb4d09c1ad56..6fca38ca791f 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -271,6 +271,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) */ if (inst == KVMPPC_INST_SW_BREAKPOINT) { run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.status = 0; run->debug.arch.address = kvmppc_get_pc(vcpu); emulated = EMULATE_EXIT_USER; advance = 0; diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 9208c82ed08d..1139bc56e004 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -73,7 +73,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 inst; - int ra, rs, rt; enum emulation_result emulated = EMULATE_FAIL; int advance = 1; struct instruction_op op; @@ -85,16 +84,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) if (emulated != EMULATE_DONE) return emulated; - ra = get_ra(inst); - rs = get_rs(inst); - rt = get_rt(inst); - - /* - * if mmio_vsx_tx_sx_enabled == 0, copy data between - * VSR[0..31] and memory - * if mmio_vsx_tx_sx_enabled == 1, copy data between - * VSR[32..63] and memory - */ vcpu->arch.mmio_vsx_copy_nums = 0; vcpu->arch.mmio_vsx_offset = 0; vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3e566c2e6066..1af96fb5dc6f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -31,6 +31,8 @@ #include <asm/hvcall.h> #include <asm/plpar_wrappers.h> #endif +#include <asm/ultravisor.h> +#include <asm/kvm_host.h> #include "timing.h" #include "irq.h" @@ -473,7 +475,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) #endif kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_free(vcpu); + kvm_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) @@ -522,6 +524,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; + case KVM_CAP_PPC_GUEST_DEBUG_SSTEP: + /* fall through */ case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: @@ -561,7 +565,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * a POWER9 processor) and the PowerNV platform, as * nested is not yet supported. */ - r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE); + r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE) && + kvmppc_xive_native_supported(); break; #endif @@ -715,22 +720,55 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, kvmppc_core_flush_memslot(kvm, slot); } -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) +{ + return 0; +} + +static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) { struct kvm_vcpu *vcpu; - vcpu = kvmppc_core_vcpu_create(kvm, id); - if (!IS_ERR(vcpu)) { - vcpu->arch.wqp = &vcpu->wq; - kvmppc_create_vcpu_debugfs(vcpu, id); - } - return vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); + kvmppc_decrementer_func(vcpu); + + return HRTIMER_NORESTART; +} + +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) +{ + int err; + + hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; + vcpu->arch.dec_expires = get_tb(); + +#ifdef CONFIG_KVM_EXIT_TIMING + mutex_init(&vcpu->arch.exit_timing_lock); +#endif + err = kvmppc_subarch_vcpu_init(vcpu); + if (err) + return err; + + err = kvmppc_core_vcpu_create(vcpu); + if (err) + goto out_vcpu_uninit; + + vcpu->arch.wqp = &vcpu->wq; + kvmppc_create_vcpu_debugfs(vcpu, vcpu->vcpu_id); + return 0; + +out_vcpu_uninit: + kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); + return err; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { /* Make sure we're not using the vcpu anymore */ hrtimer_cancel(&vcpu->arch.dec_timer); @@ -753,11 +791,9 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) } kvmppc_core_vcpu_free(vcpu); -} -void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) -{ - kvm_arch_vcpu_free(vcpu); + kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); } int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) @@ -765,37 +801,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); - kvmppc_decrementer_func(vcpu); - - return HRTIMER_NORESTART; -} - -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - int ret; - - hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; - vcpu->arch.dec_expires = get_tb(); - -#ifdef CONFIG_KVM_EXIT_TIMING - mutex_init(&vcpu->arch.exit_timing_lock); -#endif - ret = kvmppc_subarch_vcpu_init(vcpu); - return ret; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - kvmppc_mmu_destroy(vcpu); - kvmppc_subarch_vcpu_uninit(vcpu); -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { #ifdef CONFIG_BOOKE @@ -2410,6 +2415,16 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; break; } + case KVM_PPC_SVM_OFF: { + struct kvm *kvm = filp->private_data; + + r = 0; + if (!kvm->arch.kvm_ops->svm_off) + goto out; + + r = kvm->arch.kvm_ops->svm_off(kvm); + break; + } default: { struct kvm *kvm = filp->private_data; r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); |