41 files changed, 1645 insertions, 571 deletions
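The bulk of this series replaces the old paravirt spinlock ops (spin_is_locked, spin_is_contended, spin_lock, spin_lock_flags, spin_trylock, spin_unlock) with two narrow slow-path hooks, lock_spinning and unlock_kick, layered on the native ticket lock. For orientation, a minimal user-space model of the ticket lock those hooks extend (a sketch with assumed names; C11 atomics stand in for the kernel's locked xadd):

#include <stdatomic.h>
#include <stdint.h>

typedef struct {
	_Atomic uint16_t head;	/* ticket currently being served */
	_Atomic uint16_t tail;	/* next ticket to hand out */
} ticketlock;

static void ticket_lock(ticketlock *l)
{
	/* The kernel packs head and tail into one word so a single xadd
	 * both takes a ticket and samples head; two atomics suffice for
	 * the model. */
	uint16_t me = atomic_fetch_add(&l->tail, 1);

	while (atomic_load(&l->head) != me)
		;	/* cpu_relax() in the kernel */
}

static void ticket_unlock(ticketlock *l)
{
	atomic_fetch_add(&l->head, 1);	/* serve the next waiter */
}

With CONFIG_PARAVIRT_SPINLOCKS the patch below bumps tickets by TICKET_LOCK_INC == 2, freeing bit 0 of the tail to act as TICKET_SLOWPATH_FLAG, and a waiter that spins SPIN_THRESHOLD (1 << 15) times without acquiring the lock drops into the lock_spinning hook.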
@@ -1120,6 +1120,7 @@ D: author of userfs filesystem D: Improved mmap and munmap handling D: General mm minor tidyups D: autofs v4 maintainer +D: Xen subsystem S: 987 Alabama St S: San Francisco S: CA, 94110 diff --git a/Documentation/tpm/xen-tpmfront.txt b/Documentation/tpm/xen-tpmfront.txt new file mode 100644 index 000000000000..69346de87ff3 --- /dev/null +++ b/Documentation/tpm/xen-tpmfront.txt @@ -0,0 +1,113 @@ +Virtual TPM interface for Xen + +Authors: Matthew Fioravante (JHUAPL), Daniel De Graaf (NSA) + +This document describes the virtual Trusted Platform Module (vTPM) subsystem for +Xen. The reader is assumed to have familiarity with building and installing Xen, +Linux, and a basic understanding of the TPM and vTPM concepts. + +INTRODUCTION + +The goal of this work is to provide a TPM functionality to a virtual guest +operating system (in Xen terms, a DomU). This allows programs to interact with +a TPM in a virtual system the same way they interact with a TPM on the physical +system. Each guest gets its own unique, emulated, software TPM. However, each +of the vTPM's secrets (Keys, NVRAM, etc) are managed by a vTPM Manager domain, +which seals the secrets to the Physical TPM. If the process of creating each of +these domains (manager, vTPM, and guest) is trusted, the vTPM subsystem extends +the chain of trust rooted in the hardware TPM to virtual machines in Xen. Each +major component of vTPM is implemented as a separate domain, providing secure +separation guaranteed by the hypervisor. The vTPM domains are implemented in +mini-os to reduce memory and processor overhead. + +This mini-os vTPM subsystem was built on top of the previous vTPM work done by +IBM and Intel corporation. + + +DESIGN OVERVIEW +--------------- + +The architecture of vTPM is described below: + ++------------------+ +| Linux DomU | ... +| | ^ | +| v | | +| xen-tpmfront | ++------------------+ + | ^ + v | ++------------------+ +| mini-os/tpmback | +| | ^ | +| v | | +| vtpm-stubdom | ... +| | ^ | +| v | | +| mini-os/tpmfront | ++------------------+ + | ^ + v | ++------------------+ +| mini-os/tpmback | +| | ^ | +| v | | +| vtpmmgr-stubdom | +| | ^ | +| v | | +| mini-os/tpm_tis | ++------------------+ + | ^ + v | ++------------------+ +| Hardware TPM | ++------------------+ + + * Linux DomU: The Linux based guest that wants to use a vTPM. There may be + more than one of these. + + * xen-tpmfront.ko: Linux kernel virtual TPM frontend driver. This driver + provides vTPM access to a Linux-based DomU. + + * mini-os/tpmback: Mini-os TPM backend driver. The Linux frontend driver + connects to this backend driver to facilitate communications + between the Linux DomU and its vTPM. This driver is also + used by vtpmmgr-stubdom to communicate with vtpm-stubdom. + + * vtpm-stubdom: A mini-os stub domain that implements a vTPM. There is a + one to one mapping between running vtpm-stubdom instances and + logical vtpms on the system. The vTPM Platform Configuration + Registers (PCRs) are normally all initialized to zero. + + * mini-os/tpmfront: Mini-os TPM frontend driver. The vTPM mini-os domain + vtpm-stubdom uses this driver to communicate with + vtpmmgr-stubdom. This driver is also used in mini-os + domains such as pv-grub that talk to the vTPM domain. + + * vtpmmgr-stubdom: A mini-os domain that implements the vTPM manager. There is + only one vTPM manager and it should be running during the + entire lifetime of the machine. 
This domain regulates + access to the physical TPM on the system and secures the + persistent state of each vTPM. + + * mini-os/tpm_tis: Mini-os TPM version 1.2 TPM Interface Specification (TIS) + driver. This driver used by vtpmmgr-stubdom to talk directly to + the hardware TPM. Communication is facilitated by mapping + hardware memory pages into vtpmmgr-stubdom. + + * Hardware TPM: The physical TPM that is soldered onto the motherboard. + + +INTEGRATION WITH XEN +-------------------- + +Support for the vTPM driver was added in Xen using the libxl toolstack in Xen +4.3. See the Xen documentation (docs/misc/vtpm.txt) for details on setting up +the vTPM and vTPM Manager stub domains. Once the stub domains are running, a +vTPM device is set up in the same manner as a disk or network device in the +domain's configuration file. + +In order to use features such as IMA that require a TPM to be loaded prior to +the initrd, the xen-tpmfront driver must be compiled in to the kernel. If not +using such features, the driver can be compiled as a module and will be loaded +as usual. diff --git a/MAINTAINERS b/MAINTAINERS index 8197fbd70a3e..94aa87dc6d2a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9247,9 +9247,9 @@ F: drivers/media/tuners/tuner-xc2028.* XEN HYPERVISOR INTERFACE M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> -M: Jeremy Fitzhardinge <jeremy@goop.org> -L: xen-devel@lists.xensource.com (moderated for non-subscribers) -L: virtualization@lists.linux-foundation.org +M: Boris Ostrovsky <boris.ostrovsky@oracle.com> +M: David Vrabel <david.vrabel@citrix.com> +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) S: Supported F: arch/x86/xen/ F: drivers/*/xen-*front.c @@ -9260,35 +9260,35 @@ F: include/uapi/xen/ XEN HYPERVISOR ARM M: Stefano Stabellini <stefano.stabellini@eu.citrix.com> -L: xen-devel@lists.xensource.com (moderated for non-subscribers) +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) S: Supported F: arch/arm/xen/ F: arch/arm/include/asm/xen/ XEN HYPERVISOR ARM64 M: Stefano Stabellini <stefano.stabellini@eu.citrix.com> -L: xen-devel@lists.xensource.com (moderated for non-subscribers) +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) S: Supported F: arch/arm64/xen/ F: arch/arm64/include/asm/xen/ XEN NETWORK BACKEND DRIVER M: Ian Campbell <ian.campbell@citrix.com> -L: xen-devel@lists.xensource.com (moderated for non-subscribers) +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) L: netdev@vger.kernel.org S: Supported F: drivers/net/xen-netback/* XEN PCI SUBSYSTEM M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> -L: xen-devel@lists.xensource.com (moderated for non-subscribers) +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) S: Supported F: arch/x86/pci/*xen* F: drivers/pci/*xen* XEN SWIOTLB SUBSYSTEM M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> -L: xen-devel@lists.xensource.com (moderated for non-subscribers) +L: xen-devel@lists.xenproject.org (moderated for non-subscribers) S: Supported F: arch/x86/xen/*swiotlb* F: drivers/xen/*swiotlb* diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 8a6295c86209..83e4f959ee47 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -21,6 +21,8 @@ #include <linux/of.h> #include <linux/of_irq.h> #include <linux/of_address.h> +#include <linux/cpuidle.h> +#include <linux/cpufreq.h> #include <linux/mm.h> @@ -267,18 +269,28 @@ static int __init xen_guest_init(void) if (!xen_initial_domain()) xenbus_probe(NULL); + /* + * Making 
sure board specific code will not set up ops for + * cpu idle and cpu freq. + */ + disable_cpuidle(); + disable_cpufreq(); + return 0; } core_initcall(xen_guest_init); static int __init xen_pm_init(void) { + if (!xen_domain()) + return -ENODEV; + pm_power_off = xen_power_off; arm_pm_restart = xen_restart; return 0; } -subsys_initcall(xen_pm_init); +late_initcall(xen_pm_init); static irqreturn_t xen_arm_callback(int irq, void *arg) { diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b32ebf92b0ce..b1fb846e6dac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -632,6 +632,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP + select UNINLINE_SPIN_UNLOCK ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -656,6 +657,15 @@ config KVM_GUEST underlying device model, the host provides the guest with timing infrastructure such as time of day, and system time +config KVM_DEBUG_FS + bool "Enable debug information for KVM Guests in debugfs" + depends on KVM_GUEST && DEBUG_FS + default n + ---help--- + This option enables collection of various statistics for KVM guest. + Statistics are displayed in debugfs filesystem. Enabling this option + may incur significant overhead. + source "arch/x86/lguest/Kconfig" config PARAVIRT_TIME_ACCOUNTING diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 695399f2d5eb..427afcbf3d55 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -118,10 +118,20 @@ void kvm_async_pf_task_wait(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); -#else -#define kvm_guest_init() do { } while (0) + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +void __init kvm_spinlock_init(void); +#else /* !CONFIG_PARAVIRT_SPINLOCKS */ +static inline void kvm_spinlock_init(void) +{ +} +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_KVM_GUEST */ +#define kvm_guest_init() do {} while (0) #define kvm_async_pf_task_wait(T) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) + static inline u32 kvm_read_and_reset_pf_reason(void) { return 0; diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cfdc9ee4c900..401f350ef71b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,36 +712,16 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -static inline int arch_spin_is_locked(struct arch_spinlock *lock) +static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, + __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); + PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket); } -static inline int arch_spin_is_contended(struct arch_spinlock *lock) +static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, + __ticket_t ticket) { - return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); -} -#define arch_spin_is_contended arch_spin_is_contended - -static __always_inline void arch_spin_lock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_lock, lock); -} - -static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock, - unsigned long flags) -{ - PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); -} - -static __always_inline int 
arch_spin_trylock(struct arch_spinlock *lock) -{ - return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); -} - -static __always_inline void arch_spin_unlock(struct arch_spinlock *lock) -{ - PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); + PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0db1fcac668c..04ac40e192eb 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -327,13 +327,15 @@ struct pv_mmu_ops { }; struct arch_spinlock; +#ifdef CONFIG_SMP +#include <asm/spinlock_types.h> +#else +typedef u16 __ticket_t; +#endif + struct pv_lock_ops { - int (*spin_is_locked)(struct arch_spinlock *lock); - int (*spin_is_contended)(struct arch_spinlock *lock); - void (*spin_lock)(struct arch_spinlock *lock); - void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags); - int (*spin_trylock)(struct arch_spinlock *lock); - void (*spin_unlock)(struct arch_spinlock *lock); + struct paravirt_callee_save lock_spinning; + void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index e3ddd7db723f..8963bfeea82a 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -1,11 +1,14 @@ #ifndef _ASM_X86_SPINLOCK_H #define _ASM_X86_SPINLOCK_H +#include <linux/jump_label.h> #include <linux/atomic.h> #include <asm/page.h> #include <asm/processor.h> #include <linux/compiler.h> #include <asm/paravirt.h> +#include <asm/bitops.h> + /* * Your basic SMP spinlocks, allowing only a single CPU anywhere * @@ -34,6 +37,31 @@ # define UNLOCK_LOCK_PREFIX #endif +/* How long a lock should spin before we consider blocking */ +#define SPIN_THRESHOLD (1 << 15) + +extern struct static_key paravirt_ticketlocks_enabled; +static __always_inline bool static_key_false(struct static_key *key); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) +{ + set_bit(0, (volatile unsigned long *)&lock->tickets.tail); +} + +#else /* !CONFIG_PARAVIRT_SPINLOCKS */ +static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock, + __ticket_t ticket) +{ +} +static inline void __ticket_unlock_kick(arch_spinlock_t *lock, + __ticket_t ticket) +{ +} + +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + /* * Ticket locks are conceptually two parts, one indicating the current head of * the queue, and the other indicating the current tail. The lock is acquired @@ -47,81 +75,101 @@ * in the high part, because a wide xadd increment of the low part would carry * up and contaminate the high part. 
*/ -static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) +static __always_inline void arch_spin_lock(arch_spinlock_t *lock) { - register struct __raw_tickets inc = { .tail = 1 }; + register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; inc = xadd(&lock->tickets, inc); + if (likely(inc.head == inc.tail)) + goto out; + inc.tail &= ~TICKET_SLOWPATH_FLAG; for (;;) { - if (inc.head == inc.tail) - break; - cpu_relax(); - inc.head = ACCESS_ONCE(lock->tickets.head); + unsigned count = SPIN_THRESHOLD; + + do { + if (ACCESS_ONCE(lock->tickets.head) == inc.tail) + goto out; + cpu_relax(); + } while (--count); + __ticket_lock_spinning(lock, inc.tail); } - barrier(); /* make sure nothing creeps before the lock is taken */ +out: barrier(); /* make sure nothing creeps before the lock is taken */ } -static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) +static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) { arch_spinlock_t old, new; old.tickets = ACCESS_ONCE(lock->tickets); - if (old.tickets.head != old.tickets.tail) + if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG)) return 0; - new.head_tail = old.head_tail + (1 << TICKET_SHIFT); + new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); /* cmpxchg is a full barrier, so nothing can move before it */ return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) +static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, + arch_spinlock_t old) { - __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); + arch_spinlock_t new; + + BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); + + /* Perform the unlock on the "before" copy */ + old.tickets.head += TICKET_LOCK_INC; + + /* Clear the slowpath flag */ + new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT); + + /* + * If the lock is uncontended, clear the flag - use cmpxchg in + * case it changes behind our back though. + */ + if (new.tickets.head != new.tickets.tail || + cmpxchg(&lock->head_tail, old.head_tail, + new.head_tail) != old.head_tail) { + /* + * Lock still has someone queued for it, so wake up an + * appropriate waiter. 
+ */ + __ticket_unlock_kick(lock, old.tickets.head); + } } -static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) +static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + if (TICKET_SLOWPATH_FLAG && + static_key_false(¶virt_ticketlocks_enabled)) { + arch_spinlock_t prev; - return tmp.tail != tmp.head; -} + prev = *lock; + add_smp(&lock->tickets.head, TICKET_LOCK_INC); -static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) -{ - struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); + /* add_smp() is a full mb() */ - return (__ticket_t)(tmp.tail - tmp.head) > 1; + if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG)) + __ticket_unlock_slowpath(lock, prev); + } else + __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); } -#ifndef CONFIG_PARAVIRT_SPINLOCKS - static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - return __ticket_spin_is_locked(lock); -} - -static inline int arch_spin_is_contended(arch_spinlock_t *lock) -{ - return __ticket_spin_is_contended(lock); -} -#define arch_spin_is_contended arch_spin_is_contended + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); -static __always_inline void arch_spin_lock(arch_spinlock_t *lock) -{ - __ticket_spin_lock(lock); + return tmp.tail != tmp.head; } -static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) +static inline int arch_spin_is_contended(arch_spinlock_t *lock) { - return __ticket_spin_trylock(lock); -} + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); -static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - __ticket_spin_unlock(lock); + return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; } +#define arch_spin_is_contended arch_spin_is_contended static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) @@ -129,8 +177,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ - static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { while (arch_spin_is_locked(lock)) diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index ad0ad07fc006..4f1bea19945b 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -1,13 +1,17 @@ #ifndef _ASM_X86_SPINLOCK_TYPES_H #define _ASM_X86_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - #include <linux/types.h> -#if (CONFIG_NR_CPUS < 256) +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#define __TICKET_LOCK_INC 2 +#define TICKET_SLOWPATH_FLAG ((__ticket_t)1) +#else +#define __TICKET_LOCK_INC 1 +#define TICKET_SLOWPATH_FLAG ((__ticket_t)0) +#endif + +#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC)) typedef u8 __ticket_t; typedef u16 __ticketpair_t; #else @@ -15,6 +19,8 @@ typedef u16 __ticket_t; typedef u32 __ticketpair_t; #endif +#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC) + #define TICKET_SHIFT (sizeof(__ticket_t) * 8) typedef struct arch_spinlock { diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index ca842f2769ef..608a79d5a466 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -7,6 +7,7 @@ enum ipi_vector { XEN_CALL_FUNCTION_SINGLE_VECTOR, XEN_SPIN_UNLOCK_VECTOR, XEN_IRQ_WORK_VECTOR, + XEN_NMI_VECTOR, XEN_NR_IPIS, }; diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
b/arch/x86/include/uapi/asm/kvm_para.h index 06fdbd987e97..94dc8ca434e0 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -23,6 +23,7 @@ #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 +#define KVM_FEATURE_PV_UNHALT 7 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a96d32cc55b8..56e2fa4a8b13 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -34,6 +34,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/kprobes.h> +#include <linux/debugfs.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -419,6 +420,7 @@ static void __init kvm_smp_prepare_boot_cpu(void) WARN_ON(kvm_register_clock("primary cpu clock")); kvm_guest_cpu_init(); native_smp_prepare_boot_cpu(); + kvm_spinlock_init(); } static void kvm_guest_cpu_online(void *dummy) @@ -523,3 +525,263 @@ static __init int activate_jump_labels(void) return 0; } arch_initcall(activate_jump_labels); + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + +/* Kick a cpu by its apicid. Used to wake up a halted vcpu */ +static void kvm_kick_cpu(int cpu) +{ + int apicid; + unsigned long flags = 0; + + apicid = per_cpu(x86_cpu_to_apicid, cpu); + kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); +} + +enum kvm_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; + +#ifdef CONFIG_KVM_DEBUG_FS +#define HISTO_BUCKETS 30 + +static struct kvm_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + u64 time_blocked; +} spinlock_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + u8 ret; + u8 old; + + old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + } +} + +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} + + +static inline u64 spin_time_start(void) +{ + return sched_clock(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index; + + index = ilog2(delta); + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u32 delta; + + delta = sched_clock() - start; + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; +} + +static struct dentry *d_spin_debug; +static struct dentry *d_kvm_debug; + +struct dentry *kvm_init_debugfs(void) +{ + d_kvm_debug = debugfs_create_dir("kvm", NULL); + if (!d_kvm_debug) + printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); + + return d_kvm_debug; +} + +static int __init kvm_spinlock_debugfs(void) +{ + struct dentry *d_kvm; + + d_kvm = kvm_init_debugfs(); + if (d_kvm == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW]); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); + + 
debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW]); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); + + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); + + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(kvm_spinlock_debugfs); +#else /* !CONFIG_KVM_DEBUG_FS */ +static inline void add_stats(enum kvm_contention_stat var, u32 val) +{ +} + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_blocked(u64 start) +{ +} +#endif /* CONFIG_KVM_DEBUG_FS */ + +struct kvm_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; +}; + +/* cpus 'waiting' on a spinlock to become available */ +static cpumask_t waiting_cpus; + +/* Track spinlock on which a cpu is waiting */ +static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); + +static void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) +{ + struct kvm_lock_waiting *w; + int cpu; + u64 start; + unsigned long flags; + + if (in_nmi()) + return; + + w = &__get_cpu_var(klock_waiting); + cpu = smp_processor_id(); + start = spin_time_start(); + + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + + /* + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; + + add_stats(TAKEN_SLOW, 1); + + /* + * This uses set_bit, which is atomic but we should not rely on its + * reordering gurantees. So barrier is needed after this call. + */ + cpumask_set_cpu(cpu, &waiting_cpus); + + barrier(); + + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); + + /* + * check again make sure it didn't become free while + * we weren't looking. + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; + local_irq_restore(flags); + spin_time_accum_blocked(start); +} +PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); + +/* Kick vcpu waiting on @lock->head to reach value @ticket */ +static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) +{ + int cpu; + + add_stats(RELEASED_SLOW, 1); + for_each_cpu(cpu, &waiting_cpus) { + const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == ticket) { + add_stats(RELEASED_SLOW_KICKED, 1); + kvm_kick_cpu(cpu); + break; + } + } +} + +/* + * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. + */ +void __init kvm_spinlock_init(void) +{ + if (!kvm_para_available()) + return; + /* Does host kernel support KVM_FEATURE_PV_UNHALT? 
*/ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return; + + printk(KERN_INFO "KVM setup paravirtual spinlock\n"); + + static_key_slow_inc(¶virt_ticketlocks_enabled); + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); + pv_lock_ops.unlock_kick = kvm_unlock_kick; +} +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 676b8c77a976..bbb6c7316341 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -4,25 +4,17 @@ */ #include <linux/spinlock.h> #include <linux/module.h> +#include <linux/jump_label.h> #include <asm/paravirt.h> -static inline void -default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) -{ - arch_spin_lock(lock); -} - struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP - .spin_is_locked = __ticket_spin_is_locked, - .spin_is_contended = __ticket_spin_is_contended, - - .spin_lock = __ticket_spin_lock, - .spin_lock_flags = default_spin_lock_flags, - .spin_trylock = __ticket_spin_trylock, - .spin_unlock = __ticket_spin_unlock, + .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), + .unlock_kick = paravirt_nop, #endif }; EXPORT_SYMBOL(pv_lock_ops); +struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(paravirt_ticketlocks_enabled); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 193097ef3d7d..15939e872db2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -427,8 +427,7 @@ static void __init xen_init_cpuid_mask(void) if (!xen_initial_domain()) cpuid_leaf1_edx_mask &= - ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ - (1 << X86_FEATURE_ACPI)); /* disable ACPI */ + ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); @@ -735,8 +734,7 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, addr = (unsigned long)xen_int3; else if (addr == (unsigned long)stack_segment) addr = (unsigned long)xen_stack_segment; - else if (addr == (unsigned long)double_fault || - addr == (unsigned long)nmi) { + else if (addr == (unsigned long)double_fault) { /* Don't need to handle these */ return 0; #ifdef CONFIG_X86_MCE @@ -747,7 +745,12 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, */ ; #endif - } else { + } else if (addr == (unsigned long)nmi) + /* + * Use the native version as well. + */ + ; + else { /* Some other trap using IST? */ if (WARN_ON(val->ist != 0)) return 0; @@ -1710,6 +1713,8 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_shared_info(); + xen_panic_handler_init(); + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; xen_hvm_smp_init(); diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 01a4dc015ae1..0da7f863056f 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -47,23 +47,18 @@ static void xen_restore_fl(unsigned long flags) /* convert from IF type flag */ flags = !(flags & X86_EFLAGS_IF); - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ + /* See xen_irq_enable() for why preemption must be disabled. */ preempt_disable(); vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. 
*/ if (flags == 0) { - preempt_check_resched(); barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); - } + preempt_enable(); + } else + preempt_enable_no_resched(); } PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl); @@ -82,10 +77,12 @@ static void xen_irq_enable(void) { struct vcpu_info *vcpu; - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ + /* + * We may be preempted as soon as vcpu->evtchn_upcall_mask is + * cleared, so disable preemption to ensure we check for + * events on the VCPU we are still running on. + */ + preempt_disable(); vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = 0; @@ -96,6 +93,8 @@ static void xen_irq_enable(void) barrier(); /* unmask then check (avoid races) */ if (unlikely(vcpu->evtchn_upcall_pending)) xen_force_evtchn_callback(); + + preempt_enable(); } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 95fb2aa5927e..8b901e8d782d 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -161,6 +161,7 @@ #include <asm/xen/page.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/balloon.h> #include <xen/grant_table.h> #include "multicalls.h" @@ -967,7 +968,10 @@ int m2p_remove_override(struct page *page, if (kmap_op != NULL) { if (!PageHighMem(page)) { struct multicall_space mcs; - struct gnttab_unmap_grant_ref *unmap_op; + struct gnttab_unmap_and_replace *unmap_op; + struct page *scratch_page = get_balloon_scratch_page(); + unsigned long scratch_page_address = (unsigned long) + __va(page_to_pfn(scratch_page) << PAGE_SHIFT); /* * It might be that we queued all the m2p grant table @@ -986,25 +990,31 @@ int m2p_remove_override(struct page *page, printk(KERN_WARNING "m2p_remove_override: " "pfn %lx mfn %lx, failed to modify kernel mappings", pfn, mfn); + put_balloon_scratch_page(); return -1; } - mcs = xen_mc_entry( - sizeof(struct gnttab_unmap_grant_ref)); + xen_mc_batch(); + + mcs = __xen_mc_entry( + sizeof(struct gnttab_unmap_and_replace)); unmap_op = mcs.args; unmap_op->host_addr = kmap_op->host_addr; + unmap_op->new_addr = scratch_page_address; unmap_op->handle = kmap_op->handle; - unmap_op->dev_bus_addr = 0; MULTI_grant_table_op(mcs.mc, - GNTTABOP_unmap_grant_ref, unmap_op, 1); + GNTTABOP_unmap_and_replace, unmap_op, 1); + + mcs = __xen_mc_entry(0); + MULTI_update_va_mapping(mcs.mc, scratch_page_address, + pfn_pte(page_to_pfn(scratch_page), + PAGE_KERNEL_RO), 0); xen_mc_issue(PARAVIRT_LAZY_MMU); - set_pte_at(&init_mm, address, ptep, - pfn_pte(pfn, PAGE_KERNEL)); - __flush_tlb_single(address); kmap_op->host_addr = 0; + put_balloon_scratch_page(); } } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 8f3eea6b80c5..09f3059cb00b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -33,6 +33,9 @@ /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; extern const char xen_failsafe_callback[]; +#ifdef CONFIG_X86_64 +extern const char nmi[]; +#endif extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); @@ -215,13 +218,19 @@ static void __init xen_set_identity_and_release_chunk( unsigned long pfn; /* - * If the PFNs are currently mapped, the VA mapping also needs - * to be updated to be 1:1. 
+ * If the PFNs are currently mapped, clear the mappings + * (except for the ISA region which must be 1:1 mapped) to + * release the refcounts (in Xen) on the original frames. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { + pte_t pte = __pte_ma(0); + + if (pfn < PFN_UP(ISA_END_ADDRESS)) + pte = mfn_pte(pfn, PAGE_KERNEL_IO); + (void)HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - mfn_pte(pfn, PAGE_KERNEL_IO), 0); + (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0); + } if (start_pfn < nr_pages) *released += xen_release_chunk( @@ -547,7 +556,13 @@ void xen_enable_syscall(void) } #endif /* CONFIG_X86_64 */ } - +void __cpuinit xen_enable_nmi(void) +{ +#ifdef CONFIG_X86_64 + if (register_callback(CALLBACKTYPE_nmi, nmi)) + BUG(); +#endif +} void __init xen_arch_setup(void) { xen_panic_handler_init(); @@ -565,7 +580,7 @@ void __init xen_arch_setup(void) xen_enable_sysenter(); xen_enable_syscall(); - + xen_enable_nmi(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index b81c88e51daa..9235842cd76a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -279,6 +279,7 @@ static void __init xen_smp_prepare_boot_cpu(void) xen_filter_cpu_maps(); xen_setup_vcpu_info_placement(); + xen_init_spinlocks(); } static void __init xen_smp_prepare_cpus(unsigned int max_cpus) @@ -572,6 +573,12 @@ static inline int xen_map_vector(int vector) case IRQ_WORK_VECTOR: xen_vector = XEN_IRQ_WORK_VECTOR; break; +#ifdef CONFIG_X86_64 + case NMI_VECTOR: + case APIC_DM_NMI: /* Some use that instead of NMI_VECTOR */ + xen_vector = XEN_NMI_VECTOR; + break; +#endif default: xen_vector = -1; printk(KERN_ERR "xen: vector 0x%x is not implemented\n", @@ -680,7 +687,6 @@ void __init xen_smp_init(void) { smp_ops = xen_smp_ops; xen_fill_possible_map(); - xen_init_spinlocks(); } static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index cf3caee356b3..0438b9324a72 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -17,45 +17,44 @@ #include "xen-ops.h" #include "debugfs.h" -#ifdef CONFIG_XEN_DEBUG_FS -static struct xen_spinlock_stats -{ - u64 taken; - u32 taken_slow; - u32 taken_slow_nested; - u32 taken_slow_pickup; - u32 taken_slow_spurious; - u32 taken_slow_irqenable; +enum xen_contention_stat { + TAKEN_SLOW, + TAKEN_SLOW_PICKUP, + TAKEN_SLOW_SPURIOUS, + RELEASED_SLOW, + RELEASED_SLOW_KICKED, + NR_CONTENTION_STATS +}; - u64 released; - u32 released_slow; - u32 released_slow_kicked; +#ifdef CONFIG_XEN_DEBUG_FS #define HISTO_BUCKETS 30 - u32 histo_spin_total[HISTO_BUCKETS+1]; - u32 histo_spin_spinning[HISTO_BUCKETS+1]; +static struct xen_spinlock_stats +{ + u32 contention_stats[NR_CONTENTION_STATS]; u32 histo_spin_blocked[HISTO_BUCKETS+1]; - - u64 time_total; - u64 time_spinning; u64 time_blocked; } spinlock_stats; static u8 zero_stats; -static unsigned lock_timeout = 1 << 10; -#define TIMEOUT lock_timeout - static inline void check_zero(void) { - if (unlikely(zero_stats)) { - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - zero_stats = 0; + u8 ret; + u8 old = ACCESS_ONCE(zero_stats); + if (unlikely(old)) { + ret = cmpxchg(&zero_stats, old, 0); + /* This ensures only one fellow resets the stat */ + if (ret == old) + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); } } 
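/*
 * A user-space sketch, not part of the patch: both the kvm.c stats code
 * earlier and the xen check_zero() just above use cmpxchg so that when
 * several CPUs see zero_stats set, exactly one of them resets the
 * counters. The same reset-once pattern in C11 (names assumed):
 */
#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

static struct { uint32_t taken_slow, released_slow; } stats;
static _Atomic uint8_t zero_stats;	/* written non-zero via debugfs */

static void check_zero(void)
{
	uint8_t old = atomic_load(&zero_stats);

	if (old) {
		/* Only the winner of the old -> 0 exchange does the memset. */
		if (atomic_compare_exchange_strong(&zero_stats, &old, 0))
			memset(&stats, 0, sizeof(stats));
	}
}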
-#define ADD_STATS(elem, val) \ - do { check_zero(); spinlock_stats.elem += (val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ + check_zero(); + spinlock_stats.contention_stats[var] += val; +} static inline u64 spin_time_start(void) { @@ -74,22 +73,6 @@ static void __spin_time_accum(u64 delta, u32 *array) array[HISTO_BUCKETS]++; } -static inline void spin_time_accum_spinning(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); - spinlock_stats.time_spinning += delta; -} - -static inline void spin_time_accum_total(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_total); - spinlock_stats.time_total += delta; -} - static inline void spin_time_accum_blocked(u64 start) { u32 delta = xen_clocksource_read() - start; @@ -99,19 +82,15 @@ static inline void spin_time_accum_blocked(u64 start) } #else /* !CONFIG_XEN_DEBUG_FS */ #define TIMEOUT (1 << 10) -#define ADD_STATS(elem, val) do { (void)(val); } while(0) +static inline void add_stats(enum xen_contention_stat var, u32 val) +{ +} static inline u64 spin_time_start(void) { return 0; } -static inline void spin_time_accum_total(u64 start) -{ -} -static inline void spin_time_accum_spinning(u64 start) -{ -} static inline void spin_time_accum_blocked(u64 start) { } @@ -134,227 +113,123 @@ typedef u16 xen_spinners_t; asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory"); #endif -struct xen_spinlock { - unsigned char lock; /* 0 -> free; 1 -> locked */ - xen_spinners_t spinners; /* count of waiting cpus */ +struct xen_lock_waiting { + struct arch_spinlock *lock; + __ticket_t want; }; -static int xen_spin_is_locked(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - return xl->lock != 0; -} - -static int xen_spin_is_contended(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - /* Not strictly true; this is only the count of contended - lock-takers entering the slow path. */ - return xl->spinners != 0; -} - -static int xen_spin_trylock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - u8 old = 1; - - asm("xchgb %b0,%1" - : "+q" (old), "+m" (xl->lock) : : "memory"); - - return old == 0; -} - -static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); - -/* - * Mark a cpu as interested in a lock. Returns the CPU's previous - * lock of interest, in case we got preempted by an interrupt. - */ -static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) -{ - struct xen_spinlock *prev; - - prev = __this_cpu_read(lock_spinners); - __this_cpu_write(lock_spinners, xl); - - wmb(); /* set lock of interest before count */ - - inc_spinners(xl); - - return prev; -} - -/* - * Mark a cpu as no longer interested in a lock. Restores previous - * lock of interest (NULL for none). 
- */ -static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) -{ - dec_spinners(xl); - wmb(); /* decrement count before restoring lock */ - __this_cpu_write(lock_spinners, prev); -} +static DEFINE_PER_CPU(char *, irq_name); +static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); +static cpumask_t waiting_cpus; -static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable) +static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - struct xen_spinlock *prev; int irq = __this_cpu_read(lock_kicker_irq); - int ret; + struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting); + int cpu = smp_processor_id(); u64 start; + unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) - return 0; + return; start = spin_time_start(); - /* announce we're spinning */ - prev = spinning_lock(xl); + /* + * Make sure an interrupt handler can't upset things in a + * partially setup state. + */ + local_irq_save(flags); + /* + * We don't really care if we're overwriting some other + * (lock,want) pair, as that would mean that we're currently + * in an interrupt context, and the outer context had + * interrupts enabled. That has already kicked the VCPU out + * of xen_poll_irq(), so it will just return spuriously and + * retry with newly setup (lock,want). + * + * The ordering protocol on this is that the "lock" pointer + * may only be set non-NULL if the "want" ticket is correct. + * If we're updating "want", we must first clear "lock". + */ + w->lock = NULL; + smp_wmb(); + w->want = want; + smp_wmb(); + w->lock = lock; - ADD_STATS(taken_slow, 1); - ADD_STATS(taken_slow_nested, prev != NULL); + /* This uses set_bit, which atomic and therefore a barrier */ + cpumask_set_cpu(cpu, &waiting_cpus); + add_stats(TAKEN_SLOW, 1); - do { - unsigned long flags; + /* clear pending */ + xen_clear_irq_pending(irq); - /* clear pending */ - xen_clear_irq_pending(irq); + /* Only check lock once pending cleared */ + barrier(); - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) { - ADD_STATS(taken_slow_pickup, 1); + /* + * Mark entry to slowpath before doing the pickup test to make + * sure we don't deadlock with an unlocker. + */ + __ticket_enter_slowpath(lock); - /* - * If we interrupted another spinlock while it - * was blocking, make sure it doesn't block - * without rechecking the lock. - */ - if (prev != NULL) - xen_set_irq_pending(irq); - goto out; - } + /* + * check again make sure it didn't become free while + * we weren't looking + */ + if (ACCESS_ONCE(lock->tickets.head) == want) { + add_stats(TAKEN_SLOW_PICKUP, 1); + goto out; + } - flags = arch_local_save_flags(); - if (irq_enable) { - ADD_STATS(taken_slow_irqenable, 1); - raw_local_irq_enable(); - } + /* Allow interrupts while blocked */ + local_irq_restore(flags); - /* - * Block until irq becomes pending. If we're - * interrupted at this point (after the trylock but - * before entering the block), then the nested lock - * handler guarantees that the irq will be left - * pending if there's any chance the lock became free; - * xen_poll_irq() returns immediately if the irq is - * pending. - */ - xen_poll_irq(irq); + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. 
+ */ - raw_local_irq_restore(flags); + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); + add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq)); - ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); - } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ + local_irq_save(flags); kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); - out: - unspinning_lock(xl, prev); - spin_time_accum_blocked(start); - - return ret; -} - -static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - unsigned timeout; - u8 oldval; - u64 start_spin; - - ADD_STATS(taken, 1); - - start_spin = spin_time_start(); - - do { - u64 start_spin_fast = spin_time_start(); - - timeout = TIMEOUT; - - asm("1: xchgb %1,%0\n" - " testb %1,%1\n" - " jz 3f\n" - "2: rep;nop\n" - " cmpb $0,%0\n" - " je 1b\n" - " dec %2\n" - " jnz 2b\n" - "3:\n" - : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) - : "1" (1) - : "memory"); + cpumask_clear_cpu(cpu, &waiting_cpus); + w->lock = NULL; - spin_time_accum_spinning(start_spin_fast); + local_irq_restore(flags); - } while (unlikely(oldval != 0 && - (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); - - spin_time_accum_total(start_spin); -} - -static void xen_spin_lock(struct arch_spinlock *lock) -{ - __xen_spin_lock(lock, false); -} - -static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags) -{ - __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); + spin_time_accum_blocked(start); } +PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning); -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) { int cpu; - ADD_STATS(released_slow, 1); + add_stats(RELEASED_SLOW, 1); + + for_each_cpu(cpu, &waiting_cpus) { + const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); - for_each_online_cpu(cpu) { - /* XXX should mix up next cpu selection */ - if (per_cpu(lock_spinners, cpu) == xl) { - ADD_STATS(released_slow_kicked, 1); + /* Make sure we read lock before want */ + if (ACCESS_ONCE(w->lock) == lock && + ACCESS_ONCE(w->want) == next) { + add_stats(RELEASED_SLOW_KICKED, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; } } } -static void xen_spin_unlock(struct arch_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - ADD_STATS(released, 1); - - smp_wmb(); /* make sure no writes get moved after unlock */ - xl->lock = 0; /* release lock */ - - /* - * Make sure unlock happens before checking for waiting - * spinners. We need a strong barrier to enforce the - * write-read ordering to different memory locations, as the - * CPU makes no implied guarantees about their ordering. 
- */ - mb(); - - if (unlikely(xl->spinners)) - xen_spin_unlock_slow(xl); -} - static irqreturn_t dummy_handler(int irq, void *dev_id) { BUG(); @@ -408,6 +283,8 @@ void xen_uninit_lock_cpu(int cpu) per_cpu(irq_name, cpu) = NULL; } +static bool xen_pvspin __initdata = true; + void __init xen_init_spinlocks(void) { /* @@ -417,15 +294,23 @@ void __init xen_init_spinlocks(void) if (xen_hvm_domain()) return; - BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t)); + if (!xen_pvspin) { + printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); + return; + } - pv_lock_ops.spin_is_locked = xen_spin_is_locked; - pv_lock_ops.spin_is_contended = xen_spin_is_contended; - pv_lock_ops.spin_lock = xen_spin_lock; - pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; - pv_lock_ops.spin_trylock = xen_spin_trylock; - pv_lock_ops.spin_unlock = xen_spin_unlock; + static_key_slow_inc(¶virt_ticketlocks_enabled); + + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); + pv_lock_ops.unlock_kick = xen_unlock_kick; +} + +static __init int xen_parse_nopvspin(char *arg) +{ + xen_pvspin = false; + return 0; } +early_param("xen_nopvspin", xen_parse_nopvspin); #ifdef CONFIG_XEN_DEBUG_FS @@ -442,37 +327,21 @@ static int __init xen_spinlock_debugfs(void) debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); - - debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.taken_slow); - debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, - &spinlock_stats.taken_slow_nested); + &spinlock_stats.contention_stats[TAKEN_SLOW]); debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - &spinlock_stats.taken_slow_pickup); + &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, - &spinlock_stats.taken_slow_spurious); - debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, - &spinlock_stats.taken_slow_irqenable); + &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]); - debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.released_slow); + &spinlock_stats.contention_stats[RELEASED_SLOW]); debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - &spinlock_stats.released_slow_kicked); + &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - debugfs_create_u64("time_spinning", 0444, d_spin_debug, - &spinlock_stats.time_spinning); debugfs_create_u64("time_blocked", 0444, d_spin_debug, &spinlock_stats.time_blocked); - debugfs_create_u64("time_total", 0444, d_spin_debug, - &spinlock_stats.time_total); - debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); - debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index dbfd56446c31..94c0c74434ea 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -91,4 +91,16 @@ config TCG_ST33_I2C To compile this driver as a module, choose M here; the module will be called tpm_stm_st33_i2c. 
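/*
 * A user-space sketch, not part of the patch: kvm_lock_spinning()/
 * kvm_unlock_kick() and xen_lock_spinning()/xen_unlock_kick() above follow
 * one protocol. The waiter publishes a per-cpu (lock, want) record in a
 * strict order (clear lock, set want, set lock), re-checks the ticket, and
 * halts; the unlocker scans the waiters and kicks the CPU whose ticket just
 * came up. Names are assumed, and a POSIX semaphore stands in for
 * halt()/kick, which also papers over the lost-wakeup race that
 * TICKET_SLOWPATH_FLAG and the "mark slowpath before the pickup test"
 * ordering close in the real code:
 */
#include <stdatomic.h>
#include <stdint.h>
#include <semaphore.h>

#define NCPUS 8

typedef struct { _Atomic uint16_t head, tail; } ticketlock;

static struct waiter {
	_Atomic(ticketlock *) lock;	/* lock this CPU blocks on, or NULL */
	_Atomic uint16_t want;		/* ticket it is waiting for */
	sem_t kick;			/* sem_init(&kick, 0, 0) at startup */
} waiters[NCPUS];

static void lock_spinning(int cpu, ticketlock *l, uint16_t want)
{
	struct waiter *w = &waiters[cpu];

	/* "lock" may only be non-NULL while "want" is correct, so clear
	 * lock, set want, then set lock, as the smp_wmb() pairs enforce
	 * in the patch. */
	atomic_store(&w->lock, NULL);
	atomic_store(&w->want, want);
	atomic_store(&w->lock, l);

	/* Pickup test: the ticket may have come up while we registered. */
	if (atomic_load(&l->head) != want)
		sem_wait(&w->kick);	/* halt() until kicked; a spurious
					 * wakeup is fine, the caller's spin
					 * loop re-checks head */

	atomic_store(&w->lock, NULL);
}

static void unlock_kick(ticketlock *l, uint16_t next)
{
	for (int cpu = 0; cpu < NCPUS; cpu++) {
		struct waiter *w = &waiters[cpu];

		if (atomic_load(&w->lock) == l &&
		    atomic_load(&w->want) == next) {
			sem_post(&w->kick);	/* KVM_HC_KICK_CPU or the
						 * XEN_SPIN_UNLOCK_VECTOR IPI */
			break;
		}
	}
}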
+config TCG_XEN + tristate "XEN TPM Interface" + depends on TCG_TPM && XEN + select XEN_XENBUS_FRONTEND + ---help--- + If you want to make TPM support available to a Xen user domain, + say Yes and it will be accessible from within Linux. See + the manpages for xl, xl.conf, and docs/misc/vtpm.txt in + the Xen source repository for more details. + To compile this driver as a module, choose M here; the module + will be called xen-tpmfront. + endif # TCG_TPM diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile index a3736c97c65a..eb41ff97d0ad 100644 --- a/drivers/char/tpm/Makefile +++ b/drivers/char/tpm/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_TCG_ATMEL) += tpm_atmel.o obj-$(CONFIG_TCG_INFINEON) += tpm_infineon.o obj-$(CONFIG_TCG_IBMVTPM) += tpm_ibmvtpm.o obj-$(CONFIG_TCG_ST33_I2C) += tpm_i2c_stm_st33.o +obj-$(CONFIG_TCG_XEN) += xen-tpmfront.o diff --git a/drivers/char/tpm/xen-tpmfront.c b/drivers/char/tpm/xen-tpmfront.c new file mode 100644 index 000000000000..7a7929ba2658 --- /dev/null +++ b/drivers/char/tpm/xen-tpmfront.c @@ -0,0 +1,473 @@ +/* + * Implementation of the Xen vTPM device frontend + * + * Author: Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + */ +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/interrupt.h> +#include <xen/events.h> +#include <xen/interface/io/tpmif.h> +#include <xen/grant_table.h> +#include <xen/xenbus.h> +#include <xen/page.h> +#include "tpm.h" + +struct tpm_private { + struct tpm_chip *chip; + struct xenbus_device *dev; + + struct vtpm_shared_page *shr; + + unsigned int evtchn; + int ring_ref; + domid_t backend_id; +}; + +enum status_bits { + VTPM_STATUS_RUNNING = 0x1, + VTPM_STATUS_IDLE = 0x2, + VTPM_STATUS_RESULT = 0x4, + VTPM_STATUS_CANCELED = 0x8, +}; + +static u8 vtpm_status(struct tpm_chip *chip) +{ + struct tpm_private *priv = TPM_VPRIV(chip); + switch (priv->shr->state) { + case VTPM_STATE_IDLE: + return VTPM_STATUS_IDLE | VTPM_STATUS_CANCELED; + case VTPM_STATE_FINISH: + return VTPM_STATUS_IDLE | VTPM_STATUS_RESULT; + case VTPM_STATE_SUBMIT: + case VTPM_STATE_CANCEL: /* cancel requested, not yet canceled */ + return VTPM_STATUS_RUNNING; + default: + return 0; + } +} + +static bool vtpm_req_canceled(struct tpm_chip *chip, u8 status) +{ + return status & VTPM_STATUS_CANCELED; +} + +static void vtpm_cancel(struct tpm_chip *chip) +{ + struct tpm_private *priv = TPM_VPRIV(chip); + priv->shr->state = VTPM_STATE_CANCEL; + wmb(); + notify_remote_via_evtchn(priv->evtchn); +} + +static unsigned int shr_data_offset(struct vtpm_shared_page *shr) +{ + return sizeof(*shr) + sizeof(u32) * shr->nr_extra_pages; +} + +static int vtpm_send(struct tpm_chip *chip, u8 *buf, size_t count) +{ + struct tpm_private *priv = TPM_VPRIV(chip); + struct vtpm_shared_page *shr = priv->shr; + unsigned int offset = shr_data_offset(shr); + + u32 ordinal; + unsigned long duration; + + if (offset > PAGE_SIZE) + return -EINVAL; + + if (offset + count > PAGE_SIZE) + return -EINVAL; + + /* Wait for completion of any existing command or cancellation */ + if (wait_for_tpm_stat(chip, VTPM_STATUS_IDLE, chip->vendor.timeout_c, + &chip->vendor.read_queue, true) < 0) { + vtpm_cancel(chip); + return -ETIME; + } + + memcpy(offset + (u8 *)shr, buf, count); + shr->length = count; + barrier(); + shr->state = VTPM_STATE_SUBMIT; + wmb(); + notify_remote_via_evtchn(priv->evtchn); + + 
ordinal = be32_to_cpu(((struct tpm_input_header*)buf)->ordinal); + duration = tpm_calc_ordinal_duration(chip, ordinal); + + if (wait_for_tpm_stat(chip, VTPM_STATUS_IDLE, duration, + &chip->vendor.read_queue, true) < 0) { + /* got a signal or timeout, try to cancel */ + vtpm_cancel(chip); + return -ETIME; + } + + return count; +} + +static int vtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count) +{ + struct tpm_private *priv = TPM_VPRIV(chip); + struct vtpm_shared_page *shr = priv->shr; + unsigned int offset = shr_data_offset(shr); + size_t length = shr->length; + + if (shr->state == VTPM_STATE_IDLE) + return -ECANCELED; + + /* In theory the wait at the end of _send makes this one unnecessary */ + if (wait_for_tpm_stat(chip, VTPM_STATUS_RESULT, chip->vendor.timeout_c, + &chip->vendor.read_queue, true) < 0) { + vtpm_cancel(chip); + return -ETIME; + } + + if (offset > PAGE_SIZE) + return -EIO; + + if (offset + length > PAGE_SIZE) + length = PAGE_SIZE - offset; + + if (length > count) + length = count; + + memcpy(buf, offset + (u8 *)shr, length); + + return length; +} + +ssize_t tpm_show_locality(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + struct tpm_private *priv = TPM_VPRIV(chip); + u8 locality = priv->shr->locality; + + return sprintf(buf, "%d\n", locality); +} + +ssize_t tpm_store_locality(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + struct tpm_private *priv = TPM_VPRIV(chip); + u8 val; + + int rv = kstrtou8(buf, 0, &val); + if (rv) + return rv; + + priv->shr->locality = val; + + return len; +} + +static const struct file_operations vtpm_ops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .open = tpm_open, + .read = tpm_read, + .write = tpm_write, + .release = tpm_release, +}; + +static DEVICE_ATTR(pubek, S_IRUGO, tpm_show_pubek, NULL); +static DEVICE_ATTR(pcrs, S_IRUGO, tpm_show_pcrs, NULL); +static DEVICE_ATTR(enabled, S_IRUGO, tpm_show_enabled, NULL); +static DEVICE_ATTR(active, S_IRUGO, tpm_show_active, NULL); +static DEVICE_ATTR(owned, S_IRUGO, tpm_show_owned, NULL); +static DEVICE_ATTR(temp_deactivated, S_IRUGO, tpm_show_temp_deactivated, + NULL); +static DEVICE_ATTR(caps, S_IRUGO, tpm_show_caps, NULL); +static DEVICE_ATTR(cancel, S_IWUSR | S_IWGRP, NULL, tpm_store_cancel); +static DEVICE_ATTR(durations, S_IRUGO, tpm_show_durations, NULL); +static DEVICE_ATTR(timeouts, S_IRUGO, tpm_show_timeouts, NULL); +static DEVICE_ATTR(locality, S_IRUGO | S_IWUSR, tpm_show_locality, + tpm_store_locality); + +static struct attribute *vtpm_attrs[] = { + &dev_attr_pubek.attr, + &dev_attr_pcrs.attr, + &dev_attr_enabled.attr, + &dev_attr_active.attr, + &dev_attr_owned.attr, + &dev_attr_temp_deactivated.attr, + &dev_attr_caps.attr, + &dev_attr_cancel.attr, + &dev_attr_durations.attr, + &dev_attr_timeouts.attr, + &dev_attr_locality.attr, + NULL, +}; + +static struct attribute_group vtpm_attr_grp = { + .attrs = vtpm_attrs, +}; + +#define TPM_LONG_TIMEOUT (10 * 60 * HZ) + +static const struct tpm_vendor_specific tpm_vtpm = { + .status = vtpm_status, + .recv = vtpm_recv, + .send = vtpm_send, + .cancel = vtpm_cancel, + .req_complete_mask = VTPM_STATUS_IDLE | VTPM_STATUS_RESULT, + .req_complete_val = VTPM_STATUS_IDLE | VTPM_STATUS_RESULT, + .req_canceled = vtpm_req_canceled, + .attr_group = &vtpm_attr_grp, + .miscdev = { + .fops = &vtpm_ops, + }, + .duration = { + TPM_LONG_TIMEOUT, + TPM_LONG_TIMEOUT, + TPM_LONG_TIMEOUT, + }, +}; + +static 
irqreturn_t tpmif_interrupt(int dummy, void *dev_id) +{ + struct tpm_private *priv = dev_id; + + switch (priv->shr->state) { + case VTPM_STATE_IDLE: + case VTPM_STATE_FINISH: + wake_up_interruptible(&priv->chip->vendor.read_queue); + break; + case VTPM_STATE_SUBMIT: + case VTPM_STATE_CANCEL: + default: + break; + } + return IRQ_HANDLED; +} + +static int setup_chip(struct device *dev, struct tpm_private *priv) +{ + struct tpm_chip *chip; + + chip = tpm_register_hardware(dev, &tpm_vtpm); + if (!chip) + return -ENODEV; + + init_waitqueue_head(&chip->vendor.read_queue); + + priv->chip = chip; + TPM_VPRIV(chip) = priv; + + return 0; +} + +/* caller must clean up in case of errors */ +static int setup_ring(struct xenbus_device *dev, struct tpm_private *priv) +{ + struct xenbus_transaction xbt; + const char *message = NULL; + int rv; + + priv->shr = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); + if (!priv->shr) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); + return -ENOMEM; + } + + rv = xenbus_grant_ring(dev, virt_to_mfn(priv->shr)); + if (rv < 0) + return rv; + + priv->ring_ref = rv; + + rv = xenbus_alloc_evtchn(dev, &priv->evtchn); + if (rv) + return rv; + + rv = bind_evtchn_to_irqhandler(priv->evtchn, tpmif_interrupt, 0, + "tpmif", priv); + if (rv <= 0) { + xenbus_dev_fatal(dev, rv, "allocating TPM irq"); + return rv; + } + priv->chip->vendor.irq = rv; + + again: + rv = xenbus_transaction_start(&xbt); + if (rv) { + xenbus_dev_fatal(dev, rv, "starting transaction"); + return rv; + } + + rv = xenbus_printf(xbt, dev->nodename, + "ring-ref", "%u", priv->ring_ref); + if (rv) { + message = "writing ring-ref"; + goto abort_transaction; + } + + rv = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", + priv->evtchn); + if (rv) { + message = "writing event-channel"; + goto abort_transaction; + } + + rv = xenbus_printf(xbt, dev->nodename, "feature-protocol-v2", "1"); + if (rv) { + message = "writing feature-protocol-v2"; + goto abort_transaction; + } + + rv = xenbus_transaction_end(xbt, 0); + if (rv == -EAGAIN) + goto again; + if (rv) { + xenbus_dev_fatal(dev, rv, "completing transaction"); + return rv; + } + + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + + abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_error(dev, rv, "%s", message); + + return rv; +} + +static void ring_free(struct tpm_private *priv) +{ + if (!priv) + return; + + if (priv->ring_ref) + gnttab_end_foreign_access(priv->ring_ref, 0, + (unsigned long)priv->shr); + else + free_page((unsigned long)priv->shr); + + if (priv->chip && priv->chip->vendor.irq) + unbind_from_irqhandler(priv->chip->vendor.irq, priv); + + kfree(priv); +} + +static int tpmfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + struct tpm_private *priv; + int rv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating priv structure"); + return -ENOMEM; + } + + rv = setup_chip(&dev->dev, priv); + if (rv) { + kfree(priv); + return rv; + } + + rv = setup_ring(dev, priv); + if (rv) { + tpm_remove_hardware(&dev->dev); + ring_free(priv); + return rv; + } + + tpm_get_timeouts(priv->chip); + + dev_set_drvdata(&dev->dev, priv->chip); + + return rv; +} + +static int tpmfront_remove(struct xenbus_device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(&dev->dev); + struct tpm_private *priv = TPM_VPRIV(chip); + tpm_remove_hardware(&dev->dev); + ring_free(priv); + TPM_VPRIV(chip) = NULL; + return 0; +} + +static int 
tpmfront_resume(struct xenbus_device *dev) +{ + /* A suspend/resume/migrate will interrupt a vTPM anyway */ + tpmfront_remove(dev); + return tpmfront_probe(dev, NULL); +} + +static void backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + int val; + + switch (backend_state) { + case XenbusStateInitialised: + case XenbusStateConnected: + if (dev->state == XenbusStateConnected) + break; + + if (xenbus_scanf(XBT_NIL, dev->otherend, + "feature-protocol-v2", "%d", &val) < 0) + val = 0; + if (!val) { + xenbus_dev_fatal(dev, -EINVAL, + "vTPM protocol 2 required"); + return; + } + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosing: + case XenbusStateClosed: + device_unregister(&dev->dev); + xenbus_frontend_closed(dev); + break; + default: + break; + } +} + +static const struct xenbus_device_id tpmfront_ids[] = { + { "vtpm" }, + { "" } +}; +MODULE_ALIAS("xen:vtpm"); + +static DEFINE_XENBUS_DRIVER(tpmfront, , + .probe = tpmfront_probe, + .remove = tpmfront_remove, + .resume = tpmfront_resume, + .otherend_changed = backend_changed, + ); + +static int __init xen_tpmfront_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + return xenbus_register_frontend(&tpmfront_driver); +} +module_init(xen_tpmfront_init); + +static void __exit xen_tpmfront_exit(void) +{ + xenbus_unregister_driver(&tpmfront_driver); +} +module_exit(xen_tpmfront_exit); + +MODULE_AUTHOR("Daniel De Graaf <dgdegra@tycho.nsa.gov>"); +MODULE_DESCRIPTION("Xen vTPM Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 682210d778bd..e61c36cbb866 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -208,7 +208,7 @@ static int xen_hvm_console_init(void) info = vtermno_to_xencons(HVC_COOKIE); if (!info) { - info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO); + info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL); if (!info) return -ENOMEM; } else if (info->intf != NULL) { @@ -257,7 +257,7 @@ static int xen_pv_console_init(void) info = vtermno_to_xencons(HVC_COOKIE); if (!info) { - info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO); + info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL); if (!info) return -ENOMEM; } else if (info->intf != NULL) { @@ -284,7 +284,7 @@ static int xen_initial_domain_console_init(void) info = vtermno_to_xencons(HVC_COOKIE); if (!info) { - info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL | __GFP_ZERO); + info = kzalloc(sizeof(struct xencons_info), GFP_KERNEL); if (!info) return -ENOMEM; } diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 2a2ef97697b2..3101cf6daf56 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt +#include <linux/cpu.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/errno.h> @@ -52,6 +53,7 @@ #include <linux/notifier.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/percpu-defs.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -90,6 +92,8 @@ EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static DEFINE_PER_CPU(struct page *, balloon_scratch_page); + /* List of ballooned pages, threaded through the mem_map array. 
*/ static LIST_HEAD(ballooned_pages); @@ -412,7 +416,8 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) if (xen_pv_domain() && !PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma(0), 0); + pfn_pte(page_to_pfn(__get_cpu_var(balloon_scratch_page)), + PAGE_KERNEL_RO), 0); BUG_ON(ret); } #endif @@ -425,7 +430,13 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long p; + struct page *pg; + pg = __get_cpu_var(balloon_scratch_page); + p = page_to_pfn(pg); + __set_phys_to_machine(pfn, pfn_to_mfn(p)); + } balloon_append(pfn_to_page(pfn)); } @@ -480,6 +491,18 @@ static void balloon_process(struct work_struct *work) mutex_unlock(&balloon_mutex); } +struct page *get_balloon_scratch_page(void) +{ + struct page *ret = get_cpu_var(balloon_scratch_page); + BUG_ON(ret == NULL); + return ret; +} + +void put_balloon_scratch_page(void) +{ + put_cpu_var(balloon_scratch_page); +} + /* Resets the Xen limit, sets new target, and kicks off processing. */ void balloon_set_new_target(unsigned long target) { @@ -573,13 +596,47 @@ static void __init balloon_add_region(unsigned long start_pfn, } } +static int __cpuinit balloon_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + if (per_cpu(balloon_scratch_page, cpu) != NULL) + break; + per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); + if (per_cpu(balloon_scratch_page, cpu) == NULL) { + pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); + return NOTIFY_BAD; + } + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block balloon_cpu_notifier __cpuinitdata = { + .notifier_call = balloon_cpu_notify, +}; + static int __init balloon_init(void) { - int i; + int i, cpu; if (!xen_domain()) return -ENODEV; + for_each_online_cpu(cpu) + { + per_cpu(balloon_scratch_page, cpu) = alloc_page(GFP_KERNEL); + if (per_cpu(balloon_scratch_page, cpu) == NULL) { + pr_warn("Failed to allocate balloon_scratch_page for cpu %d\n", cpu); + return -ENOMEM; + } + } + register_cpu_notifier(&balloon_cpu_notifier); + pr_info("Initialising balloon driver\n"); balloon_stats.current_pages = xen_pv_domain() @@ -616,4 +673,15 @@ static int __init balloon_init(void) subsys_initcall(balloon_init); +static int __init balloon_clear(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(balloon_scratch_page, cpu) = NULL; + + return 0; +} +early_initcall(balloon_clear); + MODULE_LICENSE("GPL"); diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 5e8be462aed5..4035e833ea26 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -56,6 +56,7 @@ #include <xen/interface/hvm/params.h> #include <xen/interface/physdev.h> #include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> #include <asm/hw_irq.h> /* @@ -1212,7 +1213,17 @@ EXPORT_SYMBOL_GPL(evtchn_put); void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) { - int irq = per_cpu(ipi_to_irq, cpu)[vector]; + int irq; + +#ifdef CONFIG_X86 + if (unlikely(vector == XEN_NMI_VECTOR)) { + int rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, cpu, NULL); + if (rc < 0) + printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, 
rc); + return; + } +#endif + irq = per_cpu(ipi_to_irq, cpu)[vector]; BUG_ON(irq < 0); notify_remote_via_irq(irq); } @@ -1379,14 +1390,21 @@ static void __xen_evtchn_do_upcall(void) pending_bits = active_evtchns(cpu, s, word_idx); bit_idx = 0; /* usually scan entire word from start */ + /* + * We scan the starting word in two parts. + * + * 1st time: start in the middle, scanning the + * upper bits. + * + * 2nd time: scan the whole word (not just the + * parts skipped in the first pass) -- if an + * event in the previously scanned bits is + * pending again it would just be scanned on + * the next loop anyway. + */ if (word_idx == start_word_idx) { - /* We scan the starting word in two parts */ if (i == 0) - /* 1st time: start in the middle */ bit_idx = start_bit_idx; - else - /* 2nd time: mask bits done already */ - bit_idx &= (1UL << start_bit_idx) - 1; } do { diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index b6165e047f48..8b3a69a06c39 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -57,6 +57,7 @@ struct per_user_data { struct mutex bind_mutex; /* serialize bind/unbind operations */ + struct rb_root evtchns; /* Notification ring, accessed via /dev/xen/evtchn. */ #define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) @@ -64,6 +65,7 @@ struct per_user_data { evtchn_port_t *ring; unsigned int ring_cons, ring_prod, ring_overflow; struct mutex ring_cons_mutex; /* protect against concurrent readers */ + spinlock_t ring_prod_lock; /* protect against concurrent interrupts */ /* Processes wait on this queue when ring is empty. */ wait_queue_head_t evtchn_wait; @@ -71,54 +73,79 @@ struct per_user_data { const char *name; }; -/* - * Who's bound to each port? This is logically an array of struct - * per_user_data *, but we encode the current enabled-state in bit 0. - */ -static unsigned long *port_user; -static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ +struct user_evtchn { + struct rb_node node; + struct per_user_data *user; + unsigned port; + bool enabled; +}; -static inline struct per_user_data *get_port_user(unsigned port) +static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - return (struct per_user_data *)(port_user[port] & ~1); -} + struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL; -static inline void set_port_user(unsigned port, struct per_user_data *u) -{ - port_user[port] = (unsigned long)u; + while (*new) { + struct user_evtchn *this; + + this = container_of(*new, struct user_evtchn, node); + + parent = *new; + if (this->port < evtchn->port) + new = &((*new)->rb_left); + else if (this->port > evtchn->port) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. 
*/ + rb_link_node(&evtchn->node, parent, new); + rb_insert_color(&evtchn->node, &u->evtchns); + + return 0; } -static inline bool get_port_enabled(unsigned port) +static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) { - return port_user[port] & 1; + rb_erase(&evtchn->node, &u->evtchns); + kfree(evtchn); } -static inline void set_port_enabled(unsigned port, bool enabled) +static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port) { - if (enabled) - port_user[port] |= 1; - else - port_user[port] &= ~1; + struct rb_node *node = u->evtchns.rb_node; + + while (node) { + struct user_evtchn *evtchn; + + evtchn = container_of(node, struct user_evtchn, node); + + if (evtchn->port < port) + node = node->rb_left; + else if (evtchn->port > port) + node = node->rb_right; + else + return evtchn; + } + return NULL; } static irqreturn_t evtchn_interrupt(int irq, void *data) { - unsigned int port = (unsigned long)data; - struct per_user_data *u; - - spin_lock(&port_user_lock); - - u = get_port_user(port); + struct user_evtchn *evtchn = data; + struct per_user_data *u = evtchn->user; - WARN(!get_port_enabled(port), + WARN(!evtchn->enabled, "Interrupt for port %d, but apparently not enabled; per-user %p\n", - port, u); + evtchn->port, u); disable_irq_nosync(irq); - set_port_enabled(port, false); + evtchn->enabled = false; + + spin_lock(&u->ring_prod_lock); if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { - u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = evtchn->port; wmb(); /* Ensure ring contents visible */ if (u->ring_cons == u->ring_prod++) { wake_up_interruptible(&u->evtchn_wait); @@ -128,7 +155,7 @@ static irqreturn_t evtchn_interrupt(int irq, void *data) } else u->ring_overflow = 1; - spin_unlock(&port_user_lock); + spin_unlock(&u->ring_prod_lock); return IRQ_HANDLED; } @@ -229,20 +256,20 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, if (copy_from_user(kbuf, buf, count) != 0) goto out; - spin_lock_irq(&port_user_lock); + mutex_lock(&u->bind_mutex); for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { unsigned port = kbuf[i]; + struct user_evtchn *evtchn; - if (port < NR_EVENT_CHANNELS && - get_port_user(port) == u && - !get_port_enabled(port)) { - set_port_enabled(port, true); + evtchn = find_evtchn(u, port); + if (evtchn && !evtchn->enabled) { + evtchn->enabled = true; enable_irq(irq_from_evtchn(port)); } } - spin_unlock_irq(&port_user_lock); + mutex_unlock(&u->bind_mutex); rc = count; @@ -253,6 +280,8 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, static int evtchn_bind_to_user(struct per_user_data *u, int port) { + struct user_evtchn *evtchn; + struct evtchn_close close; int rc = 0; /* @@ -263,35 +292,46 @@ static int evtchn_bind_to_user(struct per_user_data *u, int port) * interrupt handler yet, and our caller has already * serialized bind operations.) 
*/ - BUG_ON(get_port_user(port) != NULL); - set_port_user(port, u); - set_port_enabled(port, true); /* start enabled */ + + evtchn = kzalloc(sizeof(*evtchn), GFP_KERNEL); + if (!evtchn) + return -ENOMEM; + + evtchn->user = u; + evtchn->port = port; + evtchn->enabled = true; /* start enabled */ + + rc = add_evtchn(u, evtchn); + if (rc < 0) + goto err; rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, - u->name, (void *)(unsigned long)port); - if (rc >= 0) - rc = evtchn_make_refcounted(port); - else { - /* bind failed, should close the port now */ - struct evtchn_close close; - close.port = port; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - set_port_user(port, NULL); - } + u->name, evtchn); + if (rc < 0) + goto err; + rc = evtchn_make_refcounted(port); + return rc; + +err: + /* bind failed, should close the port now */ + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + del_evtchn(u, evtchn); return rc; } -static void evtchn_unbind_from_user(struct per_user_data *u, int port) +static void evtchn_unbind_from_user(struct per_user_data *u, + struct user_evtchn *evtchn) { - int irq = irq_from_evtchn(port); + int irq = irq_from_evtchn(evtchn->port); BUG_ON(irq < 0); - unbind_from_irqhandler(irq, (void *)(unsigned long)port); + unbind_from_irqhandler(irq, evtchn); - set_port_user(port, NULL); + del_evtchn(u, evtchn); } static long evtchn_ioctl(struct file *file, @@ -370,6 +410,7 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_UNBIND: { struct ioctl_evtchn_unbind unbind; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(&unbind, uarg, sizeof(unbind))) @@ -380,29 +421,27 @@ static long evtchn_ioctl(struct file *file, break; rc = -ENOTCONN; - if (get_port_user(unbind.port) != u) + evtchn = find_evtchn(u, unbind.port); + if (!evtchn) break; disable_irq(irq_from_evtchn(unbind.port)); - - evtchn_unbind_from_user(u, unbind.port); - + evtchn_unbind_from_user(u, evtchn); rc = 0; break; } case IOCTL_EVTCHN_NOTIFY: { struct ioctl_evtchn_notify notify; + struct user_evtchn *evtchn; rc = -EFAULT; if (copy_from_user(¬ify, uarg, sizeof(notify))) break; - if (notify.port >= NR_EVENT_CHANNELS) { - rc = -EINVAL; - } else if (get_port_user(notify.port) != u) { - rc = -ENOTCONN; - } else { + rc = -ENOTCONN; + evtchn = find_evtchn(u, notify.port); + if (evtchn) { notify_remote_via_evtchn(notify.port); rc = 0; } @@ -412,9 +451,9 @@ static long evtchn_ioctl(struct file *file, case IOCTL_EVTCHN_RESET: { /* Initialise the ring to empty. Clear errors. 
*/ mutex_lock(&u->ring_cons_mutex); - spin_lock_irq(&port_user_lock); + spin_lock_irq(&u->ring_prod_lock); u->ring_cons = u->ring_prod = u->ring_overflow = 0; - spin_unlock_irq(&port_user_lock); + spin_unlock_irq(&u->ring_prod_lock); mutex_unlock(&u->ring_cons_mutex); rc = 0; break; @@ -473,6 +512,7 @@ static int evtchn_open(struct inode *inode, struct file *filp) mutex_init(&u->bind_mutex); mutex_init(&u->ring_cons_mutex); + spin_lock_init(&u->ring_prod_lock); filp->private_data = u; @@ -481,15 +521,15 @@ static int evtchn_open(struct inode *inode, struct file *filp) static int evtchn_release(struct inode *inode, struct file *filp) { - int i; struct per_user_data *u = filp->private_data; + struct rb_node *node; - for (i = 0; i < NR_EVENT_CHANNELS; i++) { - if (get_port_user(i) != u) - continue; + while ((node = u->evtchns.rb_node)) { + struct user_evtchn *evtchn; - disable_irq(irq_from_evtchn(i)); - evtchn_unbind_from_user(get_port_user(i), i); + evtchn = rb_entry(node, struct user_evtchn, node); + disable_irq(irq_from_evtchn(evtchn->port)); + evtchn_unbind_from_user(u, evtchn); } free_page((unsigned long)u->ring); @@ -523,12 +563,6 @@ static int __init evtchn_init(void) if (!xen_domain()) return -ENODEV; - port_user = kcalloc(NR_EVENT_CHANNELS, sizeof(*port_user), GFP_KERNEL); - if (port_user == NULL) - return -ENOMEM; - - spin_lock_init(&port_user_lock); - /* Create '/dev/xen/evtchn'. */ err = misc_register(&evtchn_miscdev); if (err != 0) { @@ -543,9 +577,6 @@ static int __init evtchn_init(void) static void __exit evtchn_cleanup(void) { - kfree(port_user); - port_user = NULL; - misc_deregister(&evtchn_miscdev); } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index eab5427c75f5..e41c79c986ea 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -272,19 +272,12 @@ static int map_grant_pages(struct grant_map *map) * with find_grant_ptes. 
*/ for (i = 0; i < map->count; i++) { - unsigned level; unsigned long address = (unsigned long) pfn_to_kaddr(page_to_pfn(map->pages[i])); - pte_t *ptep; - u64 pte_maddr = 0; BUG_ON(PageHighMem(map->pages[i])); - ptep = lookup_address(address, &level); - pte_maddr = arbitrary_virt_to_machine(ptep).maddr; - gnttab_set_map_op(&map->kmap_ops[i], pte_maddr, - map->flags | - GNTMAP_host_map | - GNTMAP_contains_pte, + gnttab_set_map_op(&map->kmap_ops[i], address, + map->flags | GNTMAP_host_map, map->grants[i].ref, map->grants[i].domid); } diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 04cdeb8e3719..c4d2298893b1 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -730,9 +730,18 @@ void gnttab_request_free_callback(struct gnttab_free_callback *callback, void (*fn)(void *), void *arg, u16 count) { unsigned long flags; + struct gnttab_free_callback *cb; + spin_lock_irqsave(&gnttab_list_lock, flags); - if (callback->next) - goto out; + + /* Check if the callback is already on the list */ + cb = gnttab_free_callback_list; + while (cb) { + if (cb == callback) + goto out; + cb = cb->next; + } + callback->fn = fn; callback->arg = arg; callback->count = count; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index f8e5dd701ecb..8e74590fa1bb 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -43,9 +43,10 @@ MODULE_LICENSE("GPL"); #define PRIV_VMA_LOCKED ((void *)1) -#ifndef HAVE_ARCH_PRIVCMD_MMAP -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); -#endif +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages); static long privcmd_ioctl_hypercall(void __user *udata) { @@ -225,9 +226,9 @@ static long privcmd_ioctl_mmap(void __user *udata) vma = find_vma(mm, msg->va); rc = -EINVAL; - if (!vma || (msg->va != vma->vm_start) || - !privcmd_enforce_singleshot_mapping(vma)) + if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) goto out_up; + vma->vm_private_data = PRIV_VMA_LOCKED; } state.va = vma->vm_start; @@ -358,7 +359,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) kfree(pages); return -ENOMEM; } - BUG_ON(vma->vm_private_data != PRIV_VMA_LOCKED); + BUG_ON(vma->vm_private_data != NULL); vma->vm_private_data = pages; return 0; @@ -421,19 +422,43 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) vma = find_vma(mm, m.addr); if (!vma || - vma->vm_ops != &privcmd_vm_ops || - (m.addr != vma->vm_start) || - ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || - !privcmd_enforce_singleshot_mapping(vma)) { - up_write(&mm->mmap_sem); + vma->vm_ops != &privcmd_vm_ops) { ret = -EINVAL; - goto out; + goto out_unlock; } - if (xen_feature(XENFEAT_auto_translated_physmap)) { - ret = alloc_empty_pages(vma, m.num); - if (ret < 0) { - up_write(&mm->mmap_sem); - goto out; + + /* + * Caller must either: + * + * Map the whole VMA range, which will also allocate all the + * pages required for the auto_translated_physmap case. + * + * Or + * + * Map unmapped holes left from a previous map attempt (e.g., + * because those foreign frames were previously paged out). 
+ */ + if (vma->vm_private_data == NULL) { + if (m.addr != vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (xen_feature(XENFEAT_auto_translated_physmap)) { + ret = alloc_empty_pages(vma, m.num); + if (ret < 0) + goto out_unlock; + } else + vma->vm_private_data = PRIV_VMA_LOCKED; + } else { + if (m.addr < vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { + ret = -EINVAL; + goto out_unlock; } } @@ -466,8 +491,11 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) out: free_page_list(&pagelist); - return ret; + +out_unlock: + up_write(&mm->mmap_sem); + goto out; } static long privcmd_ioctl(struct file *file, @@ -540,9 +568,24 @@ static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +/* + * For MMAPBATCH*. This allows asserting the singleshot mapping + * on a per pfn/pte basis. Mapping calls that fail with ENOENT + * can then be retried until they succeed. + */ +static int is_mapped_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + return pte_none(*pte) ? 0 : -EBUSY; +} + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages) { - return !cmpxchg(&vma->vm_private_data, NULL, PRIV_VMA_LOCKED); + return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT, + is_mapped_fn, NULL) != 0; } const struct file_operations xen_privcmd_fops = { diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index aadffcf7db9b..1b2277c311d2 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -506,13 +506,13 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, to do proper error handling. 
*/ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); - sgl[0].dma_length = 0; + sg_dma_len(sgl) = 0; return DMA_ERROR_CODE; } sg->dma_address = xen_phys_to_bus(map); } else sg->dma_address = dev_addr; - sg->dma_length = sg->length; + sg_dma_len(sg) = sg->length; } return nelems; } @@ -533,7 +533,7 @@ xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir); } EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); @@ -555,7 +555,7 @@ xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, for_each_sg(sgl, sg, nelems, i) xen_swiotlb_sync_single(hwdev, sg->dma_address, - sg->dma_length, dir, target); + sg_dma_len(sg), dir, target); } void diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 02817a85f877..21e18c18c7a1 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -265,8 +265,10 @@ static ssize_t store_selfballooning(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &tmp); - if (err || ((tmp != 0) && (tmp != 1))) + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) return -EINVAL; xen_selfballooning_enabled = !!tmp; @@ -292,8 +294,10 @@ static ssize_t store_selfballoon_interval(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_interval = val; return count; @@ -314,8 +318,10 @@ static ssize_t store_selfballoon_downhys(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_downhysteresis = val; return count; @@ -337,8 +343,10 @@ static ssize_t store_selfballoon_uphys(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_uphysteresis = val; return count; @@ -360,8 +368,10 @@ static ssize_t store_selfballoon_min_usable_mb(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_min_usable_mb = val; return count; @@ -384,8 +394,10 @@ static ssize_t store_selfballoon_reserved_mb(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; selfballoon_reserved_mb = val; return count; @@ -410,8 +422,10 @@ static ssize_t store_frontswap_selfshrinking(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &tmp); - if (err || ((tmp != 0) && (tmp != 1))) + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) return -EINVAL; frontswap_selfshrinking = !!tmp; if (!was_enabled && !xen_selfballooning_enabled && @@ -437,8 +451,10 @@ static ssize_t store_frontswap_inertia(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = 
strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; frontswap_inertia = val; frontswap_inertia_counter = val; @@ -460,8 +476,10 @@ static ssize_t store_frontswap_hysteresis(struct device *dev, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = strict_strtoul(buf, 10, &val); - if (err || val == 0) + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) return -EINVAL; frontswap_hysteresis = val; return count; diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 0976fc46d1e0..a5079072da66 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -48,7 +48,6 @@ #include <linux/types.h> #include <linux/compiler.h> -#include <linux/workqueue.h> #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) @@ -61,12 +60,6 @@ struct static_key { #endif }; -struct static_key_deferred { - struct static_key key; - unsigned long timeout; - struct delayed_work work; -}; - # include <asm/jump_label.h> # define HAVE_JUMP_LABEL #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */ @@ -78,6 +71,7 @@ enum jump_label_type { struct module; +#include <linux/atomic.h> #ifdef HAVE_JUMP_LABEL #define JUMP_LABEL_TRUE_BRANCH 1UL @@ -119,10 +113,7 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry, extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); -extern void static_key_slow_dec_deferred(struct static_key_deferred *key); extern void jump_label_apply_nops(struct module *mod); -extern void -jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); #define STATIC_KEY_INIT_TRUE ((struct static_key) \ { .enabled = ATOMIC_INIT(1), .entries = (void *)1 }) @@ -131,8 +122,6 @@ jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); #else /* !HAVE_JUMP_LABEL */ -#include <linux/atomic.h> - struct static_key { atomic_t enabled; }; @@ -141,10 +130,6 @@ static __always_inline void jump_label_init(void) { } -struct static_key_deferred { - struct static_key key; -}; - static __always_inline bool static_key_false(struct static_key *key) { if (unlikely(atomic_read(&key->enabled)) > 0) @@ -169,11 +154,6 @@ static inline void static_key_slow_dec(struct static_key *key) atomic_dec(&key->enabled); } -static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) -{ - static_key_slow_dec(&key->key); -} - static inline int jump_label_text_reserved(void *start, void *end) { return 0; @@ -187,12 +167,6 @@ static inline int jump_label_apply_nops(struct module *mod) return 0; } -static inline void -jump_label_rate_limit(struct static_key_deferred *key, - unsigned long rl) -{ -} - #define STATIC_KEY_INIT_TRUE ((struct static_key) \ { .enabled = ATOMIC_INIT(1) }) #define STATIC_KEY_INIT_FALSE ((struct static_key) \ diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h new file mode 100644 index 000000000000..113788389b3d --- /dev/null +++ b/include/linux/jump_label_ratelimit.h @@ -0,0 +1,34 @@ +#ifndef _LINUX_JUMP_LABEL_RATELIMIT_H +#define _LINUX_JUMP_LABEL_RATELIMIT_H + +#include <linux/jump_label.h> +#include <linux/workqueue.h> + +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) +struct static_key_deferred { + struct static_key key; + unsigned long timeout; + struct delayed_work work; +}; +#endif + +#ifdef HAVE_JUMP_LABEL +extern void 
static_key_slow_dec_deferred(struct static_key_deferred *key); +extern void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); + +#else /* !HAVE_JUMP_LABEL */ +struct static_key_deferred { + struct static_key key; +}; +static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) +{ + static_key_slow_dec(&key->key); +} +static inline void +jump_label_rate_limit(struct static_key_deferred *key, + unsigned long rl) +{ +} +#endif /* HAVE_JUMP_LABEL */ +#endif /* _LINUX_JUMP_LABEL_RATELIMIT_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c43f6eabad5b..226be8da3f85 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -48,6 +48,7 @@ struct perf_guest_info_callbacks { #include <linux/cpu.h> #include <linux/irq_work.h> #include <linux/static_key.h> +#include <linux/jump_label_ratelimit.h> #include <linux/atomic.h> #include <linux/sysfs.h> #include <linux/perf_regs.h> diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h index cea2c5c72d26..2841f86eae0b 100644 --- a/include/uapi/linux/kvm_para.h +++ b/include/uapi/linux/kvm_para.h @@ -19,6 +19,7 @@ #define KVM_HC_MMU_OP 2 #define KVM_HC_FEATURES 3 #define KVM_HC_PPC_MAP_MAGIC_PAGE 4 +#define KVM_HC_KICK_CPU 5 /* * hypercalls use architecture specific diff --git a/include/xen/balloon.h b/include/xen/balloon.h index cc2e1a7e44ec..a4c1c6a93691 100644 --- a/include/xen/balloon.h +++ b/include/xen/balloon.h @@ -29,6 +29,9 @@ int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem); void free_xenballooned_pages(int nr_pages, struct page **pages); +struct page *get_balloon_scratch_page(void); +void put_balloon_scratch_page(void); + struct device; #ifdef CONFIG_XEN_SELFBALLOONING extern int register_xen_selfballooning(struct device *dev); diff --git a/include/xen/interface/io/tpmif.h b/include/xen/interface/io/tpmif.h new file mode 100644 index 000000000000..28e7dcd75e82 --- /dev/null +++ b/include/xen/interface/io/tpmif.h @@ -0,0 +1,52 @@ +/****************************************************************************** + * tpmif.h + * + * TPM I/O interface for Xen guest OSes, v2 + * + * This file is in the public domain. + * + */ + +#ifndef __XEN_PUBLIC_IO_TPMIF_H__ +#define __XEN_PUBLIC_IO_TPMIF_H__ + +/* + * Xenbus state machine + * + * Device open: + * 1. Both ends start in XenbusStateInitialising + * 2. Backend transitions to InitWait (frontend does not wait on this step) + * 3. Frontend populates ring-ref, event-channel, feature-protocol-v2 + * 4. Frontend transitions to Initialised + * 5. Backend maps grant and event channel, verifies feature-protocol-v2 + * 6. Backend transitions to Connected + * 7. Frontend verifies feature-protocol-v2, transitions to Connected + * + * Device close: + * 1. State is changed to XenbusStateClosing + * 2. Frontend transitions to Closed + * 3. Backend unmaps grant and event, changes state to InitWait + */ + +enum vtpm_shared_page_state { + VTPM_STATE_IDLE, /* no contents / vTPM idle / cancel complete */ + VTPM_STATE_SUBMIT, /* request ready / vTPM working */ + VTPM_STATE_FINISH, /* response ready / vTPM idle */ + VTPM_STATE_CANCEL, /* cancel requested / vTPM working */ +}; +/* The backend should only change state to IDLE or FINISH, while the + * frontend should only change to SUBMIT or CANCEL. 
*/ + + +struct vtpm_shared_page { + uint32_t length; /* request/response length in bytes */ + + uint8_t state; /* enum vtpm_shared_page_state */ + uint8_t locality; /* for the current request */ + uint8_t pad; + + uint8_t nr_extra_pages; /* extra pages for long packets; may be zero */ + uint32_t extra_pages[0]; /* grant IDs; length in nr_extra_pages */ +}; + +#endif diff --git a/include/xen/interface/vcpu.h b/include/xen/interface/vcpu.h index 87e6f8a48661..b05288ce3991 100644 --- a/include/xen/interface/vcpu.h +++ b/include/xen/interface/vcpu.h @@ -170,4 +170,6 @@ struct vcpu_register_vcpu_info { }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_register_vcpu_info); +/* Send an NMI to the specified VCPU. @extra_arg == NULL. */ +#define VCPUOP_send_nmi 11 #endif /* __XEN_PUBLIC_VCPU_H__ */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 60f48fa0fd0d..297a9247a3b3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -13,6 +13,7 @@ #include <linux/sort.h> #include <linux/err.h> #include <linux/static_key.h> +#include <linux/jump_label_ratelimit.h> #ifdef HAVE_JUMP_LABEL diff --git a/lib/swiotlb.c b/lib/swiotlb.c index d23762e6652c..4e8686c7e5a4 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -870,13 +870,13 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, swiotlb_full(hwdev, sg->length, dir, 0); swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, attrs); - sgl[0].dma_length = 0; + sg_dma_len(sgl) = 0; return 0; } sg->dma_address = phys_to_dma(hwdev, map); } else sg->dma_address = dev_addr; - sg->dma_length = sg->length; + sg_dma_len(sg) = sg->length; } return nelems; } @@ -904,7 +904,7 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); for_each_sg(sgl, sg, nelems, i) - unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir); } EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); @@ -934,7 +934,7 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, for_each_sg(sgl, sg, nelems, i) swiotlb_sync_single(hwdev, sg->dma_address, - sg->dma_length, dir, target); + sg_dma_len(sg), dir, target); } void |
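
A few notes on recurring patterns in the hunks above, with small illustrative sketches. The transaction loop in setup_ring() is the standard xenbus idiom: if another xenstore client commits a conflicting transaction first, xenbus_transaction_end() returns -EAGAIN and the whole read-modify-write sequence is replayed from the top. A minimal userspace sketch of that control flow (the fake_* functions are stand-ins invented here, not xenbus API):

#include <errno.h>
#include <stdio.h>

static int attempts;

/* Hypothetical stand-ins for xenbus_transaction_start()/_end(). */
static int fake_transaction_start(void)
{
	return 0;
}

static int fake_transaction_end(void)
{
	/* Pretend another xenstore writer raced with us on the first try. */
	return ++attempts < 2 ? -EAGAIN : 0;
}

/*
 * Same shape as setup_ring(): start a transaction, write the keys,
 * commit; on -EAGAIN the whole transaction is replayed.
 */
static int publish_ring_info(void)
{
	int rv;

again:
	rv = fake_transaction_start();
	if (rv)
		return rv;

	/* ... xenbus_printf() writes of ring-ref, event-channel, ... */

	rv = fake_transaction_end();
	if (rv == -EAGAIN)
		goto again;
	return rv;
}

int main(void)
{
	printf("publish_ring_info() = %d after %d attempt(s)\n",
	       publish_ring_info(), attempts);
	return 0;
}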
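The evtchn rework replaces the global NR_EVENT_CHANNELS-sized port_user[] array with a per-user red-black tree keyed by port. Note the comparison convention: both add_evtchn() and find_evtchn() descend left when the current node's port is smaller than the one being placed or sought, so larger ports end up in the left subtree. The ordering is unusual, but insert and lookup agree, which is all that matters. A sketch of the same logic, with a plain (unbalanced) binary search tree standing in for the kernel's rb_node so it builds in userspace:

#include <stdio.h>

/* Plain BST stand-in for the kernel's rb_node-based user_evtchn. */
struct user_evtchn {
	struct user_evtchn *left, *right;
	unsigned port;
};

/* Same convention as add_evtchn(): smaller existing port, go left. */
static int add_evtchn(struct user_evtchn **root, struct user_evtchn *e)
{
	struct user_evtchn **new = root;

	while (*new) {
		if ((*new)->port < e->port)
			new = &(*new)->left;
		else if ((*new)->port > e->port)
			new = &(*new)->right;
		else
			return -1;	/* -EEXIST in the driver */
	}
	*new = e;
	return 0;
}

/* find_evtchn() must walk in exactly the same direction. */
static struct user_evtchn *find_evtchn(struct user_evtchn *node, unsigned port)
{
	while (node) {
		if (node->port < port)
			node = node->left;
		else if (node->port > port)
			node = node->right;
		else
			return node;
	}
	return NULL;
}

int main(void)
{
	struct user_evtchn a = { .port = 3 }, b = { .port = 7 }, *root = NULL;

	add_evtchn(&root, &a);
	add_evtchn(&root, &b);
	printf("port 7 %s\n", find_evtchn(root, 7) ? "found" : "missing");
	return 0;
}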
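The grant-table hunk hardens gnttab_request_free_callback() against double-queueing: the old test on callback->next cannot see a callback that is already queued at the tail of the list (the tail's next pointer is NULL), so the function now walks the list explicitly. A userspace sketch of that walk; the list head and demo callback here are invented for illustration:

#include <stdio.h>

struct gnttab_free_callback {
	struct gnttab_free_callback *next;
	void (*fn)(void *);
	void *arg;
};

static struct gnttab_free_callback *callback_list;

/*
 * Mirrors the fixed gnttab_request_free_callback(): walk the whole
 * list and bail out if the callback is already queued, instead of
 * only testing callback->next (NULL for a queued tail element too).
 */
static void request_free_callback(struct gnttab_free_callback *callback,
				  void (*fn)(void *), void *arg)
{
	struct gnttab_free_callback *cb;

	for (cb = callback_list; cb; cb = cb->next)
		if (cb == callback)
			return;		/* already on the list */

	callback->fn = fn;
	callback->arg = arg;
	callback->next = callback_list;
	callback_list = callback;
}

static void hello(void *arg)
{
	printf("callback for %s\n", (const char *)arg);
}

int main(void)
{
	struct gnttab_free_callback cb = { 0 };

	request_free_callback(&cb, hello, (void *)"demo");
	request_free_callback(&cb, hello, (void *)"demo");	/* no-op: queued */

	for (; callback_list; callback_list = callback_list->next)
		callback_list->fn(callback_list->arg);
	return 0;
}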
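Every xen-selfballoon store converted above follows the same shape: return the parse error from kstrtoul() (for example -EINVAL or -ERANGE) to the caller unchanged, then apply the attribute's own range check as a separate step, instead of collapsing both failures into a single -EINVAL as the strict_strtoul() callers did. A userspace analogue using strtoul(), since kstrtoul() has no userspace counterpart:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Parse first, validate second, and keep the two error paths
 * distinct, as the converted sysfs stores now do.
 */
static int parse_nonzero_ul(const char *buf, unsigned long *out)
{
	char *end;
	unsigned long val;

	errno = 0;
	val = strtoul(buf, &end, 10);
	if (errno == ERANGE)
		return -ERANGE;		/* parse error: out of range */
	if (end == buf || *end != '\0')
		return -EINVAL;		/* parse error: not a number */
	if (val == 0)
		return -EINVAL;		/* the store's own validation */
	*out = val;
	return 0;
}

int main(void)
{
	unsigned long v;

	printf("\"42\"  -> %d\n", parse_nonzero_ul("42", &v));
	printf("\"0\"   -> %d\n", parse_nonzero_ul("0", &v));
	printf("\"abc\" -> %d\n", parse_nonzero_ul("abc", &v));
	return 0;
}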
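In the tpmif.h layout above, the request or response payload sits in the shared page immediately after the fixed header and the extra_pages[] grant array; that is the offset xen-tpmfront's vtpm_recv() computes before clamping the length to the page. A standalone sketch of the arithmetic; shr_data_offset() itself is not shown in the hunks above, so the definition below is an assumption inferred from the struct layout:

#include <stdint.h>
#include <stdio.h>

#define VTPM_PAGE_SIZE 4096	/* stand-in for the kernel's PAGE_SIZE */

enum vtpm_shared_page_state {
	VTPM_STATE_IDLE,
	VTPM_STATE_SUBMIT,
	VTPM_STATE_FINISH,
	VTPM_STATE_CANCEL,
};

struct vtpm_shared_page {
	uint32_t length;		/* request/response length in bytes */
	uint8_t state;			/* enum vtpm_shared_page_state */
	uint8_t locality;		/* for the current request */
	uint8_t pad;
	uint8_t nr_extra_pages;		/* extra pages for long packets */
	uint32_t extra_pages[];		/* grant IDs */
};

/* Assumed definition: payload follows the header and the grant IDs. */
static size_t shr_data_offset(const struct vtpm_shared_page *shr)
{
	return sizeof(*shr) + sizeof(uint32_t) * shr->nr_extra_pages;
}

int main(void)
{
	struct vtpm_shared_page shr = {
		.length = 10,
		.state = VTPM_STATE_FINISH,
		.nr_extra_pages = 0,
	};
	size_t offset = shr_data_offset(&shr);
	size_t length = shr.length;

	if (offset > VTPM_PAGE_SIZE)
		return 1;		/* vtpm_recv() returns -EIO here */
	if (offset + length > VTPM_PAGE_SIZE)
		length = VTPM_PAGE_SIZE - offset;	/* clamp, as vtpm_recv() does */

	printf("payload at offset %zu, %zu byte(s)\n", offset, length);
	return 0;
}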