summaryrefslogtreecommitdiffstats
path: root/arch/x86/xen
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-07-24 13:14:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2012-07-24 13:14:03 -0700
commit62c4d9afa4bcf5315e2745a17a0228bf65b9ba40 (patch)
treea7b9d97283441ea5f0c738fa388e120c4c1491b6 /arch/x86/xen
parent5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a (diff)
parentc3d93f880197953f86ab90d9da4744e926b38e33 (diff)
downloadtalos-obmc-linux-62c4d9afa4bcf5315e2745a17a0228bf65b9ba40.tar.gz
talos-obmc-linux-62c4d9afa4bcf5315e2745a17a0228bf65b9ba40.zip
Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen
Pull Xen update from Konrad Rzeszutek Wilk: "Features: * Performance improvement to lower the amount of traps the hypervisor has to do 32-bit guests. Mainly for setting PTE entries and updating TLS descriptors. * MCE polling driver to collect hypervisor MCE buffer and present them to /dev/mcelog. * Physical CPU online/offline support. When an privileged guest is booted it is present with virtual CPUs, which might have an 1:1 to physical CPUs but usually don't. This provides mechanism to offline/online physical CPUs. Bug-fixes for: * Coverity found fixes in the console and ACPI processor driver. * PVonHVM kexec fixes along with some cleanups. * Pages that fall within E820 gaps and non-RAM regions (and had been released to hypervisor) would be populated back, but potentially in non-RAM regions." * tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen: xen: populate correct number of pages when across mem boundary (v2) xen PVonHVM: move shared_info to MMIO before kexec xen: simplify init_hvm_pv_info xen: remove cast from HYPERVISOR_shared_info assignment xen: enable platform-pci only in a Xen guest xen/pv-on-hvm kexec: shutdown watches from old kernel xen/x86: avoid updating TLS descriptors if they haven't changed xen/x86: add desc_equal() to compare GDT descriptors xen/mm: zero PTEs for non-present MFNs in the initial page table xen/mm: do direct hypercall in xen_set_pte() if batching is unavailable xen/hvc: Fix up checks when the info is allocated. xen/acpi: Fix potential memory leak. xen/mce: add .poll method for mcelog device driver xen/mce: schedule a workqueue to avoid sleep in atomic context xen/pcpu: Xen physical cpus online/offline sys interface xen/mce: Register native mce handler as vMCE bounce back point x86, MCE, AMD: Adjust initcall sequence for xen xen/mce: Add mcelog support for Xen platform
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/enlighten.c224
-rw-r--r--arch/x86/xen/mmu.c39
-rw-r--r--arch/x86/xen/setup.c23
-rw-r--r--arch/x86/xen/suspend.c2
-rw-r--r--arch/x86/xen/xen-ops.h2
5 files changed, 217 insertions, 73 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ed7d54985d0c..bf4bda6d3e9a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
+#include <linux/syscore_ops.h>
#include <xen/xen.h>
#include <xen/interface/xen.h>
@@ -38,6 +39,7 @@
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
@@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
*/
-struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
/*
* Flag to determine whether vcpu info placement is available on all
@@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
*/
static int have_vcpu_info_placement = 1;
+struct tls_descs {
+ struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed. Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
@@ -341,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
unsigned int xsave_mask;
cpuid_leaf1_edx_mask =
- ~((1 << X86_FEATURE_MCE) | /* disable MCE */
- (1 << X86_FEATURE_MCA) | /* disable MCA */
- (1 << X86_FEATURE_MTRR) | /* disable MTRR */
+ ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
if (!xen_initial_domain())
@@ -540,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
BUG();
}
+static inline bool desc_equal(const struct desc_struct *d1,
+ const struct desc_struct *d2)
+{
+ return d1->a == d2->a && d1->b == d2->b;
+}
+
static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
- struct multicall_space mc = __xen_mc_entry(0);
+ struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+ struct desc_struct *gdt;
+ xmaddr_t maddr;
+ struct multicall_space mc;
+
+ if (desc_equal(shadow, &t->tls_array[i]))
+ return;
+
+ *shadow = t->tls_array[i];
+
+ gdt = get_cpu_gdt_table(cpu);
+ maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+ mc = __xen_mc_entry(0);
MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}
@@ -627,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
/*
* Look for known traps using IST, and substitute them
* appropriately. The debugger ones are the only ones we care
- * about. Xen will handle faults like double_fault and
- * machine_check, so we should never see them. Warn if
+ * about. Xen will handle faults like double_fault,
+ * so we should never see them. Warn if
* there's an unexpected IST-using fault handler.
*/
if (addr == (unsigned long)debug)
@@ -643,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
return 0;
#ifdef CONFIG_X86_MCE
} else if (addr == (unsigned long)machine_check) {
- return 0;
+ /*
+ * when xen hypervisor inject vMCE to guest,
+ * use native mce handler to handle it
+ */
+ ;
#endif
} else {
/* Some other trap using IST? */
@@ -1437,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
#endif
}
-static int init_hvm_pv_info(int *major, int *minor)
-{
- uint32_t eax, ebx, ecx, edx, pages, msr, base;
- u64 pfn;
-
- base = xen_cpuid_base();
- cpuid(base + 1, &eax, &ebx, &ecx, &edx);
-
- *major = eax >> 16;
- *minor = eax & 0xffff;
- printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
-
- cpuid(base + 2, &pages, &msr, &ecx, &edx);
-
- pfn = __pa(hypercall_page);
- wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
-
- xen_setup_features();
-
- pv_info.name = "Xen HVM";
-
- xen_domain_type = XEN_HVM_DOMAIN;
+#ifdef CONFIG_XEN_PVHVM
+/*
+ * The pfn containing the shared_info is located somewhere in RAM. This
+ * will cause trouble if the current kernel is doing a kexec boot into a
+ * new kernel. The new kernel (and its startup code) can not know where
+ * the pfn is, so it can not reserve the page. The hypervisor will
+ * continue to update the pfn, and as a result memory corruption occours
+ * in the new kernel.
+ *
+ * One way to work around this issue is to allocate a page in the
+ * xen-platform pci device's BAR memory range. But pci init is done very
+ * late and the shared_info page is already in use very early to read
+ * the pvclock. So moving the pfn from RAM to MMIO is racy because some
+ * code paths on other vcpus could access the pfn during the small
+ * window when the old pfn is moved to the new pfn. There is even a
+ * small window were the old pfn is not backed by a mfn, and during that
+ * time all reads return -1.
+ *
+ * Because it is not known upfront where the MMIO region is located it
+ * can not be used right from the start in xen_hvm_init_shared_info.
+ *
+ * To minimise trouble the move of the pfn is done shortly before kexec.
+ * This does not eliminate the race because all vcpus are still online
+ * when the syscore_ops will be called. But hopefully there is no work
+ * pending at this point in time. Also the syscore_op is run last which
+ * reduces the risk further.
+ */
- return 0;
-}
+static struct shared_info *xen_hvm_shared_info;
-void __ref xen_hvm_init_shared_info(void)
+static void xen_hvm_connect_shared_info(unsigned long pfn)
{
- int cpu;
struct xen_add_to_physmap xatp;
- static struct shared_info *shared_info_page = 0;
- if (!shared_info_page)
- shared_info_page = (struct shared_info *)
- extend_brk(PAGE_SIZE, PAGE_SIZE);
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
- xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+ xatp.gpfn = pfn;
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
- HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+}
+static void xen_hvm_set_shared_info(struct shared_info *sip)
+{
+ int cpu;
+
+ HYPERVISOR_shared_info = sip;
/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
* page, we use it in the event channel upcall and in some pvclock
* related functions. We don't need the vcpu_info placement
* optimizations because we don't use any pv_mmu or pv_irq op on
* HVM.
- * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
- * online but xen_hvm_init_shared_info is run at resume time too and
+ * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
+ * online but xen_hvm_set_shared_info is run at resume time too and
* in that case multiple vcpus might be online. */
for_each_online_cpu(cpu) {
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
}
}
-#ifdef CONFIG_XEN_PVHVM
+/* Reconnect the shared_info pfn to a mfn */
+void xen_hvm_resume_shared_info(void)
+{
+ xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+}
+
+#ifdef CONFIG_KEXEC
+static struct shared_info *xen_hvm_shared_info_kexec;
+static unsigned long xen_hvm_shared_info_pfn_kexec;
+
+/* Remember a pfn in MMIO space for kexec reboot */
+void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
+{
+ xen_hvm_shared_info_kexec = sip;
+ xen_hvm_shared_info_pfn_kexec = pfn;
+}
+
+static void xen_hvm_syscore_shutdown(void)
+{
+ struct xen_memory_reservation reservation = {
+ .domid = DOMID_SELF,
+ .nr_extents = 1,
+ };
+ unsigned long prev_pfn;
+ int rc;
+
+ if (!xen_hvm_shared_info_kexec)
+ return;
+
+ prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
+ set_xen_guest_handle(reservation.extent_start, &prev_pfn);
+
+ /* Move pfn to MMIO, disconnects previous pfn from mfn */
+ xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
+
+ /* Update pointers, following hypercall is also a memory barrier */
+ xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
+
+ /* Allocate new mfn for previous pfn */
+ do {
+ rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+ if (rc == 0)
+ msleep(123);
+ } while (rc == 0);
+
+ /* Make sure the previous pfn is really connected to a (new) mfn */
+ BUG_ON(rc != 1);
+}
+
+static struct syscore_ops xen_hvm_syscore_ops = {
+ .shutdown = xen_hvm_syscore_shutdown,
+};
+#endif
+
+/* Use a pfn in RAM, may move to MMIO before kexec. */
+static void __init xen_hvm_init_shared_info(void)
+{
+ /* Remember pointer for resume */
+ xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+ xen_hvm_set_shared_info(xen_hvm_shared_info);
+}
+
+static void __init init_hvm_pv_info(void)
+{
+ int major, minor;
+ uint32_t eax, ebx, ecx, edx, pages, msr, base;
+ u64 pfn;
+
+ base = xen_cpuid_base();
+ cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+ major = eax >> 16;
+ minor = eax & 0xffff;
+ printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+ cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+ xen_setup_features();
+
+ pv_info.name = "Xen HVM";
+
+ xen_domain_type = XEN_HVM_DOMAIN;
+}
+
static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -1517,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
static void __init xen_hvm_guest_init(void)
{
- int r;
- int major, minor;
-
- r = init_hvm_pv_info(&major, &minor);
- if (r < 0)
- return;
+ init_hvm_pv_info();
xen_hvm_init_shared_info();
+#ifdef CONFIG_KEXEC
+ register_syscore_ops(&xen_hvm_syscore_ops);
+#endif
if (xen_feature(XENFEAT_hvm_callback_vector))
xen_have_vector_callback = 1;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3a73785631ce..27336dfcda8e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
{
- if (!xen_batched_set_pte(ptep, pteval))
- native_set_pte(ptep, pteval);
+ if (!xen_batched_set_pte(ptep, pteval)) {
+ /*
+ * Could call native_set_pte() here and trap and
+ * emulate the PTE write but with 32-bit guests this
+ * needs two traps (one for each of the two 32-bit
+ * words in the PTE) so do one hypercall directly
+ * instead.
+ */
+ struct mmu_update u;
+
+ u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
+ u.val = pte_val_ma(pteval);
+ HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
+ }
}
static void xen_set_pte(pte_t *ptep, pte_t pteval)
@@ -1416,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
}
#endif /* CONFIG_X86_64 */
-/* Init-time set_pte while constructing initial pagetables, which
- doesn't allow RO pagetable pages to be remapped RW */
+/*
+ * Init-time set_pte while constructing initial pagetables, which
+ * doesn't allow RO page table pages to be remapped RW.
+ *
+ * If there is no MFN for this PFN then this page is initially
+ * ballooned out so clear the PTE (as in decrease_reservation() in
+ * drivers/xen/balloon.c).
+ *
+ * Many of these PTE updates are done on unpinned and writable pages
+ * and doing a hypercall for these is unnecessary and expensive. At
+ * this point it is not possible to tell if a page is pinned or not,
+ * so always write the PTE directly and rely on Xen trapping and
+ * emulating any updates as necessary.
+ */
static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{
- pte = mask_rw_pte(ptep, pte);
+ if (pte_mfn(pte) != INVALID_P2M_ENTRY)
+ pte = mask_rw_pte(ptep, pte);
+ else
+ pte = __pte_ma(0);
- xen_set_pte(ptep, pte);
+ native_set_pte(ptep, pte);
}
static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index a4790bf22c59..ead85576d54a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -157,25 +157,24 @@ static unsigned long __init xen_populate_chunk(
unsigned long dest_pfn;
for (i = 0, entry = list; i < map_size; i++, entry++) {
- unsigned long credits = credits_left;
unsigned long s_pfn;
unsigned long e_pfn;
unsigned long pfns;
long capacity;
- if (credits <= 0)
+ if (credits_left <= 0)
break;
if (entry->type != E820_RAM)
continue;
- e_pfn = PFN_UP(entry->addr + entry->size);
+ e_pfn = PFN_DOWN(entry->addr + entry->size);
/* We only care about E820 after the xen_start_info->nr_pages */
if (e_pfn <= max_pfn)
continue;
- s_pfn = PFN_DOWN(entry->addr);
+ s_pfn = PFN_UP(entry->addr);
/* If the E820 falls within the nr_pages, we want to start
* at the nr_pages PFN.
* If that would mean going past the E820 entry, skip it
@@ -184,23 +183,19 @@ static unsigned long __init xen_populate_chunk(
capacity = e_pfn - max_pfn;
dest_pfn = max_pfn;
} else {
- /* last_pfn MUST be within E820_RAM regions */
- if (*last_pfn && e_pfn >= *last_pfn)
- s_pfn = *last_pfn;
capacity = e_pfn - s_pfn;
dest_pfn = s_pfn;
}
- /* If we had filled this E820_RAM entry, go to the next one. */
- if (capacity <= 0)
- continue;
- if (credits > capacity)
- credits = capacity;
+ if (credits_left < capacity)
+ capacity = credits_left;
- pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false);
+ pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
done += pfns;
- credits_left -= pfns;
*last_pfn = (dest_pfn + pfns);
+ if (pfns < capacity)
+ break;
+ credits_left -= pfns;
}
return done;
}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 45329c8c226e..ae8a00c39de4 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
- xen_hvm_init_shared_info();
+ xen_hvm_resume_shared_info();
xen_callback_vector();
xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 202d4c150154..1e4329e04e0f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -41,7 +41,7 @@ void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void xen_callback_vector(void);
-void xen_hvm_init_shared_info(void);
+void xen_hvm_resume_shared_info(void);
void xen_unplug_emulated_devices(void);
void __init xen_build_dynamic_phys_to_machine(void);
OpenPOWER on IntegriCloud