diff options
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/debugfs.c | 104 | ||||
-rw-r--r-- | arch/x86/xen/debugfs.h | 4 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 246 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 62 | ||||
-rw-r--r-- | arch/x86/xen/p2m.c | 140 | ||||
-rw-r--r-- | arch/x86/xen/setup.c | 165 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 114 | ||||
-rw-r--r-- | arch/x86/xen/smp.h | 12 | ||||
-rw-r--r-- | arch/x86/xen/spinlock.c | 12 | ||||
-rw-r--r-- | arch/x86/xen/suspend.c | 2 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 3 |
11 files changed, 592 insertions, 272 deletions
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index ef1db1900d86..c8377fb26cdf 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -19,107 +19,3 @@ struct dentry * __init xen_init_debugfs(void) return d_xen_debug; } -struct array_data -{ - void *array; - unsigned elements; -}; - -static int u32_array_open(struct inode *inode, struct file *file) -{ - file->private_data = NULL; - return nonseekable_open(inode, file); -} - -static size_t format_array(char *buf, size_t bufsize, const char *fmt, - u32 *array, unsigned array_size) -{ - size_t ret = 0; - unsigned i; - - for(i = 0; i < array_size; i++) { - size_t len; - - len = snprintf(buf, bufsize, fmt, array[i]); - len++; /* ' ' or '\n' */ - ret += len; - - if (buf) { - buf += len; - bufsize -= len; - buf[-1] = (i == array_size-1) ? '\n' : ' '; - } - } - - ret++; /* \0 */ - if (buf) - *buf = '\0'; - - return ret; -} - -static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) -{ - size_t len = format_array(NULL, 0, fmt, array, array_size); - char *ret; - - ret = kmalloc(len, GFP_KERNEL); - if (ret == NULL) - return NULL; - - format_array(ret, len, fmt, array, array_size); - return ret; -} - -static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, - loff_t *ppos) -{ - struct inode *inode = file->f_path.dentry->d_inode; - struct array_data *data = inode->i_private; - size_t size; - - if (*ppos == 0) { - if (file->private_data) { - kfree(file->private_data); - file->private_data = NULL; - } - - file->private_data = format_array_alloc("%u", data->array, data->elements); - } - - size = 0; - if (file->private_data) - size = strlen(file->private_data); - - return simple_read_from_buffer(buf, len, ppos, file->private_data, size); -} - -static int xen_array_release(struct inode *inode, struct file *file) -{ - kfree(file->private_data); - - return 0; -} - -static const struct file_operations u32_array_fops = { - .owner = THIS_MODULE, - .open = u32_array_open, - .release= xen_array_release, - .read = u32_array_read, - .llseek = no_llseek, -}; - -struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode, - struct dentry *parent, - u32 *array, unsigned elements) -{ - struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); - - if (data == NULL) - return NULL; - - data->array = array; - data->elements = elements; - - return debugfs_create_file(name, mode, parent, data, &u32_array_fops); -} diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h index 78d25499be5b..12ebf3325c7b 100644 --- a/arch/x86/xen/debugfs.h +++ b/arch/x86/xen/debugfs.h @@ -3,8 +3,4 @@ struct dentry * __init xen_init_debugfs(void); -struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode, - struct dentry *parent, - u32 *array, unsigned elements); - #endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c0f5facdb10c..bf4bda6d3e9a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,6 +31,7 @@ #include <linux/pci.h> #include <linux/gfp.h> #include <linux/memblock.h> +#include <linux/syscore_ops.h> #include <xen/xen.h> #include <xen/interface/xen.h> @@ -38,10 +39,12 @@ #include <xen/interface/physdev.h> #include <xen/interface/vcpu.h> #include <xen/interface/memory.h> +#include <xen/interface/xen-mca.h> #include <xen/features.h> #include <xen/page.h> #include <xen/hvm.h> #include <xen/hvc-console.h> +#include <xen/acpi.h> #include <asm/paravirt.h> #include <asm/apic.h> @@ -75,6 +78,7 @@ #include "xen-ops.h" #include "mmu.h" +#include "smp.h" #include "multicalls.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -105,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback); * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. */ -struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; +struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; /* * Flag to determine whether vcpu info placement is available on all @@ -122,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; +struct tls_descs { + struct desc_struct desc[3]; +}; + +/* + * Updating the 3 TLS descriptors in the GDT on every task switch is + * surprisingly expensive so we avoid updating them if they haven't + * changed. Since Xen writes different descriptors than the one + * passed in the update_descriptor hypercall we keep shadow copies to + * compare against. + */ +static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); + static void clamp_max_cpus(void) { #ifdef CONFIG_SMP @@ -207,6 +224,9 @@ static void __init xen_banner(void) xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } +#define CPUID_THERM_POWER_LEAF 6 +#define APERFMPERF_PRESENT 0 + static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; @@ -240,6 +260,11 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *dx = cpuid_leaf5_edx_val; return; + case CPUID_THERM_POWER_LEAF: + /* Disabling APERFMPERF for kernel usage */ + maskecx = ~(1 << APERFMPERF_PRESENT); + break; + case 0xb: /* Suppress extended topology stuff */ maskebx = 0; @@ -331,9 +356,7 @@ static void __init xen_init_cpuid_mask(void) unsigned int xsave_mask; cpuid_leaf1_edx_mask = - ~((1 << X86_FEATURE_MCE) | /* disable MCE */ - (1 << X86_FEATURE_MCA) | /* disable MCA */ - (1 << X86_FEATURE_MTRR) | /* disable MTRR */ + ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ if (!xen_initial_domain()) @@ -530,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) BUG(); } +static inline bool desc_equal(const struct desc_struct *d1, + const struct desc_struct *d2) +{ + return d1->a == d2->a && d1->b == d2->b; +} + static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); - struct multicall_space mc = __xen_mc_entry(0); + struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; + struct desc_struct *gdt; + xmaddr_t maddr; + struct multicall_space mc; + + if (desc_equal(shadow, &t->tls_array[i])) + return; + + *shadow = t->tls_array[i]; + + gdt = get_cpu_gdt_table(cpu); + maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); } @@ -617,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, /* * Look for known traps using IST, and substitute them * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault and - * machine_check, so we should never see them. Warn if + * about. Xen will handle faults like double_fault, + * so we should never see them. Warn if * there's an unexpected IST-using fault handler. */ if (addr == (unsigned long)debug) @@ -633,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, return 0; #ifdef CONFIG_X86_MCE } else if (addr == (unsigned long)machine_check) { - return 0; + /* + * when xen hypervisor inject vMCE to guest, + * use native mce handler to handle it + */ + ; #endif } else { /* Some other trap using IST? */ @@ -883,6 +926,14 @@ static void set_xen_basic_apic_ops(void) apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle; apic->set_apic_id = xen_set_apic_id; apic->get_apic_id = xen_get_apic_id; + +#ifdef CONFIG_SMP + apic->send_IPI_allbutself = xen_send_IPI_allbutself; + apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself; + apic->send_IPI_mask = xen_send_IPI_mask; + apic->send_IPI_all = xen_send_IPI_all; + apic->send_IPI_self = xen_send_IPI_self; +#endif } #endif @@ -1107,6 +1158,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_msr = native_read_msr_safe, .write_msr = xen_write_msr_safe, + .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, @@ -1340,7 +1392,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); - xen_ident_map_ISA(); /* Allocate and initialize top and mid mfn levels for p2m structure */ xen_build_mfn_list_list(); @@ -1400,6 +1451,8 @@ asmlinkage void __init xen_start_kernel(void) /* Make sure ACS will be enabled */ pci_request_acs(); + + xen_acpi_sleep_register(); } #ifdef CONFIG_PCI /* PCI BIOS service won't work from a PV guest. */ @@ -1417,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void) #endif } -static int init_hvm_pv_info(int *major, int *minor) -{ - uint32_t eax, ebx, ecx, edx, pages, msr, base; - u64 pfn; - - base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); - - *major = eax >> 16; - *minor = eax & 0xffff; - printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); - - cpuid(base + 2, &pages, &msr, &ecx, &edx); - - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - - xen_setup_features(); - - pv_info.name = "Xen HVM"; - - xen_domain_type = XEN_HVM_DOMAIN; +#ifdef CONFIG_XEN_PVHVM +/* + * The pfn containing the shared_info is located somewhere in RAM. This + * will cause trouble if the current kernel is doing a kexec boot into a + * new kernel. The new kernel (and its startup code) can not know where + * the pfn is, so it can not reserve the page. The hypervisor will + * continue to update the pfn, and as a result memory corruption occours + * in the new kernel. + * + * One way to work around this issue is to allocate a page in the + * xen-platform pci device's BAR memory range. But pci init is done very + * late and the shared_info page is already in use very early to read + * the pvclock. So moving the pfn from RAM to MMIO is racy because some + * code paths on other vcpus could access the pfn during the small + * window when the old pfn is moved to the new pfn. There is even a + * small window were the old pfn is not backed by a mfn, and during that + * time all reads return -1. + * + * Because it is not known upfront where the MMIO region is located it + * can not be used right from the start in xen_hvm_init_shared_info. + * + * To minimise trouble the move of the pfn is done shortly before kexec. + * This does not eliminate the race because all vcpus are still online + * when the syscore_ops will be called. But hopefully there is no work + * pending at this point in time. Also the syscore_op is run last which + * reduces the risk further. + */ - return 0; -} +static struct shared_info *xen_hvm_shared_info; -void __ref xen_hvm_init_shared_info(void) +static void xen_hvm_connect_shared_info(unsigned long pfn) { - int cpu; struct xen_add_to_physmap xatp; - static struct shared_info *shared_info_page = 0; - if (!shared_info_page) - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + xatp.gpfn = pfn; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; +} +static void xen_hvm_set_shared_info(struct shared_info *sip) +{ + int cpu; + + HYPERVISOR_shared_info = sip; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. We don't need the vcpu_info placement * optimizations because we don't use any pv_mmu or pv_irq op on * HVM. - * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_init_shared_info is run at resume time too and + * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_set_shared_info is run at resume time too and * in that case multiple vcpus might be online. */ for_each_online_cpu(cpu) { per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; } } -#ifdef CONFIG_XEN_PVHVM +/* Reconnect the shared_info pfn to a mfn */ +void xen_hvm_resume_shared_info(void) +{ + xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); +} + +#ifdef CONFIG_KEXEC +static struct shared_info *xen_hvm_shared_info_kexec; +static unsigned long xen_hvm_shared_info_pfn_kexec; + +/* Remember a pfn in MMIO space for kexec reboot */ +void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn) +{ + xen_hvm_shared_info_kexec = sip; + xen_hvm_shared_info_pfn_kexec = pfn; +} + +static void xen_hvm_syscore_shutdown(void) +{ + struct xen_memory_reservation reservation = { + .domid = DOMID_SELF, + .nr_extents = 1, + }; + unsigned long prev_pfn; + int rc; + + if (!xen_hvm_shared_info_kexec) + return; + + prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT; + set_xen_guest_handle(reservation.extent_start, &prev_pfn); + + /* Move pfn to MMIO, disconnects previous pfn from mfn */ + xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec); + + /* Update pointers, following hypercall is also a memory barrier */ + xen_hvm_set_shared_info(xen_hvm_shared_info_kexec); + + /* Allocate new mfn for previous pfn */ + do { + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + if (rc == 0) + msleep(123); + } while (rc == 0); + + /* Make sure the previous pfn is really connected to a (new) mfn */ + BUG_ON(rc != 1); +} + +static struct syscore_ops xen_hvm_syscore_ops = { + .shutdown = xen_hvm_syscore_shutdown, +}; +#endif + +/* Use a pfn in RAM, may move to MMIO before kexec. */ +static void __init xen_hvm_init_shared_info(void) +{ + /* Remember pointer for resume */ + xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE); + xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); + xen_hvm_set_shared_info(xen_hvm_shared_info); +} + +static void __init init_hvm_pv_info(void) +{ + int major, minor; + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + major = eax >> 16; + minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", major, minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info.name = "Xen HVM"; + + xen_domain_type = XEN_HVM_DOMAIN; +} + static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1497,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { static void __init xen_hvm_guest_init(void) { - int r; - int major, minor; - - r = init_hvm_pv_info(&major, &minor); - if (r < 0) - return; + init_hvm_pv_info(); xen_hvm_init_shared_info(); +#ifdef CONFIG_KEXEC + register_syscore_ops(&xen_hvm_syscore_ops); +#endif if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3506cd4f9a43..27336dfcda8e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) { - if (!xen_batched_set_pte(ptep, pteval)) - native_set_pte(ptep, pteval); + if (!xen_batched_set_pte(ptep, pteval)) { + /* + * Could call native_set_pte() here and trap and + * emulate the PTE write but with 32-bit guests this + * needs two traps (one for each of the two 32-bit + * words in the PTE) so do one hypercall directly + * instead. + */ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); + } } static void xen_set_pte(pte_t *ptep, pte_t pteval) @@ -1416,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) } #endif /* CONFIG_X86_64 */ -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ +/* + * Init-time set_pte while constructing initial pagetables, which + * doesn't allow RO page table pages to be remapped RW. + * + * If there is no MFN for this PFN then this page is initially + * ballooned out so clear the PTE (as in decrease_reservation() in + * drivers/xen/balloon.c). + * + * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive. At + * this point it is not possible to tell if a page is pinned or not, + * so always write the PTE directly and rely on Xen trapping and + * emulating any updates as necessary. + */ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { - pte = mask_rw_pte(ptep, pte); + if (pte_mfn(pte) != INVALID_P2M_ENTRY) + pte = mask_rw_pte(ptep, pte); + else + pte = __pte_ma(0); - xen_set_pte(ptep, pte); + native_set_pte(ptep, pte); } static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) @@ -1933,29 +1960,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } -void __init xen_ident_map_ISA(void) -{ - unsigned long pa; - - /* - * If we're dom0, then linear map the ISA machine addresses into - * the kernel's address space. - */ - if (!xen_initial_domain()) - return; - - xen_raw_printk("Xen: setup ISA identity maps\n"); - - for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) { - pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO); - - if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0)) - BUG(); - } - - xen_flush_tlb(); -} - static void __init xen_post_allocator_init(void) { pv_mmu_ops.set_pte = xen_set_pte; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 1b267e75158d..64effdc6da94 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -499,16 +499,18 @@ static bool alloc_p2m(unsigned long pfn) return true; } -static bool __init __early_alloc_p2m(unsigned long pfn) +static bool __init early_alloc_p2m_middle(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; + unsigned long *p2m; + unsigned long *mid_mfn_p; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); /* Pfff.. No boundary cross-over, lets get out. */ - if (!idx) + if (!idx && check_boundary) return false; WARN(p2m_top[topidx][mididx] == p2m_identity, @@ -522,24 +524,66 @@ static bool __init __early_alloc_p2m(unsigned long pfn) return false; /* Boundary cross-over for the edges: */ - if (idx) { - unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); - unsigned long *mid_mfn_p; + p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_init(p2m); + p2m_init(p2m); - p2m_top[topidx][mididx] = p2m; + p2m_top[topidx][mididx] = p2m; - /* For save/restore we need to MFN of the P2M saved */ - - mid_mfn_p = p2m_top_mfn_p[topidx]; - WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), - "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", - topidx, mididx); - mid_mfn_p[mididx] = virt_to_mfn(p2m); + /* For save/restore we need to MFN of the P2M saved */ + + mid_mfn_p = p2m_top_mfn_p[topidx]; + WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), + "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", + topidx, mididx); + mid_mfn_p[mididx] = virt_to_mfn(p2m); + + return true; +} + +static bool __init early_alloc_p2m(unsigned long pfn) +{ + unsigned topidx = p2m_top_index(pfn); + unsigned long *mid_mfn_p; + unsigned long **mid; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + if (mid == p2m_mid_missing) { + mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + + p2m_mid_init(mid); + p2m_top[topidx] = mid; + + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + } + /* And the save/restore P2M tables.. */ + if (mid_mfn_p == p2m_mid_missing_mfn) { + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + /* Note: we don't set mid_mfn_p[midix] here, + * look in early_alloc_p2m_middle */ } - return idx != 0; + return true; +} +bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + if (!early_alloc_p2m(pfn)) + return false; + + if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) + return false; + + if (!__set_phys_to_machine(pfn, mfn)) + return false; + } + + return true; } unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) @@ -559,35 +603,11 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE)); pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); - unsigned long *mid_mfn_p; - unsigned long **mid; - - mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; - if (mid == p2m_mid_missing) { - mid = extend_brk(PAGE_SIZE, PAGE_SIZE); - - p2m_mid_init(mid); - - p2m_top[topidx] = mid; - - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - } - /* And the save/restore P2M tables.. */ - if (mid_mfn_p == p2m_mid_missing_mfn) { - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - /* Note: we don't set mid_mfn_p[midix] here, - * look in __early_alloc_p2m */ - } + WARN_ON(!early_alloc_p2m(pfn)); } - __early_alloc_p2m(pfn_s); - __early_alloc_p2m(pfn_e); + early_alloc_p2m_middle(pfn_s, true); + early_alloc_p2m_middle(pfn_e, true); for (pfn = pfn_s; pfn < pfn_e; pfn++) if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) @@ -686,6 +706,7 @@ int m2p_add_override(unsigned long mfn, struct page *page, unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; + int ret = 0; pfn = page_to_pfn(page); if (!PageHighMem(page)) { @@ -721,6 +742,24 @@ int m2p_add_override(unsigned long mfn, struct page *page, list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); + /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in + * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other + * pfn so that the following mfn_to_pfn(mfn) calls will return the + * pfn from the m2p_override (the backend pfn) instead. + * We need to do this because the pages shared by the frontend + * (xen-blkfront) can be already locked (lock_page, called by + * do_read_cache_page); when the userspace backend tries to use them + * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so + * do_blockdev_direct_IO is going to try to lock the same pages + * again resulting in a deadlock. + * As a side effect get_user_pages_fast might not be safe on the + * frontend pages while they are being shared with the backend, + * because mfn_to_pfn (that ends up being called by GUPF) will + * return the backend pfn rather than the frontend pfn. */ + ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); + if (ret == 0 && get_phys_to_machine(pfn) == mfn) + set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + return 0; } EXPORT_SYMBOL_GPL(m2p_add_override); @@ -732,6 +771,7 @@ int m2p_remove_override(struct page *page, bool clear_pte) unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; + int ret = 0; pfn = page_to_pfn(page); mfn = get_phys_to_machine(pfn); @@ -801,6 +841,22 @@ int m2p_remove_override(struct page *page, bool clear_pte) } else set_phys_to_machine(pfn, page->index); + /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present + * somewhere in this domain, even before being added to the + * m2p_override (see comment above in m2p_add_override). + * If there are no other entries in the m2p_override corresponding + * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for + * the original pfn (the one shared by the frontend): the backend + * cannot do any IO on this page anymore because it has been + * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of + * the original pfn causes mfn_to_pfn(mfn) to return the frontend + * pfn again. */ + mfn &= ~FOREIGN_FRAME_BIT; + ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); + if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && + m2p_find_override(mfn) == NULL) + set_phys_to_machine(pfn, mfn); + return 0; } EXPORT_SYMBOL_GPL(m2p_remove_override); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1ba8dff26753..ead85576d54a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -26,7 +26,6 @@ #include <xen/interface/memory.h> #include <xen/interface/physdev.h> #include <xen/features.h> - #include "xen-ops.h" #include "vdso.h" @@ -84,8 +83,8 @@ static void __init xen_add_extra_mem(u64 start, u64 size) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } -static unsigned long __init xen_release_chunk(unsigned long start, - unsigned long end) +static unsigned long __init xen_do_chunk(unsigned long start, + unsigned long end, bool release) { struct xen_memory_reservation reservation = { .address_bits = 0, @@ -96,30 +95,133 @@ static unsigned long __init xen_release_chunk(unsigned long start, unsigned long pfn; int ret; - for(pfn = start; pfn < end; pfn++) { + for (pfn = start; pfn < end; pfn++) { + unsigned long frame; unsigned long mfn = pfn_to_mfn(pfn); - /* Make sure pfn exists to start with */ - if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) - continue; - - set_xen_guest_handle(reservation.extent_start, &mfn); + if (release) { + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + frame = mfn; + } else { + if (mfn != INVALID_P2M_ENTRY) + continue; + frame = pfn; + } + set_xen_guest_handle(reservation.extent_start, &frame); reservation.nr_extents = 1; - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, &reservation); - WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); + WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", + release ? "release" : "populate", pfn, ret); + if (ret == 1) { - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { + if (release) + break; + set_xen_guest_handle(reservation.extent_start, &frame); + reservation.nr_extents = 1; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + break; + } len++; - } + } else + break; } - printk(KERN_INFO "Freeing %lx-%lx pfn range: %lu pages freed\n", - start, end, len); + if (len) + printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", + release ? "Freeing" : "Populating", + start, end, len, + release ? "freed" : "added"); return len; } +static unsigned long __init xen_release_chunk(unsigned long start, + unsigned long end) +{ + return xen_do_chunk(start, end, true); +} + +static unsigned long __init xen_populate_chunk( + const struct e820entry *list, size_t map_size, + unsigned long max_pfn, unsigned long *last_pfn, + unsigned long credits_left) +{ + const struct e820entry *entry; + unsigned int i; + unsigned long done = 0; + unsigned long dest_pfn; + + for (i = 0, entry = list; i < map_size; i++, entry++) { + unsigned long s_pfn; + unsigned long e_pfn; + unsigned long pfns; + long capacity; + + if (credits_left <= 0) + break; + + if (entry->type != E820_RAM) + continue; + + e_pfn = PFN_DOWN(entry->addr + entry->size); + + /* We only care about E820 after the xen_start_info->nr_pages */ + if (e_pfn <= max_pfn) + continue; + + s_pfn = PFN_UP(entry->addr); + /* If the E820 falls within the nr_pages, we want to start + * at the nr_pages PFN. + * If that would mean going past the E820 entry, skip it + */ + if (s_pfn <= max_pfn) { + capacity = e_pfn - max_pfn; + dest_pfn = max_pfn; + } else { + capacity = e_pfn - s_pfn; + dest_pfn = s_pfn; + } + + if (credits_left < capacity) + capacity = credits_left; + + pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); + done += pfns; + *last_pfn = (dest_pfn + pfns); + if (pfns < capacity) + break; + credits_left -= pfns; + } + return done; +} + +static void __init xen_set_identity_and_release_chunk( + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long *released, unsigned long *identity) +{ + unsigned long pfn; + + /* + * If the PFNs are currently mapped, the VA mapping also needs + * to be updated to be 1:1. + */ + for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) + (void)HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(pfn, PAGE_KERNEL_IO), 0); + + if (start_pfn < nr_pages) + *released += xen_release_chunk( + start_pfn, min(end_pfn, nr_pages)); + + *identity += set_phys_range_identity(start_pfn, end_pfn); +} + static unsigned long __init xen_set_identity_and_release( const struct e820entry *list, size_t map_size, unsigned long nr_pages) { @@ -142,7 +244,6 @@ static unsigned long __init xen_set_identity_and_release( */ for (i = 0, entry = list; i < map_size; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == map_size - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -150,20 +251,19 @@ static unsigned long __init xen_set_identity_and_release( if (entry->type == E820_RAM) end_pfn = PFN_UP(entry->addr); - if (start_pfn < end_pfn) { - if (start_pfn < nr_pages) - released += xen_release_chunk( - start_pfn, min(end_pfn, nr_pages)); + if (start_pfn < end_pfn) + xen_set_identity_and_release_chunk( + start_pfn, end_pfn, nr_pages, + &released, &identity); - identity += set_phys_range_identity( - start_pfn, end_pfn); - } start = end; } } - printk(KERN_INFO "Released %lu pages of unused memory\n", released); - printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); + if (released) + printk(KERN_INFO "Released %lu pages of unused memory\n", released); + if (identity) + printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity); return released; } @@ -217,7 +317,9 @@ char * __init xen_memory_setup(void) int rc; struct xen_memory_map memmap; unsigned long max_pages; + unsigned long last_pfn = 0; unsigned long extra_pages = 0; + unsigned long populated; int i; int op; @@ -257,8 +359,20 @@ char * __init xen_memory_setup(void) */ xen_released_pages = xen_set_identity_and_release( map, memmap.nr_entries, max_pfn); + + /* + * Populate back the non-RAM pages and E820 gaps that had been + * released. */ + populated = xen_populate_chunk(map, memmap.nr_entries, + max_pfn, &last_pfn, xen_released_pages); + + xen_released_pages -= populated; extra_pages += xen_released_pages; + if (last_pfn > max_pfn) { + max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); + mem_end = PFN_PHYS(max_pfn); + } /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. On non-highmem systems, the base @@ -272,7 +386,6 @@ char * __init xen_memory_setup(void) */ extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages); - i = 0; while (i < memmap.nr_entries) { u64 addr = map[i].addr; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 3700945ed0d5..f58dca7a6e52 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -16,6 +16,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/smp.h> +#include <linux/irq_work.h> #include <asm/paravirt.h> #include <asm/desc.h> @@ -41,10 +42,12 @@ cpumask_var_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, xen_resched_irq); static DEFINE_PER_CPU(int, xen_callfunc_irq); static DEFINE_PER_CPU(int, xen_callfuncsingle_irq); +static DEFINE_PER_CPU(int, xen_irq_work); static DEFINE_PER_CPU(int, xen_debug_irq) = -1; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); /* * Reschedule call back. @@ -77,9 +80,7 @@ static void __cpuinit cpu_bringup(void) notify_cpu_starting(cpu); - ipi_call_lock(); set_cpu_online(cpu, true); - ipi_call_unlock(); this_cpu_write(cpu_state, CPU_ONLINE); @@ -143,6 +144,17 @@ static int xen_smp_intr_init(unsigned int cpu) goto fail; per_cpu(xen_callfuncsingle_irq, cpu) = rc; + callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, + cpu, + xen_irq_work_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(xen_irq_work, cpu) = rc; + return 0; fail: @@ -155,6 +167,8 @@ static int xen_smp_intr_init(unsigned int cpu) if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + if (per_cpu(xen_irq_work, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); return rc; } @@ -407,6 +421,7 @@ static void xen_cpu_die(unsigned int cpu) unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); @@ -469,8 +484,8 @@ static void xen_smp_send_reschedule(int cpu) xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } -static void xen_send_IPI_mask(const struct cpumask *mask, - enum ipi_vector vector) +static void __xen_send_IPI_mask(const struct cpumask *mask, + int vector) { unsigned cpu; @@ -482,7 +497,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask) { int cpu; - xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + __xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); /* Make sure other vcpus get a chance to run if they need to. */ for_each_cpu(cpu, mask) { @@ -495,10 +510,86 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask) static void xen_smp_send_call_function_single_ipi(int cpu) { - xen_send_IPI_mask(cpumask_of(cpu), + __xen_send_IPI_mask(cpumask_of(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); } +static inline int xen_map_vector(int vector) +{ + int xen_vector; + + switch (vector) { + case RESCHEDULE_VECTOR: + xen_vector = XEN_RESCHEDULE_VECTOR; + break; + case CALL_FUNCTION_VECTOR: + xen_vector = XEN_CALL_FUNCTION_VECTOR; + break; + case CALL_FUNCTION_SINGLE_VECTOR: + xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR; + break; + case IRQ_WORK_VECTOR: + xen_vector = XEN_IRQ_WORK_VECTOR; + break; + default: + xen_vector = -1; + printk(KERN_ERR "xen: vector 0x%x is not implemented\n", + vector); + } + + return xen_vector; +} + +void xen_send_IPI_mask(const struct cpumask *mask, + int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + __xen_send_IPI_mask(mask, xen_vector); +} + +void xen_send_IPI_all(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + __xen_send_IPI_mask(cpu_online_mask, xen_vector); +} + +void xen_send_IPI_self(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + xen_send_IPI_one(smp_processor_id(), xen_vector); +} + +void xen_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned cpu; + unsigned int this_cpu = smp_processor_id(); + + if (!(num_online_cpus() > 1)) + return; + + for_each_cpu_and(cpu, mask, cpu_online_mask) { + if (this_cpu == cpu) + continue; + + xen_smp_send_call_function_single_ipi(cpu); + } +} + +void xen_send_IPI_allbutself(int vector) +{ + int xen_vector = xen_map_vector(vector); + + if (xen_vector >= 0) + xen_send_IPI_mask_allbutself(cpu_online_mask, xen_vector); +} + static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { irq_enter(); @@ -519,6 +610,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) +{ + irq_enter(); + irq_work_run(); + inc_irq_stat(apic_irq_work_irqs); + irq_exit(); + + return IRQ_HANDLED; +} + static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, @@ -565,6 +666,7 @@ static void xen_hvm_cpu_die(unsigned int cpu) unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL); native_cpu_die(cpu); } diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h new file mode 100644 index 000000000000..8981a76d081a --- /dev/null +++ b/arch/x86/xen/smp.h @@ -0,0 +1,12 @@ +#ifndef _XEN_SMP_H + +extern void xen_send_IPI_mask(const struct cpumask *mask, + int vector); +extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector); +extern void xen_send_IPI_allbutself(int vector); +extern void physflat_send_IPI_allbutself(int vector); +extern void xen_send_IPI_all(int vector); +extern void xen_send_IPI_self(int vector); + +#endif diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index d69cc6c3f808..83e866d714ce 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -440,12 +440,12 @@ static int __init xen_spinlock_debugfs(void) debugfs_create_u64("time_total", 0444, d_spin_debug, &spinlock_stats.time_total); - xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); - xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); - xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, - spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); + debugfs_create_u32_array("histo_total", 0444, d_spin_debug, + spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); + debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, + spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); + debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); return 0; } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45329c8c226e..ae8a00c39de4 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_init_shared_info(); + xen_hvm_resume_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 45c0c0667bd9..1e4329e04e0f 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -28,7 +28,6 @@ void xen_setup_shared_info(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); -void xen_ident_map_ISA(void); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; @@ -42,7 +41,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); -void xen_hvm_init_shared_info(void); +void xen_hvm_resume_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); |