From b0f4c062fb6dd4c02b1fe6de73319ed50a09b27d Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 23 Aug 2010 17:05:57 -0700 Subject: x86, paravirt: Remove alloc_pmd_clone hook, only used by VMI VMI was the only user of the alloc_pmd_clone hook; given that VMI is now removed, we can remove this hook as well. Signed-off-by: Alok N Kataria LKML-Reference: <1282608357.19396.36.camel@ank32.eng.vmware.com> Cc: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/mmu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406af..b2363fcbcd0f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1969,7 +1969,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, .alloc_pmd = xen_alloc_pmd_init, - .alloc_pmd_clone = paravirt_nop, .release_pmd = xen_release_pmd_init, #ifdef CONFIG_X86_64 -- cgit v1.2.1 From 5cb3a267939a223eb84692d229569d2ef493d7ca Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Aug 2010 13:58:01 -0400 Subject: x86, xen-swiotlb: Make Xen-SWIOTLB use IOMMU_INIT_* macros. We utilize the IOMMU_INIT macros to create this dependency: [null] | [pci_xen_swiotlb_detect] | [pci_swiotlb_detect_override] | [pci_swiotlb_detect_4gb] In other words, we set 'pci_xen_swiotlb_detect' to be the first detection routine to run during boot. CC: Fujita Tomonori Cc: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk LKML-Reference: <1282845485-8991-7-git-send-email-konrad.wilk@oracle.com> Signed-off-by: H. Peter Anvin --- arch/x86/xen/pci-swiotlb-xen.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index a013ec9d0c54..22471001b74c 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -5,6 +5,7 @@ #include #include +#include int xen_swiotlb __read_mostly; @@ -56,3 +57,7 @@ void __init pci_xen_swiotlb_init(void) dma_ops = &xen_swiotlb_dma_ops; } } +IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, + 0, + pci_xen_swiotlb_init, + 0); -- cgit v1.2.1 From a9ce6bc15100023b411f8117e53a016d61889800 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 25 Aug 2010 13:39:17 -0700 Subject: x86, memblock: Replace e820_/_early string with memblock_ 1. Include linux/memblock.h directly, so that e820.h references can be reduced later. 2. This patch was produced mainly by sed scripts. -v2: use MEMBLOCK_ERROR instead of -1ULL or -1UL Signed-off-by: Yinghai Lu Signed-off-by: H.
Peter Anvin --- arch/x86/xen/mmu.c | 5 +++-- arch/x86/xen/setup.c | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce5..b511f1986911 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -1735,7 +1736,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, __xen_write_cr3(true, __pa(pgd)); xen_mc_issue(PARAVIRT_LAZY_CPU); - reserve_early(__pa(xen_start_info->pt_base), + memblock_x86_reserve_range(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE), "XEN PAGETABLES"); @@ -1773,7 +1774,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); - reserve_early(__pa(xen_start_info->pt_base), + memblock_x86_reserve_range(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE), "XEN PAGETABLES"); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ad0047f47cd4..2ac8f29f89cb 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -61,7 +62,7 @@ char * __init xen_memory_setup(void) * - xen_start_info * See comment above "struct start_info" in */ - reserve_early(__pa(xen_start_info->mfn_list), + memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), __pa(xen_start_info->pt_base), "XEN START INFO"); -- cgit v1.2.1 From df9ee29270c11dba7d0fe0b83ce47a4d8e8d2101 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 7 Oct 2010 14:08:55 +0100 Subject: Fix IRQ flag handling naming Fix the IRQ flag handling naming. In linux/irqflags.h under one configuration, it maps: local_irq_enable() -> raw_local_irq_enable() local_irq_disable() -> raw_local_irq_disable() local_irq_save() -> raw_local_irq_save() ... and under the other configuration, it maps: raw_local_irq_enable() -> local_irq_enable() raw_local_irq_disable() -> local_irq_disable() raw_local_irq_save() -> local_irq_save() ... This is quite confusing. There should be one set of names expected of the arch, and this should be wrapped to give another set of names that are expected by users of this facility. Change this to have the arch provide: flags = arch_local_save_flags() flags = arch_local_irq_save() arch_local_irq_restore(flags) arch_local_irq_disable() arch_local_irq_enable() arch_irqs_disabled_flags(flags) arch_irqs_disabled() arch_safe_halt() Then linux/irqflags.h wraps these to provide: raw_local_save_flags(flags) raw_local_irq_save(flags) raw_local_irq_restore(flags) raw_local_irq_disable() raw_local_irq_enable() raw_irqs_disabled_flags(flags) raw_irqs_disabled() raw_safe_halt() with type checking on the flags 'arguments', and then wraps those to provide: local_save_flags(flags) local_irq_save(flags) local_irq_restore(flags) local_irq_disable() local_irq_enable() irqs_disabled_flags(flags) irqs_disabled() safe_halt() with tracing included if enabled. The arch functions can now all be inline functions rather than some of them having to be macros. 
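As a condensed illustration of the resulting layering (a minimal sketch, not the actual linux/irqflags.h; the tracing-enabled variant is shown), the generic header wraps the arch functions with type checking at the raw_* level and adds tracing at the local_* level:

/* Sketch: the arch supplies arch_local_irq_save(); the generic
 * header adds type checking (raw_*) and tracing (local_*) on top. */
#define raw_local_irq_save(flags)			\
	do {						\
		typecheck(unsigned long, flags);	\
		flags = arch_local_irq_save();		\
	} while (0)

#define local_irq_save(flags)				\
	do {						\
		raw_local_irq_save(flags);		\
		trace_hardirqs_off();			\
	} while (0)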
Signed-off-by: David Howells [X86, FRV, MN10300] Signed-off-by: Chris Metcalf [Tile] Signed-off-by: Michal Simek [Microblaze] Tested-by: Catalin Marinas [ARM] Acked-by: Thomas Gleixner Acked-by: Haavard Skinnemoen [AVR] Acked-by: Tony Luck [IA-64] Acked-by: Hirokazu Takata [M32R] Acked-by: Greg Ungerer [M68K/M68KNOMMU] Acked-by: Ralf Baechle [MIPS] Acked-by: Kyle McMartin [PA-RISC] Acked-by: Paul Mackerras [PowerPC] Acked-by: Martin Schwidefsky [S390] Acked-by: Chen Liqin [Score] Acked-by: Matt Fleming [SH] Acked-by: David S. Miller [Sparc] Acked-by: Chris Zankel [Xtensa] Reviewed-by: Richard Henderson [Alpha] Reviewed-by: Yoshinori Sato [H8300] Cc: starvik@axis.com [CRIS] Cc: jesper.nilsson@axis.com [CRIS] Cc: linux-cris-kernel@axis.com --- arch/x86/xen/spinlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e0500646585d..23e061b9327b 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -224,7 +224,7 @@ static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enab goto out; } - flags = __raw_local_save_flags(); + flags = arch_local_save_flags(); if (irq_enable) { ADD_STATS(taken_slow_irqenable, 1); raw_local_irq_enable(); -- cgit v1.2.1 From 236260b90dd94516982ad67aa6f5449c4c37db7b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 6 Oct 2010 15:52:29 -0700 Subject: memblock: Allow memblock_init to be called early The Xen setup code needs to call memblock_x86_reserve_range() very early, so allow it to initialize the memblock subsystem before doing so. The second memblock_init() is ignored. Signed-off-by: Jeremy Fitzhardinge Cc: Yinghai Lu Cc: Benjamin Herrenschmidt LKML-Reference: <4CACFDAD.3090900@goop.org> Signed-off-by: H. Peter Anvin --- arch/x86/xen/enlighten.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7d46c8441418..63b83ceebd1a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1183,6 +1184,8 @@ asmlinkage void __init xen_start_kernel(void) local_irq_disable(); early_boot_irqs_off(); + memblock_init(); + xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); -- cgit v1.2.1 From fef5ba797991f9335bcfc295942b684f9bf613a1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 13 Oct 2010 16:02:24 -0700 Subject: xen: Cope with unmapped pages when initializing kernel pagetable Xen requires that all pages containing pagetable entries be mapped read-only. If pages used for the initial pagetable are already mapped, then we can change the mapping to RO. However, if they are initially unmapped, we need to make sure that when they are later mapped, they are also mapped RO. We do this by knowing that the kernel pagetable memory is pre-allocated in the range e820_table_start - e820_table_end, so any pfn within this range should be mapped read-only. However, the pagetable setup code early_ioremaps the pages to write their entries, so we must make sure that mappings created in the early_ioremap fixmap area are mapped RW. (Those mappings are removed before the pages are presented to Xen as pagetable pages.) Signed-off-by: Jeremy Fitzhardinge LKML-Reference: <4CB63A80.8060702@goop.org> Cc: Yinghai Lu Signed-off-by: H.
Peter Anvin --- arch/x86/xen/mmu.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4fe04ac0bae0..7d55e9ee3a76 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -360,7 +361,8 @@ void make_lowmem_page_readonly(void *vaddr) unsigned int level; pte = lookup_address(address, &level); - BUG_ON(pte == NULL); + if (pte == NULL) + return; /* vaddr missing */ ptev = pte_wrprotect(*pte); @@ -375,7 +377,8 @@ void make_lowmem_page_readwrite(void *vaddr) unsigned int level; pte = lookup_address(address, &level); - BUG_ON(pte == NULL); + if (pte == NULL) + return; /* vaddr missing */ ptev = pte_mkwrite(*pte); @@ -1509,13 +1512,25 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { + unsigned long pfn = pte_pfn(pte); + +#ifdef CONFIG_X86_32 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ if (pte_val_ma(*ptep) & _PAGE_PRESENT) pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & pte_val_ma(pte)); +#endif + + /* + * If the new pfn is within the range of the newly allocated + * kernel pagetable, and it isn't being mapped into an + * early_ioremap fixmap slot, make sure it is RO. + */ + if (!is_early_ioremap_ptep(ptep) && + pfn >= e820_table_start && pfn < e820_table_end) + pte = pte_wrprotect(pte); return pte; } @@ -1528,7 +1543,6 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) xen_set_pte(ptep, pte); } -#endif static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) { @@ -1973,11 +1987,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pmd_clone = paravirt_nop, .release_pmd = xen_release_pmd_init, -#ifdef CONFIG_X86_64 - .set_pte = xen_set_pte, -#else .set_pte = xen_set_pte_init, -#endif .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, -- cgit v1.2.1 From 6038f373a3dc1f1c26496e60b6c40b164716f07e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 15 Aug 2010 18:52:59 +0200 Subject: llseek: automatically add .llseek fop All file_operations should get a .llseek operation so we can make nonseekable_open the default for future file operations without a .llseek pointer. The three cases that we can automatically detect are no_llseek, seq_lseek and default_llseek. For cases where we can automatically prove that the file offset is always ignored, we use noop_llseek, which maintains the current behavior of not returning an error from a seek. New drivers should normally not use noop_llseek but instead use no_llseek and call nonseekable_open at open time. Existing drivers can be converted to do the same when the maintainer knows for certain that no user code relies on calling seek on the device file. The generated code is often incorrectly indented and right now contains comments that clarify for each added line why a specific variant was chosen. In the version that gets submitted upstream, the comments will be gone and I will manually fix the indentation, because there does not seem to be a way to do that using coccinelle. Some amount of new code is currently sitting in linux-next that should get the same modifications, which I will do at the end of the merge window. Many thanks to Julia Lawall for helping me learn to write a semantic patch that does all this.
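Before the semantic patch itself (reproduced below), here is the shape of the change it generates for a hypothetical driver whose open method calls nonseekable_open(); all names here are illustrative, not taken from the patch:

/* Hypothetical example of the conversion. Because foo_open() calls
 * nonseekable_open(), the patch adds a no_llseek line to the fops. */
static ssize_t foo_read(struct file *file, char __user *buf,
			size_t len, loff_t *off);

static int foo_open(struct inode *inode, struct file *file)
{
	return nonseekable_open(inode, file);
}

static const struct file_operations foo_fops = {
	.open	= foo_open,
	.read	= foo_read,
	.llseek = no_llseek, /* open uses nonseekable */
};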
===== begin semantic patch ===== // This adds an llseek= method to all file operations, // as a preparation for making no_llseek the default. // // The rules are // - use no_llseek explicitly if we do nonseekable_open // - use seq_lseek for sequential files // - use default_llseek if we know we access f_pos // - use noop_llseek if we know we don't access f_pos, // but we still want to allow users to call lseek // @ open1 exists @ identifier nested_open; @@ nested_open(...) { <+... nonseekable_open(...) ...+> } @ open exists@ identifier open_f; identifier i, f; identifier open1.nested_open; @@ int open_f(struct inode *i, struct file *f) { <+... ( nonseekable_open(...) | nested_open(...) ) ...+> } @ read disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ read_no_fpos disable optional_qualifier exists @ identifier read_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t read_f(struct file *f, char *p, size_t s, loff_t *off) { ... when != off } @ write @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; expression E; identifier func; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { <+... ( *off = E | *off += E | func(..., off, ...) | E = *off ) ...+> } @ write_no_fpos @ identifier write_f; identifier f, p, s, off; type ssize_t, size_t, loff_t; @@ ssize_t write_f(struct file *f, const char *p, size_t s, loff_t *off) { ... when != off } @ fops0 @ identifier fops; @@ struct file_operations fops = { ... }; @ has_llseek depends on fops0 @ identifier fops0.fops; identifier llseek_f; @@ struct file_operations fops = { ... .llseek = llseek_f, ... }; @ has_read depends on fops0 @ identifier fops0.fops; identifier read_f; @@ struct file_operations fops = { ... .read = read_f, ... }; @ has_write depends on fops0 @ identifier fops0.fops; identifier write_f; @@ struct file_operations fops = { ... .write = write_f, ... }; @ has_open depends on fops0 @ identifier fops0.fops; identifier open_f; @@ struct file_operations fops = { ... .open = open_f, ... }; // use no_llseek if we call nonseekable_open //////////////////////////////////////////// @ nonseekable1 depends on !has_llseek && has_open @ identifier fops0.fops; identifier nso ~= "nonseekable_open"; @@ struct file_operations fops = { ... .open = nso, ... +.llseek = no_llseek, /* nonseekable */ }; @ nonseekable2 depends on !has_llseek @ identifier fops0.fops; identifier open.open_f; @@ struct file_operations fops = { ... .open = open_f, ... +.llseek = no_llseek, /* open uses nonseekable */ }; // use seq_lseek for sequential files ///////////////////////////////////// @ seq depends on !has_llseek @ identifier fops0.fops; identifier sr ~= "seq_read"; @@ struct file_operations fops = { ... .read = sr, ... +.llseek = seq_lseek, /* we have seq_read */ }; // use default_llseek if there is a readdir /////////////////////////////////////////// @ fops1 depends on !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier readdir_e; @@ // any other fop is used that changes pos struct file_operations fops = { ... .readdir = readdir_e, ... 
+.llseek = default_llseek, /* readdir is present */ }; // use default_llseek if at least one of read/write touches f_pos ///////////////////////////////////////////////////////////////// @ fops2 depends on !fops1 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read.read_f; @@ // read fops use offset struct file_operations fops = { ... .read = read_f, ... +.llseek = default_llseek, /* read accesses f_pos */ }; @ fops3 depends on !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, ... + .llseek = default_llseek, /* write accesses f_pos */ }; // Use noop_llseek if neither read nor write accesses f_pos /////////////////////////////////////////////////////////// @ fops4 depends on !fops1 && !fops2 && !fops3 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; identifier write_no_fpos.write_f; @@ // write fops use offset struct file_operations fops = { ... .write = write_f, .read = read_f, ... +.llseek = noop_llseek, /* read and write both use no f_pos */ }; @ depends on has_write && !has_read && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier write_no_fpos.write_f; @@ struct file_operations fops = { ... .write = write_f, ... +.llseek = noop_llseek, /* write uses no f_pos */ }; @ depends on has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; identifier read_no_fpos.read_f; @@ struct file_operations fops = { ... .read = read_f, ... +.llseek = noop_llseek, /* read uses no f_pos */ }; @ depends on !has_read && !has_write && !fops1 && !fops2 && !has_llseek && !nonseekable1 && !nonseekable2 && !seq @ identifier fops0.fops; @@ struct file_operations fops = { ... +.llseek = noop_llseek, /* no read or write fn */ }; ===== End semantic patch ===== Signed-off-by: Arnd Bergmann Cc: Julia Lawall Cc: Christoph Hellwig --- arch/x86/xen/debugfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 1304bcec8ee5..7c0fedd98ea0 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = { .open = u32_array_open, .release= xen_array_release, .read = u32_array_read, + .llseek = no_llseek, }; struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, -- cgit v1.2.1 From 23ace955c22cb9bdf703e4bdc9bf7379166113cd Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: xen: Don't disable the I/O space If a guest domain wants to access PCI devices through the frontend driver (coming later in the patch series), it will need access to the I/O space. 
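For context, the call being removed below claims the entire legacy port range so that nothing in the guest can use it; paravirt_disable_iospace() looked roughly like this at the time (a simplified sketch of arch/x86/kernel/paravirt.c, not a verbatim quote):

/* Sketch: reserve all I/O ports as busy so no driver can claim them. */
static struct resource reserve_ioports = {
	.start = 0,
	.end = IO_SPACE_LIMIT,
	.name = "paravirt-ioport",
	.flags = IORESOURCE_IO | IORESOURCE_BUSY,
};

void paravirt_disable_iospace(void)
{
	request_resource(&ioport_resource, &reserve_ioports);
}

Dropping the call leaves the port range free, so a domU can later claim I/O resources for a passed-through PCI device.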
[ Impact: Allow for domU IO access, preparing for pci passthrough ] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 328b00305426..c413132702f7 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -260,7 +260,5 @@ void __init xen_arch_setup(void) pm_idle = xen_idle; - paravirt_disable_iospace(); - fiddle_vdso(); } -- cgit v1.2.1 From b5401a96b59475c1c878439caecb8c521bdfd4ad Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Thu, 18 Mar 2010 16:31:34 -0400 Subject: xen/x86/PCI: Add support for the Xen PCI subsystem The frontend stub lives in arch/x86/pci/xen.c, alongside other sub-arch PCI init code (e.g. olpc.c). It provides a mechanism for Xen PCI frontend to setup/destroy legacy interrupts, MSI/MSI-X, and PCI configuration operations. [ Impact: add core of Xen PCI support ] [ v2: Removed the IOMMU code and focused only on PCI.] [ v3: removed usage of pci_scan_all_fns as that does not exist] [ v4: introduced pci_xen value to fix compile warnings] [ v5: squished fixes+features in one patch, changed Reviewed-by to Ccs] [ v7: added Acked-by] Signed-off-by: Alex Nixon Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Stefano Stabellini Acked-by: Jesse Barnes Cc: "H. Peter Anvin" Cc: Matthew Wilcox Cc: Qing He Cc: Thomas Gleixner Cc: x86@kernel.org --- arch/x86/xen/enlighten.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7d46c8441418..1ccfa1bf0f89 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -1220,6 +1221,8 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + if (pci_xen) + x86_init.pci.arch_init = pci_xen_init; } else { /* Make sure ACS will be enabled */ pci_request_acs(); -- cgit v1.2.1 From 74226b8c8a0b10841129916191205095af928da5 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 19 Aug 2010 13:34:58 -0400 Subject: xen/pci: Request ACS when Xen-SWIOTLB is activated. It used to be done in the Xen startup code, but that is not really appropriate. [v2: Update Kconfig with PCI requirement] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/pci-swiotlb-xen.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index a013ec9d0c54..be4d80a6fae9 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -1,6 +1,7 @@ /* Glue code to lib/swiotlb-xen.c */ #include +#include #include #include @@ -54,5 +55,8 @@ void __init pci_xen_swiotlb_init(void) if (xen_swiotlb) { xen_swiotlb_init(1); dma_ops = &xen_swiotlb_dma_ops; + + /* Make sure ACS will be enabled */ + pci_request_acs(); } } -- cgit v1.2.1 From 76fac077db6b34e2c6383a7b4f3f4f7b7d06d8ce Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Mon, 11 Oct 2010 14:37:08 -0700 Subject: x86, kexec: Make sure to stop all CPUs before exiting the kernel x86 smp_ops now has a new op, stop_other_cpus, which takes a parameter "wait"; this allows the caller to specify whether it wants to wait until all the cpus have processed the stop IPI.
This is required specifically for the kexec case where we should wait for all the cpus to be stopped before starting the new kernel. We now wait for the cpus to stop in all cases except for panic/kdump where we expect things to be broken and we are doing our best to make things work anyway. This patch fixes a legitimate regression, which was introduced during 2.6.30, by commit id 4ef702c10b5df18ab04921fc252c26421d4d6c75. Signed-off-by: Alok N Kataria LKML-Reference: <1286833028.1372.20.camel@ank32.eng.vmware.com> Cc: Eric W. Biederman Cc: Jeremy Fitzhardinge Cc: v2.6.30-36 Signed-off-by: H. Peter Anvin --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/smp.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7d46c8441418..44f80861382f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1018,7 +1018,7 @@ static void xen_reboot(int reason) struct sched_shutdown r = { .reason = reason }; #ifdef CONFIG_SMP - smp_send_stop(); + stop_other_cpus(); #endif if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 25f232b18a82..f4d010031465 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -400,9 +400,9 @@ static void stop_self(void *v) BUG(); } -static void xen_smp_send_stop(void) +static void xen_stop_other_cpus(int wait) { - smp_call_function(stop_self, NULL, 0); + smp_call_function(stop_self, NULL, wait); } static void xen_smp_send_reschedule(int cpu) @@ -470,7 +470,7 @@ static const struct smp_ops xen_smp_ops __initdata = { .cpu_disable = xen_cpu_disable, .play_dead = xen_play_dead, - .smp_send_stop = xen_smp_send_stop, + .stop_other_cpus = xen_stop_other_cpus, .smp_send_reschedule = xen_smp_send_reschedule, .send_call_func_ipi = xen_smp_send_call_function_ipi, -- cgit v1.2.1 From a171ce6e7b4d967b9f9b8ba7c076a8a6d26e432b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 15:04:48 -0700 Subject: xen: dynamically allocate p2m space Use early brk mechanism to allocate p2m tables, to save memory when booting non-Xen. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406af..ecbdcf0d45d4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -174,18 +174,16 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) /* Placeholder for holes in the address space */ -static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = - { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); /* Array of pointers to pages containing p2m entries */ -static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = - { [ 0 ... 
TOP_ENTRIES - 1] = &p2m_missing[0] }; +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, TOP_ENTRIES); /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, TOP_ENTRIES); -static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] - __page_aligned_bss; +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, + (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); static inline unsigned p2m_top_index(unsigned long pfn) { @@ -209,7 +207,7 @@ void xen_build_mfn_list_list(void) p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { + for (idx = 0; idx < TOP_ENTRIES/P2M_ENTRIES_PER_PAGE; idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -230,6 +228,22 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; + unsigned i; + + p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, + PAGE_SIZE); + for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) + p2m_missing[i] = ~0UL; + + p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES, + PAGE_SIZE); + for (i = 0; i < TOP_ENTRIES; i++) + p2m_top[i] = p2m_missing; + + p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES, PAGE_SIZE); + p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * + (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE), + PAGE_SIZE); for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); -- cgit v1.2.1 From a2e875298729540300a9a0324ee66e3b7883a912 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:08:31 -0700 Subject: xen: allocate p2m size based on actual max size Allocate p2m tables based on the actual runtime maximum pfn rather than the static config-time limit. 
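The brk pattern these two patches rely on looks like this in isolation (a minimal sketch under assumed names; MY_ENTRIES is a placeholder constant): RESERVE_BRK_ARRAY() reserves worst-case space in the image's brk section at link time, extend_brk() carves out what is actually needed during early boot, and unused brk space is released to the heap afterwards.

#define MY_ENTRIES 1024	/* placeholder sizing constant */

/* Link-time worst-case reservation; the name is only a pointer
 * until it is assigned at early boot. */
static RESERVE_BRK_ARRAY(unsigned long, my_table, MY_ENTRIES);

void __init my_table_init(void)
{
	unsigned i;

	/* Early-boot allocation out of the reserved brk space. */
	my_table = extend_brk(sizeof(*my_table) * MY_ENTRIES, PAGE_SIZE);
	for (i = 0; i < MY_ENTRIES; i++)
		my_table[i] = ~0UL;
}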
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ecbdcf0d45d4..151813d97552 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -169,25 +169,27 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) +static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES; -#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) +#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define TOP_ENTRIES(pages) ((pages) / P2M_ENTRIES_PER_PAGE) +#define MAX_TOP_ENTRIES TOP_ENTRIES(MAX_DOMAIN_PAGES) /* Placeholder for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); /* Array of pointers to pages containing p2m entries */ -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, TOP_ENTRIES); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES); /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, TOP_ENTRIES); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, - (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); + (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= MAX_DOMAIN_PAGES); + BUG_ON(pfn >= max_p2m_pfn); return pfn / P2M_ENTRIES_PER_PAGE; } @@ -201,13 +203,15 @@ void xen_build_mfn_list_list(void) { unsigned pfn, idx; - for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { + for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); } - for (idx = 0; idx < TOP_ENTRIES/P2M_ENTRIES_PER_PAGE; idx++) { + for (idx = 0; + idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE; + idx++) { unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } @@ -230,19 +234,22 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned pfn; unsigned i; + max_p2m_pfn = max_pfn; + p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, PAGE_SIZE); for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p2m_missing[i] = ~0UL; - p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES, + p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn), PAGE_SIZE); - for (i = 0; i < TOP_ENTRIES; i++) + for (i = 0; i < TOP_ENTRIES(max_pfn); i++) p2m_top[i] = p2m_missing; - p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES, PAGE_SIZE); + p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn), + PAGE_SIZE); p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * - (TOP_ENTRIES / P2M_ENTRIES_PER_PAGE), + (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE), PAGE_SIZE); for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { @@ -258,7 +265,7 @@ unsigned long get_phys_to_machine(unsigned long pfn) { unsigned topidx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) + if (unlikely(pfn >= max_p2m_pfn)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); @@ -304,7 +311,7 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { unsigned topidx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + if (unlikely(pfn >= 
max_p2m_pfn)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } -- cgit v1.2.1 From f0991802bb4368e33848e7f823caa487d23555fb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:16:28 -0700 Subject: xen: use early_brk for level2_kernel_pgt Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 151813d97552..71c6af6c89a5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1843,13 +1843,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, return pgd; } #else /* !CONFIG_X86_64 */ -static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; +static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + xen_start_info->nr_pt_frames * PAGE_SIZE + 512*1024); -- cgit v1.2.1 From 764f0138b9f54aa96761810055a74fce1e58c300 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 16:23:51 -0700 Subject: xen: allocate level1_ident_pgt Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 71c6af6c89a5..3de42d1e475b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -138,7 +138,8 @@ static inline void check_zero(void) * large enough to allocate page table pages to allocate the rest. * Each page can map 2MB. */ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; +#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) +static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); #ifdef CONFIG_X86_64 /* l3 pud for userspace vsyscall mapping */ @@ -1718,6 +1719,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) unsigned ident_pte; unsigned long pfn; + level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, + PAGE_SIZE); + ident_pte = 0; pfn = 0; for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { @@ -1728,7 +1732,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) pte_page = m2v(pmd[pmdidx].pmd); else { /* Check for free pte pages */ - if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + if (ident_pte == LEVEL1_IDENT_ENTRIES) break; pte_page = &level1_ident_pgt[ident_pte]; -- cgit v1.2.1 From 1e17fc7eff56d23a835d5d33e71d813aa9eb8ecc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 3 Sep 2010 15:04:08 -0700 Subject: xen: remove noise about registering vcpu info Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7d46c8441418..ee304b52d8b7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -135,9 +135,6 @@ static void xen_vcpu_setup(int cpu) info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); - printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", - cpu, vcpup, info.mfn, info.offset); - /* Check to see if the hypervisor will put the vcpu_info structure where we want it, which allows direct access via a percpu-variable. 
*/ @@ -151,9 +148,6 @@ static void xen_vcpu_setup(int cpu) /* This cpu is using the registered vcpu info, even if later ones fail to. */ per_cpu(xen_vcpu, cpu) = vcpup; - - printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", - cpu, vcpup); } } @@ -873,8 +867,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ if (have_vcpu_info_placement) { - printk(KERN_INFO "Xen: using vcpu_info placement\n"); - pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); -- cgit v1.2.1 From b7eb4ad39134ee5b09634a710e50c2990f533231 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:06:58 -0700 Subject: xen: set shared_info->arch.max_pfn to max_p2m_pfn Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3de42d1e475b..909ad637c38d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -224,7 +224,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; + HYPERVISOR_shared_info->arch.max_pfn = max_p2m_mfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ -- cgit v1.2.1 From 1f2d9dd309feb08fdbc711fa03841650dfff87d8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:11:35 -0700 Subject: xen: set the actual extent of the mfn_list_list Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 909ad637c38d..fcff8c829799 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -224,7 +224,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = max_p2m_mfn; + HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ -- cgit v1.2.1 From bbbf61eff92c7c236f57ee1953ad84055443717e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Aug 2010 17:12:17 -0700 Subject: xen: make install_p2mtop_page() static Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 ++-- arch/x86/xen/mmu.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index fcff8c829799..00969099b057 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -275,8 +275,8 @@ unsigned long get_phys_to_machine(unsigned long pfn) } EXPORT_SYMBOL_GPL(get_phys_to_machine); -/* install a new p2m_top page */ -bool install_p2mtop_page(unsigned long pfn, unsigned long *p) +/* install a new p2m_top page */ +static bool install_p2mtop_page(unsigned long pfn, unsigned long *p) { unsigned topidx = p2m_top_index(pfn); unsigned long **pfnp, *mfnp; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index fa938c4aa2f7..537bb9aab777 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -12,7 +12,6 @@ enum pt_level { bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -bool install_p2mtop_page(unsigned long pfn, unsigned long *p); void 
set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -- cgit v1.2.1 From 58e05027b530ff081ecea68e38de8d59db8f87e0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 13:28:48 -0700 Subject: xen: convert p2m to a 3 level tree Make the p2m structure a 3 level tree which covers the full possible physical space. The p2m structure contains mappings from the domain's pfns to system-wide mfns. The structure has 3 levels and two roots. The first root is for the domain's own use, and is linked with virtual addresses. The second is all mfn references, and is used by Xen on save/restore to allow it to update the p2m mapping for the domain. At boot, the domain builder provides a simple flat p2m array for all the initially present pages. We construct the two levels above that using the early_brk allocator. After early boot time, set_phys_to_machine() will allocate any missing levels using the normal kernel allocator (at GFP_KERNEL, so it must be called in a normal blocking context). Because the early_brk() API requires us to pre-reserve the maximum amount of memory we could allocate, there is still a CONFIG_XEN_MAX_DOMAIN_MEMORY config option, but its only negative side-effect is to increase the kernel's apparent bss size. However, since all unused brk memory is returned to the heap, there's no real downside to making it large. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/Kconfig | 11 +- arch/x86/xen/mmu.c | 318 +++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 246 insertions(+), 83 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 68128a1b401a..90a7f5ad6916 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -19,15 +19,12 @@ config XEN_PVHVM depends on X86_LOCAL_APIC config XEN_MAX_DOMAIN_MEMORY - int "Maximum allowed size of a domain in gigabytes" - default 8 if X86_32 - default 32 if X86_64 + int + default 128 depends on XEN help - The pseudo-physical to machine address array is sized - according to the maximum possible memory size of a Xen - domain. This array uses 1 page per gigabyte, so there's no - need to be too stingy here. + This only affects the sizing of some bss arrays, the unused + portions of which are freed. config XEN_SAVE_RESTORE bool diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 00969099b057..d4c7265cf0a0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -170,51 +170,162 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) -static unsigned long max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES; +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. 
+ */ -#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define TOP_ENTRIES(pages) ((pages) / P2M_ENTRIES_PER_PAGE) -#define MAX_TOP_ENTRIES TOP_ENTRIES(MAX_DOMAIN_PAGES) +static unsigned long max_p2m_pfn __read_mostly; -/* Placeholder for holes in the address space */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_ENTRIES_PER_PAGE); +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - /* Array of pointers to pages containing p2m entries */ -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top, MAX_TOP_ENTRIES); +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) -/* Arrays of p2m arrays expressed in mfns used for save/restore */ -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, MAX_TOP_ENTRIES); +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn_list, - (MAX_TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)); +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= max_p2m_pfn); - return pfn / P2M_ENTRIES_PER_PAGE; + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; } static inline unsigned p2m_index(unsigned long pfn) { - return pfn % P2M_ENTRIES_PER_PAGE; + return pfn % P2M_PER_PAGE; } -/* Build the parallel p2m_top_mfn structures */ +static void p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_mid_init(unsigned long **mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = p2m_missing; +} + +static void p2m_mid_mfn_init(unsigned long *mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(p2m_missing); +} + +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should already be completely allocated.
+ */ void xen_build_mfn_list_list(void) { - unsigned pfn, idx; + unsigned pfn, i; - for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_ENTRIES_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn); - p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); } - for (idx = 0; - idx < TOP_ENTRIES(max_p2m_pfn)/P2M_ENTRIES_PER_PAGE; - idx++) { - unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; - p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); + for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + unsigned long mid_mfn; + unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + + /* Don't bother allocating any mfn mid levels if + they're just missing */ + if (mid[mididx] == p2m_missing) + continue; + + mid_mfn = p2m_top_mfn[topidx]; + mid_mfn_p = mfn_to_virt(mid_mfn); + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. + */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + mid_mfn = virt_to_mfn(mid_mfn_p); + + p2m_top_mfn[topidx] = mid_mfn; + } + + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } @@ -223,7 +334,7 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn_list); + virt_to_mfn(p2m_top_mfn); HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; } @@ -233,99 +344,154 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - unsigned i; max_p2m_pfn = max_pfn; - p2m_missing = extend_brk(sizeof(*p2m_missing) * P2M_ENTRIES_PER_PAGE, - PAGE_SIZE); - for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) - p2m_missing[i] = ~0UL; + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); - p2m_top = extend_brk(sizeof(*p2m_top) * TOP_ENTRIES(max_pfn), - PAGE_SIZE); - for (i = 0; i < TOP_ENTRIES(max_pfn); i++) - p2m_top[i] = p2m_missing; + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing); - p2m_top_mfn = extend_brk(sizeof(*p2m_top_mfn) * TOP_ENTRIES(max_pfn), - PAGE_SIZE); - p2m_top_mfn_list = extend_brk(sizeof(*p2m_top_mfn_list) * - (TOP_ENTRIES(max_pfn) / P2M_ENTRIES_PER_PAGE), - PAGE_SIZE); + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); - for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all the pages initially given to us, so we just + * need to graft that into our tree structure. 
+ */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid); - p2m_top[topidx] = &mfn_list[pfn]; + p2m_top[topidx] = mid; + } + + p2m_top[topidx][mididx] = &mfn_list[pfn]; } + /* Allocate and initialize top and mid mfn levels */ xen_build_mfn_list_list(); } unsigned long get_phys_to_machine(unsigned long pfn) { - unsigned topidx, idx; + unsigned topidx, mididx, idx; - if (unlikely(pfn >= max_p2m_pfn)) + if (unlikely(pfn >= MAX_P2M_PFN)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - return p2m_top[topidx][idx]; + + return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); -/* install a new p2m_top page */ -static bool install_p2mtop_page(unsigned long pfn, unsigned long *p) +static void *alloc_p2m_page(void) { - unsigned topidx = p2m_top_index(pfn); - unsigned long **pfnp, *mfnp; - unsigned i; + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} - pfnp = &p2m_top[topidx]; - mfnp = &p2m_top_mfn[topidx]; +static void free_p2m_page(void *p) +{ + free_page((unsigned long)p); +} - for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) - p[i] = INVALID_P2M_ENTRY; +/* + * Fully allocate the p2m structure for a given pfn. We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. + */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; - if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { - *mfnp = virt_to_mfn(p); - return true; + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); } - return false; -} + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = mfn_to_virt(*top_mfn_p); -static void alloc_p2m(unsigned long pfn) -{ - unsigned long *p; + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + } - p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); - BUG_ON(p == NULL); + if (p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; - if (!install_p2mtop_page(pfn, p)) - free_page((unsigned long)p); + p2m = alloc_p2m_page(); + if (!p2m) + return false; + + p2m_init(p2m); + + if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } + + return true; } /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - unsigned topidx, idx; + unsigned topidx, mididx, 
idx; - if (unlikely(pfn >= max_p2m_pfn)) { + if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_missing) { - if (mfn == INVALID_P2M_ENTRY) - return true; - return false; - } - + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - p2m_top[topidx][idx] = mfn; + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; return true; } @@ -338,7 +504,7 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - alloc_p2m(pfn); + WARN(!alloc_p2m(pfn), "Can't allocate p2m for %lx, %lx", pfn, mfn); if (!__set_phys_to_machine(pfn, mfn)) BUG(); -- cgit v1.2.1 From c3798062f100c3e1d4ae1241bc536f3b1f28a6ca Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 13:42:04 -0700 Subject: xen: add return value to set_phys_to_machine() set_phys_to_machine() can return false on failure, which means a memory allocation failure for the p2m structure. It can only fail if setting the mfn for a pfn in previously unused address space. It is guaranteed to succeed if you're setting a mapping to INVALID_P2M_ENTRY or updating the mfn for an existing pfn. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d4c7265cf0a0..b96513437236 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -282,7 +282,7 @@ static void p2m_init(unsigned long *p2m) */ void xen_build_mfn_list_list(void) { - unsigned pfn, i; + unsigned pfn; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { @@ -496,19 +496,22 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } -void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return; + return true; } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - WARN(!alloc_p2m(pfn), "Can't allocate p2m for %lx, %lx", pfn, mfn); + if (!alloc_p2m(pfn)) + return false; if (!__set_phys_to_machine(pfn, mfn)) - BUG(); + return false; } + + return true; } unsigned long arbitrary_virt_to_mfn(void *vaddr) -- cgit v1.2.1 From 33a847502b0338351cebd8fc0c68ac796cfadbbd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Aug 2010 15:18:19 -0700 Subject: xen: defer building p2m mfn structures until kernel is mapped When building mfn parts of p2m structure, we rely on being able to use mfn_to_virt, which in turn requires kernel to be mapped into the linear area (which is distinct from the kernel image mapping on 64-bit). 
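The constraint is visible in how the mfn conversion helpers are defined (simplified from asm/xen/page.h of this era); the final __va() step is what presumes the kernel linear mapping:

/* mfn_to_virt() resolves the mfn to a pfn through the M2P table and
 * then takes __va(), which only works once the linear map exists. */
#define mfn_to_virt(m)	(__va(mfn_to_pfn(m) << PAGE_SHIFT))
#define virt_to_mfn(v)	(pfn_to_mfn(virt_to_pfn(v)))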
Defer calling xen_build_mfn_list_list() until after xen_setup_kernel_pagetable(); Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 3 +++ arch/x86/xen/mmu.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ee304b52d8b7..d8873014b5ed 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1178,6 +1178,9 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + init_mm.pgd = pgd; /* keep using Xen gdt for now; no urgent need to change it */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b96513437236..9b43bb398d37 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -374,9 +374,6 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top[topidx][mididx] = &mfn_list[pfn]; } - - /* Allocate and initialize top and mid mfn levels */ - xen_build_mfn_list_list(); } unsigned long get_phys_to_machine(unsigned long pfn) -- cgit v1.2.1 From cfd8951e082a589637f9de3c33efd3218fdb3c03 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 31 Aug 2010 14:06:22 -0700 Subject: xen: don't map missing memory When setting up a pte for a missing pfn (no matching mfn), just create an empty pte rather than a junk mapping. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9b43bb398d37..4c63b7f452dd 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -745,7 +745,20 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; - val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + unsigned long mfn = pfn_to_mfn(pfn); + + /* + * If there's no mfn for the pfn, then just create an + * empty non-present pte. Unfortunately this loses + * information about the original pfn, so + * pte_mfn_to_pfn is asymmetric. + */ + if (unlikely(mfn == INVALID_P2M_ENTRY)) { + mfn = 0; + flags = 0; + } + + val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } return val; -- cgit v1.2.1 From 35ae11fd146384d222f3bb1f17eed1970cc92c36 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 6 Feb 2009 19:09:48 -0800 Subject: xen: Use host-provided E820 map Rather than simply using a flat memory map from Xen, use its provided E820 map. This allows the domain builder to tell the domain to reserve space for more pages than those initially provided at domain-build time. It also allows the host to specify holes in the address space (for PCI-passthrough, for example). 
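For reference, each entry of the map[] array that the XENMEM_memory_map hypercall fills in below uses the standard e820 layout (as defined in asm/e820.h at the time):

struct e820entry {
	__u64 addr;	/* start of memory segment */
	__u64 size;	/* size of memory segment */
	__u32 type;	/* type of memory segment, e.g. E820_RAM */
} __attribute__((packed));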
Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 328b00305426..dd2eb2a9303f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -107,13 +108,46 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, char * __init xen_memory_setup(void) { + static struct e820entry map[E820MAX] __initdata; + unsigned long max_pfn = xen_start_info->nr_pages; + unsigned long long mem_end; + int rc; + struct xen_memory_map memmap; + int i; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + mem_end = PFN_PHYS(max_pfn); + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc == -ENOSYS) { + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = mem_end; + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); e820.nr_map = 0; - - e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); + for (i = 0; i < memmap.nr_entries; i++) { + unsigned long long end = map[i].addr + map[i].size; + if (map[i].type == E820_RAM) { + if (map[i].addr > mem_end) + continue; + if (end > mem_end) { + /* Truncate region to max_mem. */ + map[i].size -= end - mem_end; + } + } + if (map[i].size > 0) + e820_add_region(map[i].addr, map[i].size, map[i].type); + } /* * Even though this is normal, usable memory under Xen, reserve -- cgit v1.2.1 From 42ee1471e9b879479a15debac752314a596c738e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 30 Aug 2010 16:41:02 -0700 Subject: xen: implement "extra" memory to reserve space for pages not present at boot When using the e820 map to get the initial pseudo-physical address space, look for either Xen-provided memory which doesn't lie within an E820 region, or an E820 RAM region which extends beyond the Xen-provided memory range. Count these pages, and add them to a new "extra memory" range. This range has an E820 RAM range to describe it - so the kernel will allocate page structures for it - but it is also marked reserved so that the kernel will not attempt to use it. The balloon driver can then add this range as a set of currently ballooned-out pages, which can be used to extend the domain beyond its original size. 
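The page accounting is plain arithmetic; a standalone illustration with made-up numbers (PAGE_SHIFT hard-coded to 12 here, where the kernel would use PFN_DOWN()):

    #include <stdio.h>

    #define PAGE_SHIFT 12ULL
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)

    int main(void)
    {
        /* Hypothetical numbers: the domain was built with 1000 pages,
         * but the host E820 describes a RAM region of 1256 pages. */
        unsigned long long mem_end = 1000 * PAGE_SIZE;
        unsigned long long region_end = 1256 * PAGE_SIZE;
        unsigned long long extra_pages = 0;

        if (region_end > mem_end)
            extra_pages += (region_end - mem_end) >> PAGE_SHIFT;

        /* The extra range starts at mem_end: it is published as RAM so
         * page structures get allocated, then reserved so nothing uses
         * it until the balloon driver populates it. */
        printf("extra region [%#llx, %#llx), %llu pages\n",
               mem_end, mem_end + extra_pages * PAGE_SIZE, extra_pages);
        return 0;
    }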
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dd2eb2a9303f..f9a99eaddcdc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -34,6 +34,26 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); +/* Amount of extra memory space we add to the e820 ranges */ +phys_addr_t xen_extra_mem_start, xen_extra_mem_size; + +static __init void xen_add_extra_mem(unsigned long pages) +{ + u64 size = (u64)pages * PAGE_SIZE; + + if (!pages) + return; + + e820_add_region(xen_extra_mem_start + xen_extra_mem_size, size, E820_RAM); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + + reserve_early(xen_extra_mem_start + xen_extra_mem_size, + xen_extra_mem_start + xen_extra_mem_size + size, + "XEN EXTRA"); + + xen_extra_mem_size += size; +} + static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) { @@ -105,7 +125,6 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ - char * __init xen_memory_setup(void) { static struct e820entry map[E820MAX] __initdata; @@ -114,6 +133,7 @@ char * __init xen_memory_setup(void) unsigned long long mem_end; int rc; struct xen_memory_map memmap; + unsigned long extra_pages = 0; int i; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); @@ -135,6 +155,7 @@ char * __init xen_memory_setup(void) BUG_ON(rc); e820.nr_map = 0; + xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end = map[i].addr + map[i].size; if (map[i].type == E820_RAM) { @@ -143,6 +164,8 @@ char * __init xen_memory_setup(void) if (end > mem_end) { /* Truncate region to max_mem. */ map[i].size -= end - mem_end; + + extra_pages += PFN_DOWN(end - mem_end); } } if (map[i].size > 0) @@ -169,7 +192,9 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - xen_return_unused_memory(xen_start_info->nr_pages, &e820); + extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); + + xen_add_extra_mem(extra_pages); return "Xen"; } -- cgit v1.2.1 From 36bc251b87f88147e9d8346e4b431f42353c3d38 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Sep 2010 17:07:03 -0700 Subject: xen: make sure xen_extra_mem_start is beyond all non-RAM e820 If Xen gives us non-RAM E820 entries (dom0 only, typically), then make sure the extra RAM region is beyond them. It's OK for the extra space to grow into E820 regions, however. 
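A minimal sketch of that placement rule, with simplified stand-ins for the kernel's e820 types:

    struct region { unsigned long long addr, size; int is_ram; };

    /* Start the extra area at mem_end, then push it past the end of
     * every non-RAM region the host reports, so it never overlaps a
     * reserved or MMIO hole. */
    static unsigned long long extra_area_start(const struct region *map,
                                               int n,
                                               unsigned long long mem_end)
    {
        unsigned long long start = mem_end;
        int i;

        for (i = 0; i < n; i++) {
            unsigned long long end = map[i].addr + map[i].size;

            if (!map[i].is_ram && end > start)
                start = end;
        }
        return start;
    }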
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f9a99eaddcdc..eac010008cd2 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -167,7 +167,8 @@ char * __init xen_memory_setup(void) extra_pages += PFN_DOWN(end - mem_end); } - } + } else if (map[i].type != E820_RAM) + xen_extra_mem_start = end; if (map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } -- cgit v1.2.1 From b5b43ced7a6e79d30df3232b37dc82c5d8dfa843 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Sep 2010 17:10:12 -0700 Subject: xen: add extra pages for E820 RAM regions, even if beyond mem_end If an entire E820 RAM region is beyond mem_end, still add its pages to the extra area so that space can be used by the kernel. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index eac010008cd2..1e85e26efa69 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -158,9 +158,8 @@ char * __init xen_memory_setup(void) xen_extra_mem_start = mem_end; for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end = map[i].addr + map[i].size; + if (map[i].type == E820_RAM) { - if (map[i].addr > mem_end) - continue; if (end > mem_end) { /* Truncate region to max_mem. */ map[i].size -= end - mem_end; @@ -169,7 +168,9 @@ char * __init xen_memory_setup(void) } } else if (map[i].type != E820_RAM) xen_extra_mem_start = end; - if (map[i].size > 0) + + if ((map[i].type != E820_RAM || map[i].addr < mem_end) && + map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); } -- cgit v1.2.1 From 698bb8d14a5b577b6841acaccdf5095d3b7c7389 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 14 Sep 2010 10:19:14 -0700 Subject: xen: limit extra memory to a certain ratio of base If extra memory is very much larger than the base memory size then all of the base memory can be filled with structures reserved to describe the extra memory, leaving no space for anything else. Even at the maximum ratio there will be little space for anything else, but this change is intended to at least allow the system to boot rather than crash mysteriously. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1e85e26efa69..6c9039e92f81 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -37,6 +37,18 @@ extern void xen_syscall32_target(void); /* Amount of extra memory space we add to the e820 ranges */ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. 
+ */ +#define EXTRA_MEM_RATIO (10) + static __init void xen_add_extra_mem(unsigned long pages) { u64 size = (u64)pages * PAGE_SIZE; @@ -134,6 +146,7 @@ char * __init xen_memory_setup(void) int rc; struct xen_memory_map memmap; unsigned long extra_pages = 0; + unsigned long extra_limit; int i; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); @@ -196,6 +209,25 @@ char * __init xen_memory_setup(void) extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); + /* + * Clamp the amount of extra memory to a EXTRA_MEM_RATIO + * factor the base size. On non-highmem systems, the base + * size is the full initial memory allocation; on highmem it + * is limited to the max size of lowmem, so that it doesn't + * get completely filled. + * + * In principle there could be a problem in lowmem systems if + * the initial memory is also very large with respect to + * lowmem, but we won't try to deal with that here. + */ + extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + max_pfn + extra_pages); + + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + xen_add_extra_mem(extra_pages); return "Xen"; -- cgit v1.2.1 From 2f7acb208523a3bf5f1830f01c29f7feda045169 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 15 Sep 2010 13:32:49 -0700 Subject: xen: make sure xen_max_p2m_pfn is up to date Keep xen_max_p2m_pfn up to date with the end of the extra memory we're adding. It is possible that it will be too high since memory may be truncated by a "mem=" option on the kernel command line, but that won't matter. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 8 ++++---- arch/x86/xen/setup.c | 2 ++ arch/x86/xen/xen-ops.h | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4c63b7f452dd..b2371671b11c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -195,7 +195,7 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ * 512 and 1024 entries respectively. 
*/ -static unsigned long max_p2m_pfn __read_mostly; +unsigned long xen_max_p2m_pfn __read_mostly; #define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) @@ -293,7 +293,7 @@ void xen_build_mfn_list_list(void) p2m_top_mfn_init(p2m_top_mfn); } - for (pfn = 0; pfn < max_p2m_pfn; pfn += P2M_PER_PAGE) { + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); unsigned mididx = p2m_mid_index(pfn); unsigned long **mid; @@ -335,7 +335,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn); - HYPERVISOR_shared_info->arch.max_pfn = max_p2m_pfn; + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -345,7 +345,7 @@ void __init xen_build_dynamic_phys_to_machine(void) unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; - max_p2m_pfn = max_pfn; + xen_max_p2m_pfn = max_pfn; p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_init(p2m_missing); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 6c9039e92f81..cad2fcd130ec 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -64,6 +64,8 @@ static __init void xen_add_extra_mem(unsigned long pages) "XEN EXTRA"); xen_extra_mem_size += size; + + xen_max_p2m_pfn = PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size); } static unsigned long __init xen_release_chunk(phys_addr_t start_addr, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 7c8ab86163e9..d505e98e03f8 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,6 +30,7 @@ void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); +extern unsigned long xen_max_p2m_pfn; char * __init xen_memory_setup(void); void __init xen_arch_setup(void); -- cgit v1.2.1 From 41f2e4771a4f1ba26c35438daf32917b9ef7858d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 30 Mar 2010 11:47:40 -0700 Subject: xen: add support for PAT Convert Linux PAT entries into Xen ones when constructing ptes. Linux doesn't use _PAGE_PAT for ptes, so the only difference in the first 4 entries is that Linux uses _PAGE_PWT for WC, whereas Xen (and default) use it for WT. xen_pte_val does the inverse conversion. We hard-code assumptions about Linux's current PAT layout, but a warning on the wrmsr to MSR_IA32_CR_PAT should point out any problems. If necessary we could go to a more general table-based conversion between Linux and Xen PAT entries. hugetlbfs poses a problem at the moment, the x86 architecture uses the same flag for _PAGE_PAT and _PAGE_PSE, which changes meaning depending on which pagetable level we're using. At the moment this should be OK so long as nobody tries to do a pte_val on a hugetlbfs pte. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 5 +++++ arch/x86/xen/mmu.c | 53 +++++++++++++++++++++++++++++++++++++++++++++--- arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 57 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d8873014b5ed..b860e576ed0c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -829,6 +829,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) Xen console noise. 
*/ break; + case MSR_IA32_CR_PAT: + if (smp_processor_id() == 0) + xen_set_pat(((u64)high << 32) | low); + break; + default: ret = native_write_msr_safe(msr, low, high); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b2371671b11c..67b41017f7b8 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -780,10 +781,18 @@ static pteval_t iomap_pte(pteval_t val) pteval_t xen_pte_val(pte_t pte) { - if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) - return pte.pte; + pteval_t pteval = pte.pte; - return pte_mfn_to_pfn(pte.pte); + /* If this is a WC pte, convert back from Xen WC to Linux WC */ + if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { + WARN_ON(!pat_enabled); + pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; + } + + if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) + return pteval; + + return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -793,10 +802,48 @@ pgdval_t xen_pgd_val(pgd_t pgd) } PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); +/* + * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 + * are reserved for now, to correspond to the Intel-reserved PAT + * types. + * + * We expect Linux's PAT set as follows: + * + * Idx PTE flags Linux Xen Default + * 0 WB WB WB + * 1 PWT WC WT WT + * 2 PCD UC- UC- UC- + * 3 PCD PWT UC UC UC + * 4 PAT WB WC WB + * 5 PAT PWT WC WP WT + * 6 PAT PCD UC- UC UC- + * 7 PAT PCD PWT UC UC UC + */ + +void xen_set_pat(u64 pat) +{ + /* We expect Linux to use a PAT setting of + * UC UC- WC WB (ignoring the PAT flag) */ + WARN_ON(pat != 0x0007010600070106ull); +} + pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); + /* If Linux is trying to set a WC pte, then map to the Xen WC. + * If _PAGE_PAT is set, then it probably means it is really + * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope + * things work out OK... + * + * (We should never see kernel mappings with _PAGE_PSE set, + * but we could see hugetlbfs mappings, I think.). + */ + if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { + if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) + pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; + } + /* * Unprivileged domains are allowed to do IOMAPpings for * PCI passthrough, but not map ISA space. The ISA diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index d505e98e03f8..64044747348e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -32,6 +32,8 @@ void xen_ident_map_ISA(void); void xen_reserve_top(void); extern unsigned long xen_max_p2m_pfn; +void xen_set_pat(u64); + char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); -- cgit v1.2.1 From 3654581e47adc07072aebe239818485b68ea04f0 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 29 Sep 2010 16:54:33 -0700 Subject: xen: don't add extra_pages for RAM after mem_end If an E820 region is entirely beyond mem_end, don't attempt to truncate it and add the truncated pages to extra_pages, as they will be negative. Also, make sure the extra memory region starts after all BIOS provided E820 regions (and in the case of RAM regions, post-clipping). 
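The corrected straddle test can be sketched as a standalone helper (simplified types, PAGE_SHIFT hard-coded; not the kernel code itself). Only a region that crosses mem_end is truncated, so a region entirely beyond it can no longer produce a negative delta:

    #define PAGE_SHIFT 12

    static unsigned long long truncate_at(unsigned long long addr,
                                          unsigned long long *size,
                                          unsigned long long mem_end)
    {
        unsigned long long end = addr + *size;

        if (addr < mem_end && end > mem_end) {  /* straddles the boundary */
            unsigned long long delta = end - mem_end;

            *size -= delta;
            return delta >> PAGE_SHIFT;         /* pages moved to the extra area */
        }
        return 0;       /* fully below or fully above: leave untouched */
    }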
Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index cad2fcd130ec..7a4ab05cff8a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -52,20 +52,19 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; static __init void xen_add_extra_mem(unsigned long pages) { u64 size = (u64)pages * PAGE_SIZE; + u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; if (!pages) return; - e820_add_region(xen_extra_mem_start + xen_extra_mem_size, size, E820_RAM); + e820_add_region(extra_start, size, E820_RAM); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - reserve_early(xen_extra_mem_start + xen_extra_mem_size, - xen_extra_mem_start + xen_extra_mem_size + size, - "XEN EXTRA"); + reserve_early(extra_start, extra_start + size, "XEN EXTRA"); xen_extra_mem_size += size; - xen_max_p2m_pfn = PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size); + xen_max_p2m_pfn = PFN_DOWN(extra_start + size); } static unsigned long __init xen_release_chunk(phys_addr_t start_addr, @@ -175,15 +174,21 @@ char * __init xen_memory_setup(void) unsigned long long end = map[i].addr + map[i].size; if (map[i].type == E820_RAM) { - if (end > mem_end) { + if (map[i].addr < mem_end && end > mem_end) { /* Truncate region to max_mem. */ - map[i].size -= end - mem_end; + u64 delta = end - mem_end; - extra_pages += PFN_DOWN(end - mem_end); + map[i].size -= delta; + extra_pages += PFN_DOWN(delta); + + end = mem_end; } - } else if (map[i].type != E820_RAM) + } + + if (end > xen_extra_mem_start) xen_extra_mem_start = end; + /* If region is non-RAM or below mem_end, add what remains */ if ((map[i].type != E820_RAM || map[i].addr < mem_end) && map[i].size > 0) e820_add_region(map[i].addr, map[i].size, map[i].type); -- cgit v1.2.1 From 375b2a9ada6d105483aab22f1af1d727bc3c418d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 21 Oct 2010 11:00:46 +0100 Subject: xen: correctly rebuild mfn list list after migration. Otherwise the second migration attempt fails because the mfn_list_list still refers to all the old mfns. We need to update the entires in both p2m_top_mfn and the mid_mfn pages which p2m_top_mfn refers to. In order to do this we need to keep track of the virtual addresses mapping the p2m_mid_mfn pages since we cannot rely on mfn_to_virt(p2m_top_mfn[idx]) since p2m_top_mfn[idx] will still contain the old MFN after a migration, which may now belong to another domain and hence have a different mapping in the m2p. Therefore add and maintain a third top level page, p2m_top_mfn_p[], which tracks the virtual addresses of the mfns contained in p2m_top_mfn[]. We also need to update the content of the p2m_mid_missing_mfn page on resume to refer to the page's new mfn. p2m_missing does not need updating since the migration process takes care of the leaf p2m pages for us. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 67b41017f7b8..e41683cf290a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -187,6 +187,8 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ * / \ / \ / / * p2m p2m p2m p2m p2m p2m p2m ... * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. 
+ * * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the * maximum representable pseudo-physical address space is: * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages @@ -211,6 +213,7 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); @@ -247,6 +250,14 @@ static void p2m_top_mfn_init(unsigned long *top) top[i] = virt_to_mfn(p2m_mid_missing_mfn); } +static void p2m_top_mfn_p_init(unsigned long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + static void p2m_mid_init(unsigned long **mid) { unsigned i; @@ -283,33 +294,43 @@ static void p2m_init(unsigned long *p2m) */ void xen_build_mfn_list_list(void) { - unsigned pfn; + unsigned long pfn; /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(p2m_mid_missing_mfn); + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); unsigned mididx = p2m_mid_index(pfn); unsigned long **mid; - unsigned long mid_mfn; unsigned long *mid_mfn_p; mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; /* Don't bother allocating any mfn mid levels if - they're just missing */ - if (mid[mididx] == p2m_missing) + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. 
+ */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; continue; - - mid_mfn = p2m_top_mfn[topidx]; - mid_mfn_p = mfn_to_virt(mid_mfn); + } if (mid_mfn_p == p2m_mid_missing_mfn) { /* @@ -321,11 +342,10 @@ void xen_build_mfn_list_list(void) mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(mid_mfn_p); - mid_mfn = virt_to_mfn(mid_mfn_p); - - p2m_top_mfn[topidx] = mid_mfn; + p2m_top_mfn_p[topidx] = mid_mfn_p; } + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } @@ -344,7 +364,7 @@ void __init xen_build_dynamic_phys_to_machine(void) { unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - unsigned pfn; + unsigned long pfn; xen_max_p2m_pfn = max_pfn; @@ -434,7 +454,9 @@ static bool alloc_p2m(unsigned long pfn) } top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = mfn_to_virt(*top_mfn_p); + mid_mfn = p2m_top_mfn_p[topidx]; + + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); if (mid_mfn == p2m_mid_missing_mfn) { /* Separately check the mid mfn level */ @@ -446,11 +468,13 @@ static bool alloc_p2m(unsigned long pfn) return false; p2m_mid_mfn_init(mid_mfn); - + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; } if (p2m_top[topidx][mididx] == p2m_missing) { -- cgit v1.2.1 From 9e9a5fcb04e3af077d1be32710298b852210d93f Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 2 Sep 2010 16:16:00 +0100 Subject: xen: use host E820 map for dom0 When running as initial domain, get the real physical memory map from xen using the XENMEM_machine_memory_map hypercall and use it to setup the e820 regions. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 7a4ab05cff8a..0ce9d58cb29d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -149,6 +149,7 @@ char * __init xen_memory_setup(void) unsigned long extra_pages = 0; unsigned long extra_limit; int i; + int op; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); mem_end = PFN_PHYS(max_pfn); @@ -156,7 +157,10 @@ char * __init xen_memory_setup(void) memmap.nr_entries = E820MAX; set_xen_guest_handle(memmap.buffer, map); - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + op = xen_initial_domain() ? + XENMEM_machine_memory_map : + XENMEM_memory_map; + rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { memmap.nr_entries = 1; map[0].addr = 0ULL; @@ -235,7 +239,8 @@ char * __init xen_memory_setup(void) else extra_pages = 0; - xen_add_extra_mem(extra_pages); + if (!xen_initial_domain()) + xen_add_extra_mem(extra_pages); return "Xen"; } -- cgit v1.2.1 From 6b0661a5e6fbfb159b78a39c0476905aa9b575fe Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 2 Sep 2010 15:47:32 +0100 Subject: xen: introduce XEN_DOM0 as a silent option Add XEN_DOM0 to arch/x86/xen/Kconfig as a silent compile time option that gets enabled when xen and basic x86, acpi and pci support are selected. 
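A silent option never appears in the configuration menus; code simply tests the generated symbol. A trivial standalone illustration (compile with -DCONFIG_XEN_DOM0 to take the first branch):

    #include <stdio.h>

    int main(void)
    {
    #ifdef CONFIG_XEN_DOM0
        puts("built with initial-domain (dom0) support");
    #else
        puts("built as a domU-only kernel");
    #endif
        return 0;
    }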
Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 68128a1b401a..a234b9a71ab4 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -13,6 +13,16 @@ config XEN kernel to boot in a paravirtualized environment under the Xen hypervisor. +config XEN_DOM0 + def_bool y + depends on XEN && PCI_XEN && SWIOTLB_XEN + depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI + +# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST +# name in tools. +config XEN_PRIVILEGED_GUEST + def_bool XEN_DOM0 + config XEN_PVHVM def_bool y depends on XEN -- cgit v1.2.1 From 98511f3532eb7fce274f37d94f29790922799e15 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 3 Sep 2010 14:55:16 +0100 Subject: xen: map a dummy page for local apic and ioapic in xen_set_fixmap Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 42086ac406af..ffc5e24a53ba 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1861,6 +1861,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, } #endif /* CONFIG_X86_64 */ +static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; + static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) { pte_t pte; @@ -1880,9 +1882,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) # endif #else case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: -#endif -#ifdef CONFIG_X86_LOCAL_APIC - case FIX_APIC_BASE: /* maps dummy local APIC */ #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: @@ -1890,6 +1889,22 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) pte = pfn_pte(phys, prot); break; +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + +#ifdef CONFIG_X86_IO_APIC + case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: + /* + * We just don't map the IO APIC - all access is via + * hypercalls. Keep the address in the pte for reference. + */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + case FIX_PARAVIRT_BOOTMAP: /* This is an MFN, but it isn't an IO mapping from the IO domain */ @@ -2027,6 +2042,8 @@ void __init xen_init_mmu_ops(void) pv_mmu_ops = xen_mmu_ops; vmap_lazy_unmap = false; + + memset(dummy_mapping, 0xff, PAGE_SIZE); } /* Protected by xen_reservation_lock. 
*/ -- cgit v1.2.1 From 801fd14a725ef7757d33f07b83415cdd2165e50a Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 23 Sep 2010 12:06:25 +0100 Subject: xen: use vcpu_ops to setup cpu masks Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 25f232b18a82..138676781dd4 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -156,11 +156,16 @@ static void __init xen_fill_possible_map(void) { int i, rc; + num_processors = 0; + disabled_cpus = 0; for (i = 0; i < nr_cpu_ids; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) { num_processors++; set_cpu_possible(i, true); + } else { + set_cpu_possible(i, false); + set_cpu_present(i, false); } } } @@ -190,6 +195,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) if (xen_smp_intr_init(0)) BUG(); + xen_fill_possible_map(); + if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) panic("could not allocate xen_cpu_initialized_map\n"); @@ -480,6 +487,5 @@ static const struct smp_ops xen_smp_ops __initdata = { void __init xen_smp_init(void) { smp_ops = xen_smp_ops; - xen_fill_possible_map(); xen_init_spinlocks(); } -- cgit v1.2.1 From 4ec5387cc36c6472a2ff2c82e9865abe8cab96c2 Mon Sep 17 00:00:00 2001 From: Juan Quintela Date: Thu, 2 Sep 2010 15:45:43 +0100 Subject: xen: add the direct mapping area for ISA bus access add the direct mapping area for ISA bus access when running as initial domain Signed-off-by: Juan Quintela Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 + arch/x86/xen/mmu.c | 24 ++++++++++++++++++++++++ arch/x86/xen/setup.c | 3 +++ 3 files changed, 28 insertions(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1ccfa1bf0f89..9efb00446250 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1186,6 +1186,7 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + xen_ident_map_ISA(); init_mm.pgd = pgd; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ffc5e24a53ba..eed9c7cee4b7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1682,6 +1682,7 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } +/* Set the page permissions on an identity-mapped pages */ static void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; @@ -1929,6 +1930,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } +__init void xen_ident_map_ISA(void) +{ + unsigned long pa; + + /* + * If we're dom0, then linear map the ISA machine addresses into + * the kernel's address space. 
+ */ + if (!xen_initial_domain()) + return; + + xen_raw_printk("Xen: setup ISA identity maps\n"); + + for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) { + pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO); + + if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0)) + BUG(); + } + + xen_flush_tlb(); +} + static __init void xen_post_allocator_init(void) { pv_mmu_ops.set_pte = xen_set_pte; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index c413132702f7..62ceb7864017 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -119,6 +119,9 @@ char * __init xen_memory_setup(void) * Even though this is normal, usable memory under Xen, reserve * ISA memory anyway because too many things think they can poke * about in there. + * + * In a dom0 kernel, this region is identity mapped with the + * hardware ISA area, so it really is out of bounds. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); -- cgit v1.2.1 From ff12849a7a187e17fcbd888b39850d22103395c6 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 28 Sep 2010 16:45:51 +0100 Subject: xen: mask the MTRR feature from the cpuid We don't want Linux to think that the cpu supports MTRRs when running under Xen because MTRR operations could only be performed through hypercalls. Signed-off-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9efb00446250..d48a32b10a3c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -243,6 +243,7 @@ static __init void xen_init_cpuid_mask(void) cpuid_leaf1_edx_mask = ~((1 << X86_FEATURE_MCE) | /* disable MCE */ (1 << X86_FEATURE_MCA) | /* disable MCA */ + (1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ if (!xen_initial_domain()) -- cgit v1.2.1 From 45263cb0993de738e158c625c84a5feb18bed317 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 25 Oct 2010 16:32:29 -0700 Subject: xen: include xen/xen.h for definition of xen_initial_domain() CC arch/x86/xen/setup.o arch/x86/xen/setup.c: In function 'xen_memory_setup': arch/x86/xen/setup.c:161: error: implicit declaration of function 'xen_initial_domain' Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 0ce9d58cb29d..8e2c9f21fa37 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include -- cgit v1.2.1 From ea5b8f73933e34d2b47a65284c46d26d49e7edb9 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 26 Oct 2010 17:28:33 +0100 Subject: xen: initialize cpu masks for pv guests in xen_smp_init Pv guests don't have ACPI and need the cpu masks to be set correctly as early as possible so we call xen_fill_possible_map from xen_smp_init. On the other hand the initial domain supports ACPI so in this case we skip xen_fill_possible_map and rely on it. However Xen might limit the number of cpus usable by the domain, so we filter those masks during smp initialization using the VCPUOP_is_up hypercall. It is important that the filtering is done before xen_setup_vcpu_info_placement. 
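The filtering step amounts to rebuilding the possible map from what the hypervisor reports. A userspace simulation of the loop, where vcpu_is_up() is a made-up stand-in for HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL):

    #include <stdbool.h>

    #define NR_CPUS 8

    /* Pretend the hypervisor limits the domain to four vcpus. */
    static bool vcpu_is_up(int i)
    {
        return i < 4;
    }

    /* Reset and rebuild the possible map from the hypervisor's view,
     * returning the new processor count. */
    static int filter_cpu_map(bool *possible, int nr)
    {
        int i, num = 0;

        for (i = 0; i < nr; i++) {
            possible[i] = vcpu_is_up(i);
            if (possible[i])
                num++;
        }
        return num;
    }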
Signed-off-by: Stefano Stabellini --- arch/x86/xen/smp.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 138676781dd4..834dfeb54e31 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -156,6 +157,25 @@ static void __init xen_fill_possible_map(void) { int i, rc; + if (xen_initial_domain()) + return; + + for (i = 0; i < nr_cpu_ids; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) { + num_processors++; + set_cpu_possible(i, true); + } + } +} + +static void __init xen_filter_cpu_maps(void) +{ + int i, rc; + + if (!xen_initial_domain()) + return; + num_processors = 0; disabled_cpus = 0; for (i = 0; i < nr_cpu_ids; i++) { @@ -179,6 +199,7 @@ static void __init xen_smp_prepare_boot_cpu(void) old memory can be recycled */ make_lowmem_page_readwrite(xen_initial_gdt); + xen_filter_cpu_maps(); xen_setup_vcpu_info_placement(); } @@ -195,8 +216,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) if (xen_smp_intr_init(0)) BUG(); - xen_fill_possible_map(); - if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) panic("could not allocate xen_cpu_initialized_map\n"); @@ -487,5 +506,6 @@ static const struct smp_ops xen_smp_ops __initdata = { void __init xen_smp_init(void) { smp_ops = xen_smp_ops; + xen_fill_possible_map(); xen_init_spinlocks(); } -- cgit v1.2.1 From 61d8e11e519ee7912ab59610fba1aaf08e3c1d84 Mon Sep 17 00:00:00 2001 From: Zimny Lech Date: Wed, 27 Oct 2010 15:34:53 -0700 Subject: Remove duplicate includes from many files Signed-off-by: Zimny Lech Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/xen/enlighten.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 44ab12dc2a12..0cd12db0b142 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include -- cgit v1.2.1 From a2d771c036eb8c040683089ca04c36dfb93a0e60 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 29 Oct 2010 16:56:19 +0100 Subject: xen: correct size of level2_kernel_pgt sizeof(pmd_t *) is 4 bytes on 32-bit PAE leading to an allocation of only 2048 bytes. The correct size is sizeof(pmd_t) giving us a full page allocation. Signed-off-by: Ian Campbell Cc: Jeremy Fitzhardinge Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c237b810b03f..21ed8d7f75a5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2126,7 +2126,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, { pmd_t *kernel_pmd; - level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); + level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + xen_start_info->nr_pt_frames * PAGE_SIZE + -- cgit v1.2.1 From 9ec23a7f6d2537faf14368e066e307c06812c4ca Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 28 Oct 2010 11:32:29 -0700 Subject: xen: do not release any memory under 1M in domain 0 We already deliberately setup a 1-1 P2M for the region up to 1M in order to allow code which assumes this region is already mapped to work without having to convert everything to ioremap. 
Domain 0 should not return any apparently unused memory regions (reserved or otherwise) in this region to Xen since the e820 may not accurately reflect what the BIOS has stashed in this region. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/setup.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86/xen') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b1dbdaa23ecc..769c4b01fa32 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -118,16 +118,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, const struct e820map *e820) { phys_addr_t max_addr = PFN_PHYS(max_pfn); - phys_addr_t last_end = 0; + phys_addr_t last_end = ISA_END_ADDRESS; unsigned long released = 0; int i; + /* Free any unused memory above the low 1Mbyte. */ for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { phys_addr_t end = e820->map[i].addr; end = min(max_addr, end); - released += xen_release_chunk(last_end, end); - last_end = e820->map[i].addr + e820->map[i].size; + if (last_end < end) + released += xen_release_chunk(last_end, end); + last_end = max(last_end, e820->map[i].addr + e820->map[i].size); } if (last_end < max_addr) @@ -164,6 +166,7 @@ char * __init xen_memory_setup(void) XENMEM_memory_map; rc = HYPERVISOR_memory_op(op, &memmap); if (rc == -ENOSYS) { + BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; map[0].addr = 0ULL; map[0].size = mem_end; @@ -201,12 +204,13 @@ char * __init xen_memory_setup(void) } /* - * Even though this is normal, usable memory under Xen, reserve - * ISA memory anyway because too many things think they can poke + * In domU, the ISA region is normal, usable memory, but we + * reserve ISA memory anyway because too many things poke * about in there. * - * In a dom0 kernel, this region is identity mapped with the - * hardware ISA area, so it really is out of bounds. + * In Dom0, the host E820 information can leave gaps in the + * ISA range, which would cause us to release those pages. To + * avoid this, we unconditionally reserve them here. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); -- cgit v1.2.1
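The effect of the 1M floor in the last patch above can be seen in a standalone simulation. Types are simplified, release_chunk() is a hypothetical stand-in for xen_release_chunk(), and the map below is made up; note that the gap between conventional low memory and 1M is left alone while the gap above the last region is still released:

    #include <stdio.h>

    #define ISA_END_ADDRESS 0x100000ULL     /* 1M */

    struct region { unsigned long long addr, size; };

    static unsigned long long release_chunk(unsigned long long s,
                                            unsigned long long e)
    {
        printf("would release [%#llx, %#llx)\n", s, e);
        return (e - s) >> 12;
    }

    static unsigned long long release_unused(const struct region *map, int n,
                                             unsigned long long max_addr)
    {
        unsigned long long last_end = ISA_END_ADDRESS;  /* never go below 1M */
        unsigned long long released = 0;
        int i;

        for (i = 0; i < n && last_end < max_addr; i++) {
            unsigned long long end = map[i].addr;

            if (end > max_addr)
                end = max_addr;
            if (last_end < end)
                released += release_chunk(last_end, end);
            if (map[i].addr + map[i].size > last_end)
                last_end = map[i].addr + map[i].size;
        }
        if (last_end < max_addr)
            released += release_chunk(last_end, max_addr);
        return released;
    }

    int main(void)
    {
        struct region map[] = {
            { 0x0,      0x9f000  },     /* conventional low memory */
            { 0x100000, 0x300000 },     /* 3M of RAM above 1M */
        };

        printf("released %llu pages\n",
               release_unused(map, 2, 0x800000ULL));
        return 0;
    }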