diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/pgtable_types.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/x86_init.h | 12 | ||||
-rw-r--r-- | arch/x86/kernel/x86_init.c | 4 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 24 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 138 |
5 files changed, 54 insertions, 125 deletions
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7db7723d1f32..d56187c6b838 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -299,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); +extern void native_pagetable_reserve(u64 start, u64 end); #ifdef CONFIG_X86_32 extern void native_pagetable_setup_start(pgd_t *base); extern void native_pagetable_setup_done(pgd_t *base); diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 643ebf2e2ad8..d3d859035af9 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -68,6 +68,17 @@ struct x86_init_oem { }; /** + * struct x86_init_mapping - platform specific initial kernel pagetable setup + * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage + * + * For more details on the purpose of this hook, look in + * init_memory_mapping and the commit that added it. + */ +struct x86_init_mapping { + void (*pagetable_reserve)(u64 start, u64 end); +}; + +/** * struct x86_init_paging - platform specific paging functions * @pagetable_setup_start: platform specific pre paging_init() call * @pagetable_setup_done: platform specific post paging_init() call @@ -123,6 +134,7 @@ struct x86_init_ops { struct x86_init_mpparse mpparse; struct x86_init_irqs irqs; struct x86_init_oem oem; + struct x86_init_mapping mapping; struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c11514e9128b..75ef4b18e9b7 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = { .banner = default_banner, }, + .mapping = { + .pagetable_reserve = native_pagetable_reserve, + }, + .paging = { .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 286d289b039b..37b8b0fe8320 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -81,6 +81,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse, end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); } +void __init native_pagetable_reserve(u64 start, u64 end) +{ + memblock_x86_reserve_range(start, end, "PGTABLE"); +} + struct map_range { unsigned long start; unsigned long end; @@ -272,9 +277,24 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, __flush_tlb_all(); + /* + * Reserve the kernel pagetable pages we used (pgt_buf_start - + * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) + * so that they can be reused for other purposes. + * + * On native it just means calling memblock_x86_reserve_range, on Xen it + * also means marking RW the pagetable pages that we allocated before + * but that haven't been used. + * + * In fact on xen we mark RO the whole range pgt_buf_start - + * pgt_buf_top, because we have to make sure that when + * init_memory_mapping reaches the pagetable pages area, it maps + * RO all the pagetable pages, including the ones that are beyond + * pgt_buf_end at that time. + */ if (!after_bootmem && pgt_buf_end > pgt_buf_start) - memblock_x86_reserve_range(pgt_buf_start << PAGE_SHIFT, - pgt_buf_end << PAGE_SHIFT, "PGTABLE"); + x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), + PFN_PHYS(pgt_buf_end)); if (!after_bootmem) early_memtest(start, end); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 55c965b38c27..0684f3c74d53 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1275,6 +1275,20 @@ static __init void xen_pagetable_setup_start(pgd_t *base) { } +static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) +{ + /* reserve the range used */ + native_pagetable_reserve(start, end); + + /* set as RW the rest */ + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end, + PFN_PHYS(pgt_buf_top)); + while (end < PFN_PHYS(pgt_buf_top)) { + make_lowmem_page_readwrite(__va(end)); + end += PAGE_SIZE; + } +} + static void xen_post_allocator_init(void); static __init void xen_pagetable_setup_done(pgd_t *base) @@ -1463,119 +1477,6 @@ static int xen_pgd_alloc(struct mm_struct *mm) return ret; } -#ifdef CONFIG_X86_64 -static __initdata u64 __last_pgt_set_rw = 0; -static __initdata u64 __pgt_buf_start = 0; -static __initdata u64 __pgt_buf_end = 0; -static __initdata u64 __pgt_buf_top = 0; -/* - * As a consequence of the commit: - * - * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e - * Author: Yinghai Lu <yinghai@kernel.org> - * Date: Fri Dec 17 16:58:28 2010 -0800 - * - * x86-64, mm: Put early page table high - * - * at some point init_memory_mapping is going to reach the pagetable pages - * area and map those pages too (mapping them as normal memory that falls - * in the range of addresses passed to init_memory_mapping as argument). - * Some of those pages are already pagetable pages (they are in the range - * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and - * everything is fine. - * Some of these pages are not pagetable pages yet (they fall in the range - * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they - * are going to be mapped RW. When these pages become pagetable pages and - * are hooked into the pagetable, xen will find that the guest has already - * a RW mapping of them somewhere and fail the operation. - * The reason Xen requires pagetables to be RO is that the hypervisor needs - * to verify that the pagetables are valid before using them. The validation - * operations are called "pinning". - * - * In order to fix the issue we mark all the pages in the entire range - * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation - * is completed only the range pgt_buf_start-pgt_buf_end is reserved by - * init_memory_mapping. Hence the kernel is going to crash as soon as one - * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those - * ranges are RO). - * - * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_ - * the init_memory_mapping has completed (in a perfect world we would - * call this function from init_memory_mapping, but lets ignore that). - * - * Because we are called _after_ init_memory_mapping the pgt_buf_[start, - * end,top] have all changed to new values (b/c init_memory_mapping - * is called and setting up another new page-table). Hence, the first time - * we enter this function, we save away the pgt_buf_start value and update - * the pgt_buf_[end,top]. - * - * When we detect that the "old" pgt_buf_start through pgt_buf_end - * PFNs have been reserved (so memblock_x86_reserve_range has been called), - * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. - * - * And then we update those "old" pgt_buf_[end|top] with the new ones - * so that we can redo this on the next pagetable. - */ -static __init void mark_rw_past_pgt(void) { - - if (pgt_buf_end > pgt_buf_start) { - u64 addr, size; - - /* Save it away. */ - if (!__pgt_buf_start) { - __pgt_buf_start = pgt_buf_start; - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - return; - } - /* If we get the range that starts at __pgt_buf_end that means - * the range is reserved, and that in 'init_memory_mapping' - * the 'memblock_x86_reserve_range' has been called with the - * outdated __pgt_buf_start, __pgt_buf_end (the "new" - * pgt_buf_[start|end|top] refer now to a new pagetable. - * Note: we are called _after_ the pgt_buf_[..] have been - * updated.*/ - - addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start), - &size, PAGE_SIZE); - - /* Still not reserved, meaning 'memblock_x86_reserve_range' - * hasn't been called yet. Update the _end and _top.*/ - if (addr == PFN_PHYS(__pgt_buf_start)) { - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - return; - } - - /* OK, the area is reserved, meaning it is time for us to - * set RW for the old end->top PFNs. */ - - /* ..unless we had already done this. */ - if (__pgt_buf_end == __last_pgt_set_rw) - return; - - addr = PFN_PHYS(__pgt_buf_end); - - /* set as RW the rest */ - printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", - PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top)); - - while (addr < PFN_PHYS(__pgt_buf_top)) { - make_lowmem_page_readwrite(__va(addr)); - addr += PAGE_SIZE; - } - /* And update everything so that we are ready for the next - * pagetable (the one created for regions past 4GB) */ - __last_pgt_set_rw = __pgt_buf_end; - __pgt_buf_start = pgt_buf_start; - __pgt_buf_end = pgt_buf_end; - __pgt_buf_top = pgt_buf_top; - } - return; -} -#else -static __init void mark_rw_past_pgt(void) { } -#endif static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) { #ifdef CONFIG_X86_64 @@ -1602,14 +1503,6 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) unsigned long pfn = pte_pfn(pte); /* - * A bit of optimization. We do not need to call the workaround - * when xen_set_pte_init is called with a PTE with 0 as PFN. - * That is b/c the pagetable at that point are just being populated - * with empty values and we can save some cycles by not calling - * the 'memblock' code.*/ - if (pfn) - mark_rw_past_pgt(); - /* * If the new pfn is within the range of the newly allocated * kernel pagetable, and it isn't being mapped into an * early_ioremap fixmap slot as a freshly allocated page, make sure @@ -2118,8 +2011,6 @@ __init void xen_ident_map_ISA(void) static __init void xen_post_allocator_init(void) { - mark_rw_past_pgt(); - #ifdef CONFIG_XEN_DEBUG pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); #endif @@ -2228,6 +2119,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { void __init xen_init_mmu_ops(void) { + x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; |