diff options
48 files changed, 2039 insertions, 608 deletions
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 20371d0635e4..0ae1e77eae50 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -321,6 +321,7 @@ ENTRY(ia32_syscall) /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ /*CFI_REL_OFFSET cs,CS-RIP*/ CFI_REL_OFFSET rip,RIP-RIP + PARAVIRT_ADJUST_EXCEPTION_FRAME SWAPGS /* * No need to follow this irqs on/off section: the syscall diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bacf5deeec2d..aa89387006fe 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -18,6 +18,8 @@ #include <asm/ia32.h> #include <asm/bootparam.h> +#include <xen/interface/xen.h> + #define __NO_STUBS 1 #undef __SYSCALL #undef _ASM_X86_64_UNISTD_H_ @@ -131,5 +133,14 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + + BLANK(); + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#undef ENTRY +#endif return 0; } diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 7c36fb8a28d4..d1692b2a41ff 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + set_cpu_cap(c, X86_FEATURE_SYSCALL32); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 7b8cc72feb40..736f50fa433d 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -16,6 +16,7 @@ #include <asm/i387.h> #include <asm/msr.h> #include <asm/io.h> +#include <asm/linkage.h> #include <asm/mmu_context.h> #include <asm/mtrr.h> #include <asm/mce.h> @@ -316,9 +317,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } - /* Assume all 64-bit CPUs support 32-bit syscall */ - set_cpu_cap(c, X86_FEATURE_SYSCALL32); - if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); @@ -517,8 +515,7 @@ void pda_init(int cpu) } char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] -__attribute__((section(".bss.page_aligned"))); + DEBUG_STKSZ] __page_aligned_bss; extern asmlinkage void ignore_sysret(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ae63e584c340..80d5663db3bc 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1189,6 +1189,7 @@ END(device_not_available) /* runs on exception stack */ KPROBE_ENTRY(debug) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK @@ -1198,6 +1199,7 @@ KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_nmi, 0, 0 @@ -1211,6 +1213,7 @@ KPROBE_END(nmi) KPROBE_ENTRY(int3) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK @@ -1237,6 +1240,7 @@ END(coprocessor_segment_overrun) /* runs on exception stack */ ENTRY(double_fault) XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_double_fault jmp paranoid_exit1 CFI_ENDPROC @@ -1253,6 +1257,7 @@ END(segment_not_present) /* runs on exception stack */ ENTRY(stack_segment) XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_stack_segment jmp paranoid_exit1 CFI_ENDPROC @@ -1278,6 +1283,7 @@ END(spurious_interrupt_bug) /* runs on exception stack */ ENTRY(machine_check) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check @@ -1312,3 +1318,103 @@ KPROBE_ENTRY(ignore_sysret) sysret CFI_ENDPROC ENDPROC(ignore_sysret) + +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + zeroentry xen_do_hypervisor_callback +END(xen_hypervisor_callback) + +/* +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +*/ +ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) + CFI_STARTPROC +/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + see the correct pointer to the pt_regs */ + movq %rdi, %rsp # we don't return, adjust the stack frame + CFI_ENDPROC + CFI_DEFAULT_STACK +11: incl %gs:pda_irqcount + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + cmovzq %gs:pda_irqstackptr,%rsp + pushq %rbp # backlink for old unwinder + call xen_evtchn_do_upcall + popq %rsp + CFI_DEF_CFA_REGISTER rsp + decl %gs:pda_irqcount + jmp error_exit + CFI_ENDPROC +END(do_hypervisor_callback) + +/* +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we do not need to fix up as Xen has already reloaded all segment +# registers that could be reloaded and zeroed the others. +# Category 2 we fix up by killing the current process. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by comparing each saved segment register +# with its current contents: any discrepancy means we in category 1. +*/ +ENTRY(xen_failsafe_callback) + framesz = (RIP-0x30) /* workaround buggy gas */ + _frame framesz + CFI_REL_OFFSET rcx, 0 + CFI_REL_OFFSET r11, 8 + movw %ds,%cx + cmpw %cx,0x10(%rsp) + CFI_REMEMBER_STATE + jne 1f + movw %es,%cx + cmpw %cx,0x18(%rsp) + jne 1f + movw %fs,%cx + cmpw %cx,0x20(%rsp) + jne 1f + movw %gs,%cx + cmpw %cx,0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + pushq %r11 + CFI_ADJUST_CFA_OFFSET 8 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + jmp general_protection + CFI_RESTORE_STATE +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + SAVE_ALL + jmp error_exit + CFI_ENDPROC +END(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c97819829146..1b318e903bf6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; #endif +void __init x86_64_init_pda(void) +{ + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; + pda_init(0); +} + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - _cpu_pda = __cpu_pda; - cpu_pda(0) = &_boot_cpu_pda; - pda_init(0); + x86_64_init_pda(); early_printk("Kernel really alive\n"); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b07ac7b217cb..db3280afe886 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -407,6 +407,7 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 +#include "../../x86/xen/xen-head.S" .section .bss, "aw", @nobits .align L1_CACHE_BYTES diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 47a6f6f12478..1cf8c1fcc088 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -83,11 +83,8 @@ union irq_ctx { static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); - -static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; static void call_on_stack(void *func, void *stack) { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e0f571d58c19..2963ab5d91ee 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -29,6 +29,7 @@ #include <asm/desc.h> #include <asm/setup.h> #include <asm/arch_hooks.h> +#include <asm/pgtable.h> #include <asm/time.h> #include <asm/pgalloc.h> #include <asm/irq.h> @@ -373,6 +374,9 @@ struct pv_mmu_ops pv_mmu_ops = { #ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, +#else + .pagetable_setup_start = paravirt_nop, + .pagetable_setup_done = paravirt_nop, #endif .read_cr2 = native_read_cr2, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a8e53626ac9a..e8a8e1b99817 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; @@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch FS and GS. + * + * Segment register != 0 always requires a reload. Also + * reload when it has changed. When prev process used 64bit + * base always reload to avoid an information leak. */ - { - /* segment register != 0 always requires a reload. - also reload when it has changed. - when prev process used 64bit base always reload - to avoid an information leak. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { - loadsegment(fs, next->fsindex); - /* check if the user used a selector != 0 - * if yes clear 64bit base, since overloaded base - * is always mapped to the Null selector - */ - if (fsindex) + if (unlikely(fsindex | next->fsindex | prev->fs)) { + loadsegment(fs, next->fsindex); + /* + * Check if the user used a selector != 0; if yes + * clear 64bit base, since overloaded base is always + * mapped to the Null selector + */ + if (fsindex) prev->fs = 0; - } - /* when next process has a 64bit base use it */ - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; - - if (unlikely(gsindex | next->gsindex | prev->gs)) { - load_gs_index(next->gsindex); - if (gsindex) + } + /* when next process has a 64bit base use it */ + if (next->fs) + wrmsrl(MSR_FS_BASE, next->fs); + prev->fsindex = fsindex; + + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); + if (gsindex) prev->gs = 0; - } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; } + if (next->gs) + wrmsrl(MSR_KERNEL_GS_BASE, next->gs); + prev->gsindex = gsindex; /* Must be after DS reload */ unlazy_fpu(prev_p); @@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) write_pda(pcurrent, next_p); write_pda(kernelstack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - PDA_STACKOFFSET); #ifdef CONFIG_CC_STACKPROTECTOR write_pda(stack_canary, next_p->stack_canary); /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 531b55b8e81a..c9010f82141d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -824,7 +824,10 @@ void __init setup_arch(char **cmdline_p) vmi_init(); #endif + paravirt_pagetable_setup_start(swapper_pg_dir); paging_init(); + paravirt_pagetable_setup_done(swapper_pg_dir); + paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 map_vsyscall(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 687376ab07e8..1deb3b624a79 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -768,7 +768,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work) * * Must be called after the _cpu_pda pointer table is initialized. */ -static int __cpuinit get_local_pda(int cpu) +int __cpuinit get_local_pda(int cpu) { struct x8664_pda *oldpda, *newpda; unsigned long size = sizeof(struct x8664_pda); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9689a5138e64..7113acd8ac45 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -868,8 +868,6 @@ void __init paging_init(void) */ sparse_init(); zone_sizes_init(); - - paravirt_post_allocator_init(); } /* diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index b7ad9f89d21f..4d6ef0a336d6 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE # Build multiple 32-bit vDSO images to choose from at boot time. # obj-$(VDSO32-y) += vdso32-syms.lds -vdso32.so-$(CONFIG_X86_32) += int80 +vdso32.so-$(VDSO32-y) += int80 vdso32.so-$(CONFIG_COMPAT) += syscall vdso32.so-$(VDSO32-y) += sysenter diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 0bce5429a515..513f330c5832 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) } } -/* - * These symbols are defined by vdso32.S to mark the bounds - * of the ELF DSO images included therein. - */ -extern const char vdso32_default_start, vdso32_default_end; -extern const char vdso32_sysenter_start, vdso32_sysenter_end; static struct page *vdso32_pages[1]; #ifdef CONFIG_X86_64 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) /* May not be __init: called during resume */ void syscall32_cpu_init(void) @@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map) #else /* CONFIG_X86_32 */ #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) +#define vdso32_syscall() (0) void enable_sep_cpu(void) { @@ -296,12 +292,15 @@ int __init sysenter_setup(void) gate_vma_init(); #endif - if (!vdso32_sysenter()) { - vsyscall = &vdso32_default_start; - vsyscall_len = &vdso32_default_end - &vdso32_default_start; - } else { + if (vdso32_syscall()) { + vsyscall = &vdso32_syscall_start; + vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start; + } else if (vdso32_sysenter()){ vsyscall = &vdso32_sysenter_start; vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; + } else { + vsyscall = &vdso32_int80_start; + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start; } memcpy(syscall_page, vsyscall, vsyscall_len); diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S index 1e36f72cab86..2ce5f82c333b 100644 --- a/arch/x86/vdso/vdso32.S +++ b/arch/x86/vdso/vdso32.S @@ -2,14 +2,17 @@ __INITDATA - .globl vdso32_default_start, vdso32_default_end -vdso32_default_start: -#ifdef CONFIG_X86_32 + .globl vdso32_int80_start, vdso32_int80_end +vdso32_int80_start: .incbin "arch/x86/vdso/vdso32-int80.so" -#else +vdso32_int80_end: + + .globl vdso32_syscall_start, vdso32_syscall_end +vdso32_syscall_start: +#ifdef CONFIG_COMPAT .incbin "arch/x86/vdso/vdso32-syscall.so" #endif -vdso32_default_end: +vdso32_syscall_end: .globl vdso32_sysenter_start, vdso32_sysenter_end vdso32_sysenter_start: diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index c2cc99580871..3815e425f470 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,8 +6,8 @@ config XEN bool "Xen guest support" select PARAVIRT select PARAVIRT_CLOCK - depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) + depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER)) + depends on X86_CMPXCHG && X86_TSC help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the @@ -15,10 +15,16 @@ config XEN config XEN_MAX_DOMAIN_MEMORY int "Maximum allowed size of a domain in gigabytes" - default 8 + default 8 if X86_32 + default 32 if X86_64 depends on XEN help The pseudo-physical to machine address array is sized according to the maximum possible memory size of a Xen domain. This array uses 1 page per gigabyte, so there's no - need to be too stingy here.
\ No newline at end of file + need to be too stingy here. + +config XEN_SAVE_RESTORE + bool + depends on PM + default y
\ No newline at end of file diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 2ba2d1649131..59c1e539aed2 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o xen-asm.o grant-table.o suspend.o + time.o xen-asm_$(BITS).o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bb508456ef52..3da6acb7eafc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -33,6 +33,7 @@ #include <xen/interface/sched.h> #include <xen/features.h> #include <xen/page.h> +#include <xen/hvc-console.h> #include <asm/paravirt.h> #include <asm/page.h> @@ -40,12 +41,12 @@ #include <asm/xen/hypervisor.h> #include <asm/fixmap.h> #include <asm/processor.h> +#include <asm/msr-index.h> #include <asm/setup.h> #include <asm/desc.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> -#include <asm/pgalloc.h> #include "xen-ops.h" #include "mmu.h" @@ -57,6 +58,18 @@ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); /* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; + +#ifdef CONFIG_X86_64 +/* l3 pud for userspace vsyscall mapping */ +static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* CONFIG_X86_64 */ + +/* * Note about cr3 (pagetable base) values: * * xen_cr3 contains the current logical cr3 value; it contains the @@ -363,14 +376,6 @@ static void load_TLS_descriptor(struct thread_struct *t, static void xen_load_tls(struct thread_struct *t, unsigned int cpu) { - xen_mc_batch(); - - load_TLS_descriptor(t, cpu, 0); - load_TLS_descriptor(t, cpu, 1); - load_TLS_descriptor(t, cpu, 2); - - xen_mc_issue(PARAVIRT_LAZY_CPU); - /* * XXX sleazy hack: If we're being called in a lazy-cpu zone, * it means we're in a context switch, and %gs has just been @@ -379,10 +384,39 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * Either way, it has been saved, and the new value will get * loaded properly. This will go away as soon as Xen has been * modified to not save/restore %gs for normal hypercalls. + * + * On x86_64, this hack is not used for %gs, because gs points + * to KERNEL_GS_BASE (and uses it for PDA references), so we + * must not zero %gs on x86_64 + * + * For x86_64, we need to zero %fs, otherwise we may get an + * exception between the new %fs descriptor being loaded and + * %fs being effectively cleared at __switch_to(). */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { +#ifdef CONFIG_X86_32 loadsegment(gs, 0); +#else + loadsegment(fs, 0); +#endif + } + + xen_mc_batch(); + + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(PARAVIRT_LAZY_CPU); +} + +#ifdef CONFIG_X86_64 +static void xen_load_gs_index(unsigned int idx) +{ + if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) + BUG(); } +#endif static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) @@ -400,23 +434,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } -static int cvt_gate_to_trap(int vector, u32 low, u32 high, +static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { - u8 type, dpl; - - type = (high >> 8) & 0x1f; - dpl = (high >> 13) & 3; - - if (type != 0xf && type != 0xe) + if (val->type != 0xf && val->type != 0xe) return 0; info->vector = vector; - info->address = (high & 0xffff0000) | (low & 0x0000ffff); - info->cs = low >> 16; - info->flags = dpl; + info->address = gate_offset(*val); + info->cs = gate_segment(*val); + info->flags = val->dpl; /* interrupt gates clear IF */ - if (type == 0xe) + if (val->type == 0xe) info->flags |= 4; return 1; @@ -443,11 +472,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) if (p >= start && (p + 8) <= end) { struct trap_info info[2]; - u32 *desc = (u32 *)g; info[1].address = 0; - if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) + if (cvt_gate_to_trap(entrynum, g, &info[0])) if (HYPERVISOR_set_trap_table(info)) BUG(); } @@ -460,13 +488,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, { unsigned in, out, count; - count = (desc->size+1) / 8; + count = (desc->size+1) / sizeof(gate_desc); BUG_ON(count > 256); for (in = out = 0; in < count; in++) { - const u32 *entry = (u32 *)(desc->address + in * 8); + gate_desc *entry = (gate_desc*)(desc->address) + in; - if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out])) out++; } traps[out].address = 0; @@ -695,33 +723,89 @@ static void set_current_cr3(void *v) x86_write_percpu(xen_current_cr3, (unsigned long)v); } -static void xen_write_cr3(unsigned long cr3) +static void __xen_write_cr3(bool kernel, unsigned long cr3) { struct mmuext_op *op; struct multicall_space mcs; - unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + unsigned long mfn; - BUG_ON(preemptible()); + if (cr3) + mfn = pfn_to_mfn(PFN_DOWN(cr3)); + else + mfn = 0; - mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ + WARN_ON(mfn == 0 && kernel); - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - x86_write_percpu(xen_cr3, cr3); + mcs = __xen_mc_entry(sizeof(*op)); op = mcs.args; - op->cmd = MMUEXT_NEW_BASEPTR; + op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; op->arg1.mfn = mfn; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - /* Update xen_update_cr3 once the batch has actually - been submitted. */ - xen_mc_callback(set_current_cr3, (void *)cr3); + if (kernel) { + x86_write_percpu(xen_cr3, cr3); + + /* Update xen_current_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); + } +} + +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + x86_write_percpu(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); + if (user_pgd) + __xen_write_cr3(false, __pa(user_pgd)); + else + __xen_write_cr3(false, 0); + } +#endif xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ } +static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +{ + int ret; + + ret = 0; + + switch(msr) { +#ifdef CONFIG_X86_64 + unsigned which; + u64 base; + + case MSR_FS_BASE: which = SEGBASE_FS; goto set; + case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; + case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; + + set: + base = ((u64)high << 32) | low; + if (HYPERVISOR_set_segment_base(which, base) != 0) + ret = -EFAULT; + break; +#endif + default: + ret = native_write_msr_safe(msr, low, high); + } + + return ret; +} + /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) @@ -778,6 +862,48 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) xen_alloc_ptpage(mm, pfn, PT_PMD); } +static int xen_pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = mm->pgd; + int ret = 0; + + BUG_ON(PagePinned(virt_to_page(pgd))); + +#ifdef CONFIG_X86_64 + { + struct page *page = virt_to_page(pgd); + pgd_t *user_pgd; + + BUG_ON(page->private != 0); + + ret = -ENOMEM; + + user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + page->private = (unsigned long)user_pgd; + + if (user_pgd != NULL) { + user_pgd[pgd_index(VSYSCALL_START)] = + __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); + ret = 0; + } + + BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); + } +#endif + + return ret; +} + +static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) + free_page((unsigned long)user_pgd); +#endif +} + /* This should never happen until we're OK to use struct page */ static void xen_release_ptpage(u32 pfn, unsigned level) { @@ -803,6 +929,18 @@ static void xen_release_pmd(u32 pfn) xen_release_ptpage(pfn, PT_PMD); } +#if PAGETABLE_LEVELS == 4 +static void xen_alloc_pud(struct mm_struct *mm, u32 pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PUD); +} + +static void xen_release_pud(u32 pfn) +{ + xen_release_ptpage(pfn, PT_PUD); +} +#endif + #ifdef CONFIG_HIGHPTE static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) { @@ -841,68 +979,16 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { - pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; - int i; - - /* special set_pte for pagetable initialization */ - pv_mmu_ops.set_pte = xen_set_pte_init; - - init_mm.pgd = base; - /* - * copy top-level of Xen-supplied pagetable into place. This - * is a stand-in while we copy the pmd pages. - */ - memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); - - make_lowmem_page_readonly(pmd); - - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } - - /* make sure zero_page is mapped RO so we can use it in pagetables */ - make_lowmem_page_readonly(empty_zero_page); - make_lowmem_page_readonly(base); - /* - * Switch to new pagetable. This is done before - * pagetable_init has done anything so that the new pages - * added to the table can be prepared properly for Xen. - */ - xen_write_cr3(__pa(base)); - - /* Unpin initial Xen pagetable */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, - PFN_DOWN(__pa(xen_start_info->pt_base))); } void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); - - /* - * Create a mapping for the shared info page. - * Should be set_fixmap(), but shared_info is a machine - * address with no corresponding pseudo-phys address. - */ - set_pte_mfn(addr, - PFN_DOWN(xen_start_info->shared_info), - PAGE_KERNEL); - - HYPERVISOR_shared_info = (struct shared_info *)addr; + set_fixmap(FIX_PARAVIRT_BOOTMAP, + xen_start_info->shared_info); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); } else HYPERVISOR_shared_info = (struct shared_info *)__va(xen_start_info->shared_info); @@ -917,26 +1003,32 @@ void xen_setup_shared_info(void) static __init void xen_pagetable_setup_done(pgd_t *base) { - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pte = xen_alloc_pte; - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; - pv_mmu_ops.set_pte = xen_set_pte; - xen_setup_shared_info(); - - /* Actually pin the pagetable down, but we can't set PG_pinned - yet because the page structures don't exist yet. */ - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); } static __init void xen_post_allocator_init(void) { + pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.set_pgd = xen_set_pgd; +#endif + + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif xen_mark_init_mm_pinned(); } @@ -950,6 +1042,7 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ +#ifdef CONFIG_X86_32 if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); @@ -959,6 +1052,7 @@ void xen_setup_vcpu_info_placement(void) pv_irq_ops.irq_enable = xen_irq_enable_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } +#endif } static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, @@ -979,10 +1073,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { +#ifdef CONFIG_X86_32 SITE(pv_irq_ops, irq_enable); SITE(pv_irq_ops, irq_disable); SITE(pv_irq_ops, save_fl); SITE(pv_irq_ops, restore_fl); +#endif /* CONFIG_X86_32 */ #undef SITE patch_site: @@ -1025,8 +1121,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) #ifdef CONFIG_X86_F00F_BUG case FIX_F00F_IDT: #endif +#ifdef CONFIG_X86_32 case FIX_WP_TEST: case FIX_VDSO: +# ifdef CONFIG_HIGHMEM + case FIX_KMAP_BEGIN ... FIX_KMAP_END: +# endif +#else + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: +#endif #ifdef CONFIG_X86_LOCAL_APIC case FIX_APIC_BASE: /* maps dummy local APIC */ #endif @@ -1039,6 +1142,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) } __native_set_fixmap(idx, pte); + +#ifdef CONFIG_X86_64 + /* Replicate changes to map the vsyscall page into the user + pagetable vsyscall mapping. */ + if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + unsigned long vaddr = __fix_to_virt(idx); + set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); + } +#endif } static const struct pv_info xen_info __initdata = { @@ -1084,18 +1196,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .write_msr = native_write_msr_safe, + .write_msr = xen_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .iret = xen_iret, .irq_enable_sysexit = xen_sysexit, +#ifdef CONFIG_X86_64 + .usergs_sysret32 = xen_sysret32, + .usergs_sysret64 = xen_sysret64, +#endif .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, .load_gdt = xen_load_gdt, .load_idt = xen_load_idt, .load_tls = xen_load_tls, +#ifdef CONFIG_X86_64 + .load_gs_index = xen_load_gs_index, +#endif .store_gdt = native_store_gdt, .store_idt = native_store_idt, @@ -1109,14 +1228,34 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, + /* Xen takes care of %gs when switching to usermode for us */ + .swapgs = paravirt_nop, + .lazy_mode = { .enter = paravirt_enter_lazy_cpu, .leave = xen_leave_lazy, }, }; +static void __init __xen_init_IRQ(void) +{ +#ifdef CONFIG_X86_64 + int i; + + /* Create identity vector->irq map */ + for(i = 0; i < NR_VECTORS; i++) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[i] = i; + } +#endif /* CONFIG_X86_64 */ + + xen_init_IRQ(); +} + static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = xen_init_IRQ, + .init_IRQ = __xen_init_IRQ, .save_fl = xen_save_fl, .restore_fl = xen_restore_fl, .irq_disable = xen_irq_disable, @@ -1124,7 +1263,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { .safe_halt = xen_safe_halt, .halt = xen_halt, #ifdef CONFIG_X86_64 - .adjust_exception_frame = paravirt_nop, + .adjust_exception_frame = xen_adjust_exception_frame, #endif }; @@ -1157,8 +1296,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .pgd_alloc = __paravirt_pgd_alloc, - .pgd_free = paravirt_nop, + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, @@ -1170,7 +1309,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .kmap_atomic_pte = xen_kmap_atomic_pte, #endif - .set_pte = NULL, /* see xen_pagetable_setup_* */ +#ifdef CONFIG_X86_64 + .set_pte = xen_set_pte, +#else + .set_pte = xen_set_pte_init, +#endif .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, @@ -1184,15 +1327,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pte = xen_make_pte, .make_pgd = xen_make_pgd, +#ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, - .set_pud = xen_set_pud_hyper, .pte_clear = xen_pte_clear, .pmd_clear = xen_pmd_clear, +#endif /* CONFIG_X86_PAE */ + .set_pud = xen_set_pud_hyper, .make_pmd = xen_make_pmd, .pmd_val = xen_pmd_val, +#if PAGETABLE_LEVELS == 4 + .pud_val = xen_pud_val, + .make_pud = xen_make_pud, + .set_pgd = xen_set_pgd_hyper, + + .alloc_pud = xen_alloc_pte_init, + .release_pud = xen_release_pte_init, +#endif /* PAGETABLE_LEVELS == 4 */ + .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, .exit_mmap = xen_exit_mmap, @@ -1205,21 +1359,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_fixmap = xen_set_fixmap, }; -#ifdef CONFIG_SMP -static const struct smp_ops xen_smp_ops __initdata = { - .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, - .smp_prepare_cpus = xen_smp_prepare_cpus, - .cpu_up = xen_cpu_up, - .smp_cpus_done = xen_smp_cpus_done, - - .smp_send_stop = xen_smp_send_stop, - .smp_send_reschedule = xen_smp_send_reschedule, - - .send_call_func_ipi = xen_smp_send_call_function_ipi, - .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, -}; -#endif /* CONFIG_SMP */ - static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; @@ -1264,6 +1403,7 @@ static const struct machine_ops __initdata xen_machine_ops = { static void __init xen_reserve_top(void) { +#ifdef CONFIG_X86_32 unsigned long top = HYPERVISOR_VIRT_START; struct xen_platform_parameters pp; @@ -1271,7 +1411,247 @@ static void __init xen_reserve_top(void) top = pp.virt_start; reserve_top_address(-top + 2 * PAGE_SIZE); +#endif /* CONFIG_X86_32 */ +} + +/* + * Like __va(), but returns address in the kernel mapping (which is + * all we have until the physical memory mapping has been set up. + */ +static void *__ka(phys_addr_t paddr) +{ +#ifdef CONFIG_X86_64 + return (void *)(paddr + __START_KERNEL_map); +#else + return __va(paddr); +#endif +} + +/* Convert a machine address to physical address */ +static unsigned long m2p(phys_addr_t maddr) +{ + phys_addr_t paddr; + + maddr &= PTE_MASK; + paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; + + return paddr; +} + +/* Convert a machine address to kernel virtual */ +static void *m2v(phys_addr_t maddr) +{ + return __ka(m2p(maddr)); +} + +#ifdef CONFIG_X86_64 +static void walk(pgd_t *pgd, unsigned long addr) +{ + unsigned l4idx = pgd_index(addr); + unsigned l3idx = pud_index(addr); + unsigned l2idx = pmd_index(addr); + unsigned l1idx = pte_index(addr); + pgd_t l4; + pud_t l3; + pmd_t l2; + pte_t l1; + + xen_raw_printk("walk %p, %lx -> %d %d %d %d\n", + pgd, addr, l4idx, l3idx, l2idx, l1idx); + + l4 = pgd[l4idx]; + xen_raw_printk(" l4: %016lx\n", l4.pgd); + xen_raw_printk(" %016lx\n", pgd_val(l4)); + + l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx]; + xen_raw_printk(" l3: %016lx\n", l3.pud); + xen_raw_printk(" %016lx\n", pud_val(l3)); + + l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx]; + xen_raw_printk(" l2: %016lx\n", l2.pmd); + xen_raw_printk(" %016lx\n", pmd_val(l2)); + + l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx]; + xen_raw_printk(" l1: %016lx\n", l1.pte); + xen_raw_printk(" %016lx\n", pte_val(l1)); +} +#endif + +static void set_page_prot(void *addr, pgprot_t prot) +{ + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; + pte_t pte = pfn_pte(pfn, prot); + + xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n", + addr, pfn, get_phys_to_machine(pfn), + pgprot_val(prot), pte.pte); + + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) + BUG(); +} + +static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +{ + unsigned pmdidx, pteidx; + unsigned ident_pte; + unsigned long pfn; + + ident_pte = 0; + pfn = 0; + for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + pte_t *pte_page; + + /* Reuse or allocate a page of ptes */ + if (pmd_present(pmd[pmdidx])) + pte_page = m2v(pmd[pmdidx].pmd); + else { + /* Check for free pte pages */ + if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + break; + + pte_page = &level1_ident_pgt[ident_pte]; + ident_pte += PTRS_PER_PTE; + + pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); + } + + /* Install mappings */ + for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + pte_t pte; + + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; + + if (!pte_none(pte_page[pteidx])) + continue; + + pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); + pte_page[pteidx] = pte; + } + } + + for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); + + set_page_prot(pmd, PAGE_KERNEL_RO); +} + +#ifdef CONFIG_X86_64 +static void convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for(i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); +} + +/* + * Set up the inital kernel pagetable. + * + * We can construct this by grafting the Xen provided pagetable into + * head_64.S's preconstructed pagetables. We copy the Xen L2's into + * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This + * means that only the kernel has a physical mapping to start with - + * but that's enough to get __va working. We need to fill in the rest + * of the physical mapping once some sort of allocator has been set + * up. + */ +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +{ + pud_t *l3; + pmd_t *l2; + + /* Zap identity mapping */ + init_level4_pgt[0] = __pgd(0); + + /* Pre-constructed entries are in pfn, so convert to mfn */ + convert_pfn_mfn(init_level4_pgt); + convert_pfn_mfn(level3_ident_pgt); + convert_pfn_mfn(level3_kernel_pgt); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + + memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); + memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + /* Set up identity map */ + xen_map_identity_early(level2_ident_pgt, max_pfn); + + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + /* Switch over */ + pgd = init_level4_pgt; + + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(pgd)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + + reserve_early(__pa(xen_start_info->pt_base), + __pa(xen_start_info->pt_base + + xen_start_info->nr_pt_frames * PAGE_SIZE), + "XEN PAGETABLES"); + + return pgd; +} +#else /* !CONFIG_X86_64 */ +static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; + +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) +{ + pmd_t *kernel_pmd; + + init_pg_tables_start = __pa(pgd); + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); + + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + + xen_map_identity_early(level2_kernel_pgt, max_pfn); + + memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], + __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + set_page_prot(empty_zero_page, PAGE_KERNEL_RO); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + xen_write_cr3(__pa(swapper_pg_dir)); + + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); + + return swapper_pg_dir; } +#endif /* CONFIG_X86_64 */ /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) @@ -1301,53 +1681,56 @@ asmlinkage void __init xen_start_kernel(void) machine_ops = xen_machine_ops; -#ifdef CONFIG_SMP - smp_ops = xen_smp_ops; +#ifdef CONFIG_X86_64 + /* Disable until direct per-cpu data access. */ + have_vcpu_info_placement = 0; + x86_64_init_pda(); #endif + xen_smp_init(); + /* Get mfn list */ if (!xen_feature(XENFEAT_auto_translated_physmap)) xen_build_dynamic_phys_to_machine(); pgd = (pgd_t *)xen_start_info->pt_base; - init_pg_tables_start = __pa(pgd); - init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; - max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; - - init_mm.pgd = pgd; /* use the Xen pagetables to start */ - - /* keep using Xen gdt for now; no urgent need to change it */ - - x86_write_percpu(xen_cr3, __pa(pgd)); - x86_write_percpu(xen_current_cr3, __pa(pgd)); + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!is_initial_xendomain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + xen_raw_console_write("mapping kernel into physical memory\n"); + pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + + init_mm.pgd = pgd; + + /* keep using Xen gdt for now; no urgent need to change it */ + pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; - /* Prevent unwanted bits from being set in PTEs. */ - __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!is_initial_xendomain()) - __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - /* set the limit of our address space */ xen_reserve_top(); +#ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); new_cpu_data.hard_math = 1; new_cpu_data.x86_capability[0] = cpuid_edx(1); +#endif /* Poke various useful things into boot_params */ boot_params.hdr.type_of_loader = (9 << 4) | 0; boot_params.hdr.ramdisk_image = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; + boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); if (!is_initial_xendomain()) { add_preferred_console("xenboot", 0, NULL); @@ -1355,6 +1738,21 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("hvc", 0, NULL); } + xen_raw_console_write("about to get started...\n"); + +#if 0 + xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n", + &boot_params, __pa_symbol(&boot_params), + __va(__pa_symbol(&boot_params))); + + walk(pgd, &boot_params); + walk(pgd, __va(__pa(&boot_params))); +#endif + /* Start the world */ +#ifdef CONFIG_X86_32 i386_start_kernel(); +#else + x86_64_start_reservations((char *)__pa_symbol(&boot_params)); +#endif } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ff0aa74afaa1..a44d56e38bd1 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -44,8 +44,10 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> +#include <asm/fixmap.h> #include <asm/mmu_context.h> #include <asm/paravirt.h> +#include <asm/linkage.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> @@ -56,26 +58,29 @@ #include "multicalls.h" #include "mmu.h" +/* + * Just beyond the highest usermode address. STACK_TOP_MAX has a + * redzone above it, so round it up to a PGD boundary. + */ +#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) + + #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) /* Placeholder for holes in the address space */ -static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] - __attribute__((section(".data.page_aligned"))) = +static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; /* Array of pointers to pages containing p2m entries */ -static unsigned long *p2m_top[TOP_ENTRIES] - __attribute__((section(".data.page_aligned"))) = +static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static unsigned long p2m_top_mfn[TOP_ENTRIES] - __attribute__((section(".bss.page_aligned"))); +static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; -static unsigned long p2m_top_mfn_list[ - PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] - __attribute__((section(".bss.page_aligned"))); +static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] + __page_aligned_bss; static inline unsigned p2m_top_index(unsigned long pfn) { @@ -181,15 +186,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } -xmaddr_t arbitrary_virt_to_machine(unsigned long address) +xmaddr_t arbitrary_virt_to_machine(void *vaddr) { + unsigned long address = (unsigned long)vaddr; unsigned int level; pte_t *pte = lookup_address(address, &level); unsigned offset = address & ~PAGE_MASK; BUG_ON(pte == NULL); - return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); + return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); } void make_lowmem_page_readonly(void *vaddr) @@ -256,7 +262,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) xen_mc_batch(); - u.ptr = virt_to_machine(ptr).maddr; + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pmd_val_ma(val); extend_mmu_update(&u); @@ -283,35 +290,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) */ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - BUG(); - return; - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - BUG(); - return; - } - pte = pte_offset_kernel(pmd, vaddr); - /* <mfn,flags> stored as-is, to permit clearing entries */ - xen_set_pte(pte, mfn_pte(mfn, flags)); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); + set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); } void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, @@ -418,7 +397,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) xen_mc_batch(); - u.ptr = virt_to_machine(ptr).maddr; + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pud_val_ma(val); extend_mmu_update(&u); @@ -441,14 +421,19 @@ void xen_set_pud(pud_t *ptr, pud_t val) void xen_set_pte(pte_t *ptep, pte_t pte) { +#ifdef CONFIG_X86_PAE ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; +#else + *ptep = pte; +#endif } +#ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { - set_64bit((u64 *)ptep, pte_val_ma(pte)); + set_64bit((u64 *)ptep, native_pte_val(pte)); } void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -462,6 +447,7 @@ void xen_pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, __pmd(0)); } +#endif /* CONFIG_X86_PAE */ pmd_t xen_make_pmd(pmdval_t pmd) { @@ -469,78 +455,189 @@ pmd_t xen_make_pmd(pmdval_t pmd) return native_make_pmd(pmd); } +#if PAGETABLE_LEVELS == 4 +pudval_t xen_pud_val(pud_t pud) +{ + return pte_mfn_to_pfn(pud.pud); +} + +pud_t xen_make_pud(pudval_t pud) +{ + pud = pte_pfn_to_mfn(pud); + + return native_make_pud(pud); +} + +pgd_t *xen_get_user_pgd(pgd_t *pgd) +{ + pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); + unsigned offset = pgd - pgd_page; + pgd_t *user_ptr = NULL; + + if (offset < pgd_index(USER_LIMIT)) { + struct page *page = virt_to_page(pgd_page); + user_ptr = (pgd_t *)page->private; + if (user_ptr) + user_ptr += offset; + } + + return user_ptr; +} + +static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pgd_val_ma(val); + extend_mmu_update(&u); +} + +/* + * Raw hypercall-based set_pgd, intended for in early boot before + * there's a page structure. This implies: + * 1. The only existing pagetable is the kernel's + * 2. It is always pinned + * 3. It has no user pagetable attached to it + */ +void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +{ + preempt_disable(); + + xen_mc_batch(); + + __xen_set_pgd_hyper(ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +void xen_set_pgd(pgd_t *ptr, pgd_t val) +{ + pgd_t *user_ptr = xen_get_user_pgd(ptr); + + /* If page is not pinned, we can just update the entry + directly */ + if (!page_pinned(ptr)) { + *ptr = val; + if (user_ptr) { + WARN_ON(page_pinned(user_ptr)); + *user_ptr = val; + } + return; + } + + /* If it's pinned, then we can at least batch the kernel and + user updates together. */ + xen_mc_batch(); + + __xen_set_pgd_hyper(ptr, val); + if (user_ptr) + __xen_set_pgd_hyper(user_ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); +} +#endif /* PAGETABLE_LEVELS == 4 */ + /* - (Yet another) pagetable walker. This one is intended for pinning a - pagetable. This means that it walks a pagetable and calls the - callback function on each page it finds making up the page table, - at every level. It walks the entire pagetable, but it only bothers - pinning pte pages which are below pte_limit. In the normal case - this will be TASK_SIZE, but at boot we need to pin up to - FIXADDR_TOP. But the important bit is that we don't pin beyond - there, because then we start getting into Xen's ptes. -*/ -static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), + * (Yet another) pagetable walker. This one is intended for pinning a + * pagetable. This means that it walks a pagetable and calls the + * callback function on each page it finds making up the page table, + * at every level. It walks the entire pagetable, but it only bothers + * pinning pte pages which are below limit. In the normal case this + * will be STACK_TOP_MAX, but at boot we need to pin up to + * FIXADDR_TOP. + * + * For 32-bit the important bit is that we don't pin beyond there, + * because then we start getting into Xen's ptes. + * + * For 64-bit, we must skip the Xen hole in the middle of the address + * space, just after the big x86-64 virtual hole. + */ +static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), unsigned long limit) { - pgd_t *pgd = pgd_base; int flush = 0; - unsigned long addr = 0; - unsigned long pgd_next; + unsigned hole_low, hole_high; + unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; + unsigned pgdidx, pudidx, pmdidx; - BUG_ON(limit > FIXADDR_TOP); + /* The limit is the last byte to be touched */ + limit--; + BUG_ON(limit >= FIXADDR_TOP); if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; - for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { + /* + * 64-bit has a great big hole in the middle of the address + * space, which contains the Xen mappings. On 32-bit these + * will end up making a zero-sized hole and so is a no-op. + */ + hole_low = pgd_index(USER_LIMIT); + hole_high = pgd_index(PAGE_OFFSET); + + pgdidx_limit = pgd_index(limit); +#if PTRS_PER_PUD > 1 + pudidx_limit = pud_index(limit); +#else + pudidx_limit = 0; +#endif +#if PTRS_PER_PMD > 1 + pmdidx_limit = pmd_index(limit); +#else + pmdidx_limit = 0; +#endif + + flush |= (*func)(virt_to_page(pgd), PT_PGD); + + for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { pud_t *pud; - unsigned long pud_limit, pud_next; - pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); + if (pgdidx >= hole_low && pgdidx < hole_high) + continue; - if (!pgd_val(*pgd)) + if (!pgd_val(pgd[pgdidx])) continue; - pud = pud_offset(pgd, 0); + pud = pud_offset(&pgd[pgdidx], 0); if (PTRS_PER_PUD > 1) /* not folded */ flush |= (*func)(virt_to_page(pud), PT_PUD); - for (; addr != pud_limit; pud++, addr = pud_next) { + for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { pmd_t *pmd; - unsigned long pmd_limit; - pud_next = pud_addr_end(addr, pud_limit); - - if (pud_next < limit) - pmd_limit = pud_next; - else - pmd_limit = limit; + if (pgdidx == pgdidx_limit && + pudidx > pudidx_limit) + goto out; - if (pud_none(*pud)) + if (pud_none(pud[pudidx])) continue; - pmd = pmd_offset(pud, 0); + pmd = pmd_offset(&pud[pudidx], 0); if (PTRS_PER_PMD > 1) /* not folded */ flush |= (*func)(virt_to_page(pmd), PT_PMD); - for (; addr != pmd_limit; pmd++) { - addr += (PAGE_SIZE * PTRS_PER_PTE); - if ((pmd_limit-1) < (addr-1)) { - addr = pmd_limit; - break; - } + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { + struct page *pte; + + if (pgdidx == pgdidx_limit && + pudidx == pudidx_limit && + pmdidx > pmdidx_limit) + goto out; - if (pmd_none(*pmd)) + if (pmd_none(pmd[pmdidx])) continue; - flush |= (*func)(pmd_page(*pmd), PT_PTE); + pte = pmd_page(pmd[pmdidx]); + flush |= (*func)(pte, PT_PTE); } } } - - flush |= (*func)(virt_to_page(pgd_base), PT_PGD); +out: return flush; } @@ -622,14 +719,31 @@ void xen_pgd_pin(pgd_t *pgd) { xen_mc_batch(); - if (pgd_walk(pgd, pin_page, TASK_SIZE)) { + if (pgd_walk(pgd, pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); xen_mc_batch(); } +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); + + if (user_pgd) { + pin_page(virt_to_page(user_pgd), PT_PGD); + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); + } + } +#else /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is pinnable */ + pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); +#endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); +#endif /* CONFIG_X86_64 */ xen_mc_issue(0); } @@ -656,9 +770,11 @@ void xen_mm_pin_all(void) spin_unlock_irqrestore(&pgd_lock, flags); } -/* The init_mm pagetable is really pinned as soon as its created, but - that's before we have page structures to store the bits. So do all - the book-keeping now. */ +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. So do all + * the book-keeping now. + */ static __init int mark_pinned(struct page *page, enum pt_level level) { SetPagePinned(page); @@ -708,7 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd) xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - pgd_walk(pgd, unpin_page, TASK_SIZE); +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) { + xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); + unpin_page(virt_to_page(user_pgd), PT_PGD); + } + } +#endif + +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is unpinned */ + pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); +#endif + + pgd_walk(pgd, unpin_page, USER_LIMIT); xen_mc_issue(0); } @@ -727,7 +859,6 @@ void xen_mm_unpin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { BUG_ON(!PagePinned(page)); - printk("unpinning pinned %p\n", page_address(page)); xen_pgd_unpin((pgd_t *)page_address(page)); ClearPageSavePinned(page); } @@ -757,8 +888,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) static void drop_other_mm_ref(void *info) { struct mm_struct *mm = info; + struct mm_struct *active_mm; + +#ifdef CONFIG_X86_64 + active_mm = read_pda(active_mm); +#else + active_mm = __get_cpu_var(cpu_tlbstate).active_mm; +#endif - if (__get_cpu_var(cpu_tlbstate).active_mm == mm) + if (active_mm == mm) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 297bf9f5b8bc..0f59bd03f9e3 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -10,18 +10,6 @@ enum pt_level { PT_PTE }; -/* - * Page-directory addresses above 4GB do not fit into architectural %cr3. - * When accessing %cr3, or equivalent field in vcpu_guest_context, guests - * must use the following accessor macros to pack/unpack valid MFNs. - * - * Note that Xen is using the fact that the pagetable base is always - * page-aligned, and putting the 12 MSB of the address into the 12 LSB - * of cr3. - */ -#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) -#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) - void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); @@ -44,13 +32,26 @@ pgd_t xen_make_pgd(pgdval_t); void xen_set_pte(pte_t *ptep, pte_t pteval); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); + +#ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_pmd_clear(pmd_t *pmdp); +#endif /* CONFIG_X86_PAE */ + void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); void xen_set_pud(pud_t *ptr, pud_t val); void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); void xen_set_pud_hyper(pud_t *ptr, pud_t val); -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_pmd_clear(pmd_t *pmdp); + +#if PAGETABLE_LEVELS == 4 +pudval_t xen_pud_val(pud_t pud); +pud_t xen_make_pud(pudval_t pudval); +void xen_set_pgd(pgd_t *pgdp, pgd_t pgd); +void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); +#endif + +pgd_t *xen_get_user_pgd(pgd_t *pgd); pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 3c63c4da7ed1..9efd1c6c9776 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -76,6 +76,7 @@ void xen_mc_flush(void) if (ret) { printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", ret, smp_processor_id()); + dump_stack(); for (i = 0; i < b->mcidx; i++) { printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", i+1, b->mcidx, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e0a39595bde3..b6acc3a0af46 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -83,30 +83,72 @@ static void xen_idle(void) /* * Set the bit indicating "nosegneg" library variants should be used. + * We only need to bother in pure 32-bit mode; compat 32-bit processes + * can have un-truncated segments, so wrapping around is allowed. */ static void __init fiddle_vdso(void) { - extern const char vdso32_default_start; - u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); +#ifdef CONFIG_X86_32 + u32 *mask; + mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK); *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; + mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK); + *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; +#endif } -void xen_enable_sysenter(void) +static __cpuinit int register_callback(unsigned type, const void *func) { - int cpu = smp_processor_id(); - extern void xen_sysenter_target(void); - /* Mask events on entry, even though they get enabled immediately */ - static struct callback_register sysenter = { - .type = CALLBACKTYPE_sysenter, - .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target }, + struct callback_register callback = { + .type = type, + .address = XEN_CALLBACK(__KERNEL_CS, func), .flags = CALLBACKF_mask_events, }; - if (!boot_cpu_has(X86_FEATURE_SEP) || - HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { - clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); + return HYPERVISOR_callback_op(CALLBACKOP_register, &callback); +} + +void __cpuinit xen_enable_sysenter(void) +{ + extern void xen_sysenter_target(void); + int ret; + unsigned sysenter_feature; + +#ifdef CONFIG_X86_32 + sysenter_feature = X86_FEATURE_SEP; +#else + sysenter_feature = X86_FEATURE_SYSENTER32; +#endif + + if (!boot_cpu_has(sysenter_feature)) + return; + + ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target); + if(ret != 0) + setup_clear_cpu_cap(sysenter_feature); +} + +void __cpuinit xen_enable_syscall(void) +{ +#ifdef CONFIG_X86_64 + int ret; + extern void xen_syscall_target(void); + extern void xen_syscall32_target(void); + + ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); + if (ret != 0) { + printk(KERN_ERR "Failed to set syscall callback: %d\n", ret); + /* Pretty fatal; 64-bit userspace has no other + mechanism for syscalls. */ } + + if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { + ret = register_callback(CALLBACKTYPE_syscall32, + xen_syscall32_target); + if (ret != 0) + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); + } +#endif /* CONFIG_X86_64 */ } void __init xen_arch_setup(void) @@ -120,10 +162,12 @@ void __init xen_arch_setup(void) if (!xen_feature(XENFEAT_auto_translated_physmap)) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); - HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, - __KERNEL_CS, (unsigned long)xen_failsafe_callback); + if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || + register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) + BUG(); xen_enable_sysenter(); + xen_enable_syscall(); set_iopl.iopl = 1; rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); @@ -143,11 +187,6 @@ void __init xen_arch_setup(void) pm_idle = xen_idle; -#ifdef CONFIG_SMP - /* fill cpus_possible with all available cpus */ - xen_fill_possible_map(); -#endif - paravirt_disable_iospace(); fiddle_vdso(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 233156f39b7f..f702199312a5 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -66,13 +66,22 @@ static __cpuinit void cpu_bringup_and_idle(void) int cpu = smp_processor_id(); cpu_init(); + preempt_disable(); + xen_enable_sysenter(); + xen_enable_syscall(); - preempt_disable(); - per_cpu(cpu_state, cpu) = CPU_ONLINE; + cpu = smp_processor_id(); + smp_store_cpu_info(cpu); + cpu_data(cpu).x86_max_cores = 1; + set_cpu_sibling_map(cpu); xen_setup_cpu_clockevents(); + cpu_set(cpu, cpu_online_map); + x86_write_percpu(cpu_state, CPU_ONLINE); + wmb(); + /* We can take interrupts now: we're officially "up". */ local_irq_enable(); @@ -141,56 +150,37 @@ static int xen_smp_intr_init(unsigned int cpu) return rc; } -void __init xen_fill_possible_map(void) +static void __init xen_fill_possible_map(void) { int i, rc; for (i = 0; i < NR_CPUS; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) + if (rc >= 0) { + num_processors++; cpu_set(i, cpu_possible_map); + } } } -void __init xen_smp_prepare_boot_cpu(void) +static void __init xen_smp_prepare_boot_cpu(void) { - int cpu; - BUG_ON(smp_processor_id() != 0); native_smp_prepare_boot_cpu(); /* We've switched to the "real" per-cpu gdt, so make sure the old memory can be recycled */ - make_lowmem_page_readwrite(&per_cpu__gdt_page); - - for_each_possible_cpu(cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - /* - * cpu_core_map lives in a per cpu area that is cleared - * when the per cpu array is allocated. - * - * cpus_clear(per_cpu(cpu_core_map, cpu)); - */ - } + make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); xen_setup_vcpu_info_placement(); } -void __init xen_smp_prepare_cpus(unsigned int max_cpus) +static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - for_each_possible_cpu(cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - /* - * cpu_core_ map will be zeroed when the per - * cpu area is allocated. - * - * cpus_clear(per_cpu(cpu_core_map, cpu)); - */ - } - smp_store_cpu_info(0); + cpu_data(0).x86_max_cores = 1; set_cpu_sibling_map(0); if (xen_smp_intr_init(0)) @@ -225,7 +215,7 @@ static __cpuinit int cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; - struct gdt_page *gdt = &per_cpu(gdt_page, cpu); + struct desc_struct *gdt; if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) return 0; @@ -234,12 +224,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) if (ctxt == NULL) return -ENOMEM; + gdt = get_cpu_gdt_table(cpu); + ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.fs = __KERNEL_PERCPU; - ctxt->user_regs.gs = 0; ctxt->user_regs.ss = __KERNEL_DS; +#ifdef CONFIG_X86_32 + ctxt->user_regs.fs = __KERNEL_PERCPU; +#endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ @@ -249,11 +242,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); - make_lowmem_page_readonly(gdt->gdt); + BUG_ON((unsigned long)gdt & ~PAGE_MASK); + make_lowmem_page_readonly(gdt); - ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); - ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); + ctxt->gdt_frames[0] = virt_to_mfn(gdt); + ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); @@ -261,9 +254,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->kernel_ss = __KERNEL_DS; ctxt->kernel_sp = idle->thread.sp0; +#ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; - ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_cs = __KERNEL_CS; +#endif + ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); @@ -276,7 +271,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -int __cpuinit xen_cpu_up(unsigned int cpu) +static int __cpuinit xen_cpu_up(unsigned int cpu) { struct task_struct *idle = idle_task(cpu); int rc; @@ -287,11 +282,28 @@ int __cpuinit xen_cpu_up(unsigned int cpu) return rc; #endif +#ifdef CONFIG_X86_64 + /* Allocate node local memory for AP pdas */ + WARN_ON(cpu == 0); + if (cpu > 0) { + rc = get_local_pda(cpu); + if (rc) + return rc; + } +#endif + +#ifdef CONFIG_X86_32 init_gdt(cpu); per_cpu(current_task, cpu) = idle; irq_ctx_init(cpu); +#else + cpu_pda(cpu)->pcurrent = idle; + clear_tsk_thread_flag(idle, TIF_FORK); +#endif xen_setup_timer(cpu); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* make sure interrupts start blocked */ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; @@ -306,20 +318,18 @@ int __cpuinit xen_cpu_up(unsigned int cpu) if (rc) return rc; - smp_store_cpu_info(cpu); - set_cpu_sibling_map(cpu); - /* This must be done before setting cpu_online_map */ - wmb(); - - cpu_set(cpu, cpu_online_map); - rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); BUG_ON(rc); + while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + barrier(); + } + return 0; } -void xen_smp_cpus_done(unsigned int max_cpus) +static void xen_smp_cpus_done(unsigned int max_cpus) { } @@ -335,12 +345,12 @@ static void stop_self(void *v) BUG(); } -void xen_smp_send_stop(void) +static void xen_smp_send_stop(void) { smp_call_function(stop_self, NULL, 0); } -void xen_smp_send_reschedule(int cpu) +static void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } @@ -355,7 +365,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) xen_send_IPI_one(cpu, vector); } -void xen_smp_send_call_function_ipi(cpumask_t mask) +static void xen_smp_send_call_function_ipi(cpumask_t mask) { int cpu; @@ -370,7 +380,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask) } } -void xen_smp_send_call_function_single_ipi(int cpu) +static void xen_smp_send_call_function_single_ipi(int cpu) { xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); } @@ -379,7 +389,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { irq_enter(); generic_smp_call_function_interrupt(); +#ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif irq_exit(); return IRQ_HANDLED; @@ -389,8 +403,31 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) { irq_enter(); generic_smp_call_function_single_interrupt(); +#ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif irq_exit(); return IRQ_HANDLED; } + +static const struct smp_ops xen_smp_ops __initdata = { + .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_smp_prepare_cpus, + .cpu_up = xen_cpu_up, + .smp_cpus_done = xen_smp_cpus_done, + + .smp_send_stop = xen_smp_send_stop, + .smp_send_reschedule = xen_smp_send_reschedule, + + .send_call_func_ipi = xen_smp_send_call_function_ipi, + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, +}; + +void __init xen_smp_init(void) +{ + smp_ops = xen_smp_ops; + xen_fill_possible_map(); +} diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 251669a932d4..2a234db5949b 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled) xen_cpu_initialized_map = cpu_online_map; #endif xen_vcpu_restore(); - xen_timer_resume(); } } +void xen_arch_resume(void) +{ + /* nothing */ +} diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S index 2497a30f41de..2497a30f41de 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm_32.S diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S new file mode 100644 index 000000000000..4038cbfe3331 --- /dev/null +++ b/arch/x86/xen/xen-asm_64.S @@ -0,0 +1,271 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in pda) of the operations + here; the indirect forms are better handled in C, since they're + generally too large to inline anyway. + */ + +#include <linux/linkage.h> + +#include <asm/asm-offsets.h> +#include <asm/processor-flags.h> +#include <asm/errno.h> +#include <asm/segment.h> + +#include <xen/interface/xen.h> + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +#if 0 +#include <asm/percpu.h> + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + /* Unmask events */ + movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* Test for pending */ + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) + jz 1f + +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) + testb $X86_EFLAGS_IF>>8, %ah + setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. + */ +check_events: + push %rax + push %rcx + push %rdx + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + call force_evtchn_callback + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rdx + pop %rcx + pop %rax + ret +#endif + +ENTRY(xen_adjust_exception_frame) + mov 8+0(%rsp),%rcx + mov 8+8(%rsp),%r11 + ret $16 + +hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 +/* + Xen64 iret frame: + + ss + rsp + rflags + cs + rip <-- standard iret frame + + flags + + rcx } + r11 }<-- pushed by hypercall page +rsp -> rax } + */ +ENTRY(xen_iret) + pushq $0 +1: jmp hypercall_iret +ENDPATCH(xen_iret) +RELOC(xen_iret, 1b+1) + +/* + sysexit is not used for 64-bit processes, so it's + only ever used to return to 32-bit compat userspace. + */ +ENTRY(xen_sysexit) + pushq $__USER32_DS + pushq %rcx + pushq $X86_EFLAGS_IF + pushq $__USER32_CS + pushq %rdx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysexit) +RELOC(xen_sysexit, 1b+1) + +ENTRY(xen_sysret64) + /* We're already on the usermode stack at this point, but still + with the kernel gs, so we can easily switch back */ + movq %rsp, %gs:pda_oldrsp + movq %gs:pda_kernelstack,%rsp + + pushq $__USER_DS + pushq %gs:pda_oldrsp + pushq %r11 + pushq $__USER_CS + pushq %rcx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysret64) +RELOC(xen_sysret64, 1b+1) + +ENTRY(xen_sysret32) + /* We're already on the usermode stack at this point, but still + with the kernel gs, so we can easily switch back */ + movq %rsp, %gs:pda_oldrsp + movq %gs:pda_kernelstack, %rsp + + pushq $__USER32_DS + pushq %gs:pda_oldrsp + pushq %r11 + pushq $__USER32_CS + pushq %rcx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysret32) +RELOC(xen_sysret32, 1b+1) + +/* + Xen handles syscall callbacks much like ordinary exceptions, + which means we have: + - kernel gs + - kernel rsp + - an iret-like stack frame on the stack (including rcx and r11): + ss + rsp + rflags + cs + rip + r11 + rsp-> rcx + + In all the entrypoints, we undo all that to make it look + like a CPU-generated syscall/sysenter and jump to the normal + entrypoint. + */ + +.macro undo_xen_syscall + mov 0*8(%rsp),%rcx + mov 1*8(%rsp),%r11 + mov 5*8(%rsp),%rsp +.endm + +/* Normal 64-bit system call target */ +ENTRY(xen_syscall_target) + undo_xen_syscall + jmp system_call_after_swapgs +ENDPROC(xen_syscall_target) + +#ifdef CONFIG_IA32_EMULATION + +/* 32-bit compat syscall target */ +ENTRY(xen_syscall32_target) + undo_xen_syscall + jmp ia32_cstar_target +ENDPROC(xen_syscall32_target) + +/* 32-bit compat sysenter target */ +ENTRY(xen_sysenter_target) + undo_xen_syscall + jmp ia32_sysenter_target +ENDPROC(xen_sysenter_target) + +#else /* !CONFIG_IA32_EMULATION */ + +ENTRY(xen_syscall32_target) +ENTRY(xen_sysenter_target) + lea 16(%rsp), %rsp /* strip %rcx,%r11 */ + mov $-ENOSYS, %rax + pushq $VGCF_in_syscall + jmp hypercall_iret +ENDPROC(xen_syscall32_target) +ENDPROC(xen_sysenter_target) + +#endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7c0cf6320a0a..63d49a523ed3 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -5,15 +5,24 @@ #include <linux/elfnote.h> #include <linux/init.h> + #include <asm/boot.h> +#include <asm/asm.h> +#include <asm/page.h> + #include <xen/interface/elfnote.h> #include <asm/xen/interface.h> __INIT ENTRY(startup_xen) - movl %esi,xen_start_info cld - movl $(init_thread_union+THREAD_SIZE),%esp +#ifdef CONFIG_X86_32 + mov %esi,xen_start_info + mov $init_thread_union+THREAD_SIZE,%esp +#else + mov %rsi,xen_start_info + mov $init_thread_union+THREAD_SIZE,%rsp +#endif jmp xen_start_kernel __FINIT @@ -21,21 +30,26 @@ ENTRY(startup_xen) .pushsection .text .align PAGE_SIZE_asm ENTRY(hypercall_page) - .skip 0x1000 + .skip PAGE_SIZE_asm .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) - ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) +#ifdef CONFIG_X86_32 + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) +#else + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) +#endif + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) - ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) #endif /*CONFIG_XEN */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 6f4b1045c1c2..dd3c23152a2e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -26,6 +26,7 @@ char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); void xen_enable_sysenter(void); +void xen_enable_syscall(void); void xen_vcpu_restore(void); void __init xen_build_dynamic_phys_to_machine(void); @@ -37,7 +38,6 @@ void __init xen_time_init(void); unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); -void xen_timer_resume(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); @@ -45,20 +45,15 @@ bool xen_vcpu_stolen(int vcpu); void xen_mark_init_mm_pinned(void); -void __init xen_fill_possible_map(void); - void __init xen_setup_vcpu_info_placement(void); -void xen_smp_prepare_boot_cpu(void); -void xen_smp_prepare_cpus(unsigned int max_cpus); -int xen_cpu_up(unsigned int cpu); -void xen_smp_cpus_done(unsigned int max_cpus); -void xen_smp_send_stop(void); -void xen_smp_send_reschedule(int cpu); -void xen_smp_send_call_function_ipi(cpumask_t mask); -void xen_smp_send_call_function_single_ipi(int cpu); +#ifdef CONFIG_SMP +void xen_smp_init(void); extern cpumask_t xen_cpu_initialized_map; +#else +static inline void xen_smp_init(void) {} +#endif /* Declare an asm function, along with symbols needed to make it @@ -73,7 +68,11 @@ DECL_ASM(void, xen_irq_disable_direct, void); DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); +/* These are not functions, and cannot be called normally */ void xen_iret(void); void xen_sysexit(void); +void xen_sysret32(void); +void xen_sysret64(void); +void xen_adjust_exception_frame(void); #endif /* XEN_OPS_H */ diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index ef671d1a3bf0..902bbe788215 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -92,7 +92,7 @@ struct netfront_info { */ union skb_entry { struct sk_buff *skb; - unsigned link; + unsigned long link; } tx_skbs[NET_TX_RING_SIZE]; grant_ref_t gref_tx_head; grant_ref_t grant_tx_ref[NET_TX_RING_SIZE]; @@ -125,6 +125,17 @@ struct netfront_rx_info { struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; }; +static void skb_entry_set_link(union skb_entry *list, unsigned short id) +{ + list->link = id; +} + +static int skb_entry_is_link(const union skb_entry *list) +{ + BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link)); + return ((unsigned long)list->skb < PAGE_OFFSET); +} + /* * Access macros for acquiring freeing slots in tx_skbs[]. */ @@ -132,7 +143,7 @@ struct netfront_rx_info { static void add_id_to_freelist(unsigned *head, union skb_entry *list, unsigned short id) { - list[id].link = *head; + skb_entry_set_link(&list[id], *head); *head = id; } @@ -993,7 +1004,7 @@ static void xennet_release_tx_bufs(struct netfront_info *np) for (i = 0; i < NET_TX_RING_SIZE; i++) { /* Skip over entries which are actually freelist references */ - if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET) + if (skb_entry_is_link(&np->tx_skbs[i])) continue; skb = np->tx_skbs[i].skb; @@ -1123,7 +1134,7 @@ static struct net_device * __devinit xennet_create_dev(struct xenbus_device *dev /* Initialise tx_skbs as a free chain containing every entry. */ np->tx_skb_freelist = 0; for (i = 0; i < NET_TX_RING_SIZE; i++) { - np->tx_skbs[i].link = i+1; + skb_entry_set_link(&np->tx_skbs[i], i+1); np->grant_tx_ref[i] = GRANT_INVALID_REF; } diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 5b546e365f00..a5bc91ae6ff6 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -63,11 +63,12 @@ static int xen_suspend(void *data) gnttab_resume(); xen_mm_unpin_all(); - device_power_up(); + device_power_up(PMSG_RESUME); if (!*cancelled) { xen_irq_resume(); xen_console_resume(); + xen_timer_resume(); } return 0; @@ -107,12 +108,13 @@ static void do_suspend(void) goto out; } - if (!cancelled) + if (!cancelled) { + xen_arch_resume(); xenbus_resume(); - else + } else xenbus_suspend_cancel(); - device_resume(); + device_resume(PMSG_RESUME); /* Make sure timer events get retriggered on all CPUs */ clock_was_set(); diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h index ef5e8ec6a6ab..eef8095a09dc 100644 --- a/include/asm-x86/paravirt.h +++ b/include/asm-x86/paravirt.h @@ -1396,8 +1396,8 @@ extern struct paravirt_patch_site __parainstructions[], * caller saved registers but the argument parameter */ #define PV_SAVE_REGS "pushq %%rdi;" #define PV_RESTORE_REGS "popq %%rdi;" -#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx" -#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx" +#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi" +#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi" #define PV_FLAGS_ARG "D" #endif @@ -1489,8 +1489,26 @@ static inline unsigned long __raw_local_irq_save(void) #ifdef CONFIG_X86_64 -#define PV_SAVE_REGS pushq %rax; pushq %rdi; pushq %rcx; pushq %rdx -#define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax +#define PV_SAVE_REGS \ + push %rax; \ + push %rcx; \ + push %rdx; \ + push %rsi; \ + push %rdi; \ + push %r8; \ + push %r9; \ + push %r10; \ + push %r11 +#define PV_RESTORE_REGS \ + pop %r11; \ + pop %r10; \ + pop %r9; \ + pop %r8; \ + pop %rdi; \ + pop %rsi; \ + pop %rdx; \ + pop %rcx; \ + pop %rax #define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8) #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h index 912a3a17b9db..4e91ee1e37aa 100644 --- a/include/asm-x86/percpu.h +++ b/include/asm-x86/percpu.h @@ -22,6 +22,32 @@ DECLARE_PER_CPU(struct x8664_pda, pda); +/* + * These are supposed to be implemented as a single instruction which + * operates on the per-cpu data base segment. x86-64 doesn't have + * that yet, so this is a fairly inefficient workaround for the + * meantime. The single instruction is atomic with respect to + * preemption and interrupts, so we need to explicitly disable + * interrupts here to achieve the same effect. However, because it + * can be used from within interrupt-disable/enable, we can't actually + * disable interrupts; disabling preemption is enough. + */ +#define x86_read_percpu(var) \ + ({ \ + typeof(per_cpu_var(var)) __tmp; \ + preempt_disable(); \ + __tmp = __get_cpu_var(var); \ + preempt_enable(); \ + __tmp; \ + }) + +#define x86_write_percpu(var, val) \ + do { \ + preempt_disable(); \ + __get_cpu_var(var) = (val); \ + preempt_enable(); \ + } while(0) + #else /* CONFIG_X86_64 */ #ifdef __ASSEMBLY__ diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index 49cbd76b9547..96aa76e691d8 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -302,6 +302,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, /* Install a pte for a particular vaddr in kernel space. */ void set_pte_vaddr(unsigned long vaddr, pte_t pte); +#ifdef CONFIG_X86_32 +extern void native_pagetable_setup_start(pgd_t *base); +extern void native_pagetable_setup_done(pgd_t *base); +#else +static inline void native_pagetable_setup_start(pgd_t *base) {} +static inline void native_pagetable_setup_done(pgd_t *base) {} +#endif + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else /* !CONFIG_PARAVIRT */ @@ -333,6 +341,16 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte); #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) + +static inline void __init paravirt_pagetable_setup_start(pgd_t *base) +{ + native_pagetable_setup_start(base); +} + +static inline void __init paravirt_pagetable_setup_done(pgd_t *base) +{ + native_pagetable_setup_done(base); +} #endif /* CONFIG_PARAVIRT */ #endif /* __ASSEMBLY__ */ diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index ec871c420d7e..0611abf96a5e 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -171,21 +171,6 @@ do { \ */ #define update_mmu_cache(vma, address, pte) do { } while (0) -extern void native_pagetable_setup_start(pgd_t *base); -extern void native_pagetable_setup_done(pgd_t *base); - -#ifndef CONFIG_PARAVIRT -static inline void __init paravirt_pagetable_setup_start(pgd_t *base) -{ - native_pagetable_setup_start(base); -} - -static inline void __init paravirt_pagetable_setup_done(pgd_t *base) -{ - native_pagetable_setup_done(base); -} -#endif /* !CONFIG_PARAVIRT */ - #endif /* !__ASSEMBLY__ */ /* diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index fa7208b483ca..805d3128bfc4 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -16,6 +16,8 @@ extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; +extern pmd_t level2_fixmap_pgt[512]; +extern pmd_t level2_ident_pgt[512]; extern pgd_t init_level4_pgt[]; #define swapper_pg_dir init_level4_pgt diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h index 90ab2225e71b..659492624e74 100644 --- a/include/asm-x86/setup.h +++ b/include/asm-x86/setup.h @@ -76,6 +76,7 @@ extern unsigned long init_pg_tables_start; extern unsigned long init_pg_tables_end; #else +void __init x86_64_init_pda(void); void __init x86_64_start_kernel(char *real_mode); void __init x86_64_start_reservations(char *real_mode_data); diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index c2784b3e0b77..3c877f74f279 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -25,6 +25,8 @@ extern cpumask_t cpu_callin_map; extern void (*mtrr_hook)(void); extern void zap_low_mappings(void); +extern int __cpuinit get_local_pda(int cpu); + extern int smp_num_siblings; extern unsigned int num_processors; extern cpumask_t cpu_initialized; diff --git a/include/asm-x86/vdso.h b/include/asm-x86/vdso.h index 86e085e003d2..8e18fb80f5e6 100644 --- a/include/asm-x86/vdso.h +++ b/include/asm-x86/vdso.h @@ -36,4 +36,12 @@ extern const char VDSO32_PRELINK[]; extern void __user __kernel_sigreturn; extern void __user __kernel_rt_sigreturn; +/* + * These symbols are defined by vdso32.S to mark the bounds + * of the ELF DSO images included therein. + */ +extern const char vdso32_int80_start, vdso32_int80_end; +extern const char vdso32_syscall_start, vdso32_syscall_end; +extern const char vdso32_sysenter_start, vdso32_sysenter_end; + #endif /* asm-x86/vdso.h */ diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h index 2a4f9b41d684..91cb7fd5c123 100644 --- a/include/asm-x86/xen/hypercall.h +++ b/include/asm-x86/xen/hypercall.h @@ -40,83 +40,157 @@ #include <xen/interface/sched.h> #include <xen/interface/physdev.h> +/* + * The hypercall asms have to meet several constraints: + * - Work on 32- and 64-bit. + * The two architectures put their arguments in different sets of + * registers. + * + * - Work around asm syntax quirks + * It isn't possible to specify one of the rNN registers in a + * constraint, so we use explicit register variables to get the + * args into the right place. + * + * - Mark all registers as potentially clobbered + * Even unused parameters can be clobbered by the hypervisor, so we + * need to make sure gcc knows it. + * + * - Avoid compiler bugs. + * This is the tricky part. Because x86_32 has such a constrained + * register set, gcc versions below 4.3 have trouble generating + * code when all the arg registers and memory are trashed by the + * asm. There are syntactically simpler ways of achieving the + * semantics below, but they cause the compiler to crash. + * + * The only combination I found which works is: + * - assign the __argX variables first + * - list all actually used parameters as "+r" (__argX) + * - clobber the rest + * + * The result certainly isn't pretty, and it really shows up cpp's + * weakness as as macro language. Sorry. (But let's just give thanks + * there aren't more than 5 arguments...) + */ + extern struct { char _entry[32]; } hypercall_page[]; +#define __HYPERCALL "call hypercall_page+%c[offset]" +#define __HYPERCALL_ENTRY(x) \ + [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0])) + +#ifdef CONFIG_X86_32 +#define __HYPERCALL_RETREG "eax" +#define __HYPERCALL_ARG1REG "ebx" +#define __HYPERCALL_ARG2REG "ecx" +#define __HYPERCALL_ARG3REG "edx" +#define __HYPERCALL_ARG4REG "esi" +#define __HYPERCALL_ARG5REG "edi" +#else +#define __HYPERCALL_RETREG "rax" +#define __HYPERCALL_ARG1REG "rdi" +#define __HYPERCALL_ARG2REG "rsi" +#define __HYPERCALL_ARG3REG "rdx" +#define __HYPERCALL_ARG4REG "r10" +#define __HYPERCALL_ARG5REG "r8" +#endif + +#define __HYPERCALL_DECLS \ + register unsigned long __res asm(__HYPERCALL_RETREG); \ + register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \ + register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \ + register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \ + register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \ + register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; + +#define __HYPERCALL_0PARAM "=r" (__res) +#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1) +#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2) +#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3) +#define __HYPERCALL_4PARAM __HYPERCALL_3PARAM, "+r" (__arg4) +#define __HYPERCALL_5PARAM __HYPERCALL_4PARAM, "+r" (__arg5) + +#define __HYPERCALL_0ARG() +#define __HYPERCALL_1ARG(a1) \ + __HYPERCALL_0ARG() __arg1 = (unsigned long)(a1); +#define __HYPERCALL_2ARG(a1,a2) \ + __HYPERCALL_1ARG(a1) __arg2 = (unsigned long)(a2); +#define __HYPERCALL_3ARG(a1,a2,a3) \ + __HYPERCALL_2ARG(a1,a2) __arg3 = (unsigned long)(a3); +#define __HYPERCALL_4ARG(a1,a2,a3,a4) \ + __HYPERCALL_3ARG(a1,a2,a3) __arg4 = (unsigned long)(a4); +#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5) \ + __HYPERCALL_4ARG(a1,a2,a3,a4) __arg5 = (unsigned long)(a5); + +#define __HYPERCALL_CLOBBER5 "memory" +#define __HYPERCALL_CLOBBER4 __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG +#define __HYPERCALL_CLOBBER3 __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG +#define __HYPERCALL_CLOBBER2 __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG +#define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG +#define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG + #define _hypercall0(type, name) \ ({ \ - long __res; \ - asm volatile ( \ - "call %[call]" \ - : "=a" (__res) \ - : [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ - : "memory" ); \ + __HYPERCALL_DECLS; \ + __HYPERCALL_0ARG(); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_0PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER0); \ (type)__res; \ }) #define _hypercall1(type, name, a1) \ ({ \ - long __res, __ign1; \ - asm volatile ( \ - "call %[call]" \ - : "=a" (__res), "=b" (__ign1) \ - : "1" ((long)(a1)), \ - [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ - : "memory" ); \ + __HYPERCALL_DECLS; \ + __HYPERCALL_1ARG(a1); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_1PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER1); \ (type)__res; \ }) #define _hypercall2(type, name, a1, a2) \ ({ \ - long __res, __ign1, __ign2; \ - asm volatile ( \ - "call %[call]" \ - : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \ - : "1" ((long)(a1)), "2" ((long)(a2)), \ - [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ - : "memory" ); \ + __HYPERCALL_DECLS; \ + __HYPERCALL_2ARG(a1, a2); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_2PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER2); \ (type)__res; \ }) #define _hypercall3(type, name, a1, a2, a3) \ ({ \ - long __res, __ign1, __ign2, __ign3; \ - asm volatile ( \ - "call %[call]" \ - : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ - "=d" (__ign3) \ - : "1" ((long)(a1)), "2" ((long)(a2)), \ - "3" ((long)(a3)), \ - [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ - : "memory" ); \ + __HYPERCALL_DECLS; \ + __HYPERCALL_3ARG(a1, a2, a3); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_3PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER3); \ (type)__res; \ }) #define _hypercall4(type, name, a1, a2, a3, a4) \ ({ \ - long __res, __ign1, __ign2, __ign3, __ign4; \ - asm volatile ( \ - "call %[call]" \ - : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ - "=d" (__ign3), "=S" (__ign4) \ - : "1" ((long)(a1)), "2" ((long)(a2)), \ - "3" ((long)(a3)), "4" ((long)(a4)), \ - [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ - : "memory" ); \ + __HYPERCALL_DECLS; \ + __HYPERCALL_4ARG(a1, a2, a3, a4); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_4PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER4); \ (type)__res; \ }) #define _hypercall5(type, name, a1, a2, a3, a4, a5) \ ({ \ - long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \ - asm volatile ( \ - "call %[call]" \ - : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ - "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \ - : "1" ((long)(a1)), "2" ((long)(a2)), \ - "3" ((long)(a3)), "4" ((long)(a4)), \ - "5" ((long)(a5)), \ - [call] "m" (hypercall_page[__HYPERVISOR_##name]) \ - : "memory" ); \ + __HYPERCALL_DECLS; \ + __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \ + asm volatile (__HYPERCALL \ + : __HYPERCALL_5PARAM \ + : __HYPERCALL_ENTRY(name) \ + : __HYPERCALL_CLOBBER5); \ (type)__res; \ }) @@ -152,6 +226,7 @@ HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp) return _hypercall2(int, stack_switch, ss, esp); } +#ifdef CONFIG_X86_32 static inline int HYPERVISOR_set_callbacks(unsigned long event_selector, unsigned long event_address, @@ -162,6 +237,17 @@ HYPERVISOR_set_callbacks(unsigned long event_selector, event_selector, event_address, failsafe_selector, failsafe_address); } +#else /* CONFIG_X86_64 */ +static inline int +HYPERVISOR_set_callbacks(unsigned long event_address, + unsigned long failsafe_address, + unsigned long syscall_address) +{ + return _hypercall3(int, set_callbacks, + event_address, failsafe_address, + syscall_address); +} +#endif /* CONFIG_X86_{32,64} */ static inline int HYPERVISOR_callback_op(int cmd, void *arg) @@ -223,12 +309,12 @@ static inline int HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val, unsigned long flags) { - unsigned long pte_hi = 0; -#ifdef CONFIG_X86_PAE - pte_hi = new_val.pte_high; -#endif - return _hypercall4(int, update_va_mapping, va, - new_val.pte_low, pte_hi, flags); + if (sizeof(new_val) == sizeof(long)) + return _hypercall3(int, update_va_mapping, va, + new_val.pte, flags); + else + return _hypercall4(int, update_va_mapping, va, + new_val.pte, new_val.pte >> 32, flags); } static inline int @@ -281,12 +367,13 @@ static inline int HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) { - unsigned long pte_hi = 0; -#ifdef CONFIG_X86_PAE - pte_hi = new_val.pte_high; -#endif - return _hypercall5(int, update_va_mapping_otherdomain, va, - new_val.pte_low, pte_hi, flags, domid); + if (sizeof(new_val) == sizeof(long)) + return _hypercall4(int, update_va_mapping_otherdomain, va, + new_val.pte, flags, domid); + else + return _hypercall5(int, update_va_mapping_otherdomain, va, + new_val.pte, new_val.pte >> 32, + flags, domid); } static inline int @@ -301,6 +388,14 @@ HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args) return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); } +#ifdef CONFIG_X86_64 +static inline int +HYPERVISOR_set_segment_base(int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} +#endif + static inline int HYPERVISOR_suspend(unsigned long srec) { @@ -327,14 +422,14 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, { mcl->op = __HYPERVISOR_update_va_mapping; mcl->args[0] = va; -#ifdef CONFIG_X86_PAE - mcl->args[1] = new_val.pte_low; - mcl->args[2] = new_val.pte_high; -#else - mcl->args[1] = new_val.pte_low; - mcl->args[2] = 0; -#endif - mcl->args[3] = flags; + if (sizeof(new_val) == sizeof(long)) { + mcl->args[1] = new_val.pte; + mcl->args[2] = flags; + } else { + mcl->args[1] = new_val.pte; + mcl->args[2] = new_val.pte >> 32; + mcl->args[3] = flags; + } } static inline void @@ -354,15 +449,16 @@ MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long v { mcl->op = __HYPERVISOR_update_va_mapping_otherdomain; mcl->args[0] = va; -#ifdef CONFIG_X86_PAE - mcl->args[1] = new_val.pte_low; - mcl->args[2] = new_val.pte_high; -#else - mcl->args[1] = new_val.pte_low; - mcl->args[2] = 0; -#endif - mcl->args[3] = flags; - mcl->args[4] = domid; + if (sizeof(new_val) == sizeof(long)) { + mcl->args[1] = new_val.pte; + mcl->args[2] = flags; + mcl->args[3] = domid; + } else { + mcl->args[1] = new_val.pte; + mcl->args[2] = new_val.pte >> 32; + mcl->args[3] = flags; + mcl->args[4] = domid; + } } static inline void @@ -370,10 +466,15 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { mcl->op = __HYPERVISOR_update_descriptor; - mcl->args[0] = maddr; - mcl->args[1] = maddr >> 32; - mcl->args[2] = desc.a; - mcl->args[3] = desc.b; + if (sizeof(maddr) == sizeof(long)) { + mcl->args[0] = maddr; + mcl->args[1] = *(unsigned long *)&desc; + } else { + mcl->args[0] = maddr; + mcl->args[1] = maddr >> 32; + mcl->args[2] = desc.a; + mcl->args[3] = desc.b; + } } static inline void diff --git a/include/asm-x86/xen/interface.h b/include/asm-x86/xen/interface.h index 6227000a1e84..9d810f2538a2 100644 --- a/include/asm-x86/xen/interface.h +++ b/include/asm-x86/xen/interface.h @@ -1,13 +1,13 @@ /****************************************************************************** * arch-x86_32.h * - * Guest OS interface to x86 32-bit Xen. + * Guest OS interface to x86 Xen. * * Copyright (c) 2004, K A Fraser */ -#ifndef __XEN_PUBLIC_ARCH_X86_32_H__ -#define __XEN_PUBLIC_ARCH_X86_32_H__ +#ifndef __ASM_X86_XEN_INTERFACE_H +#define __ASM_X86_XEN_INTERFACE_H #ifdef __XEN__ #define __DEFINE_GUEST_HANDLE(name, type) \ @@ -57,6 +57,17 @@ DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); #endif +#ifndef HYPERVISOR_VIRT_START +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +#endif + +#ifndef machine_to_phys_mapping +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +#endif + +/* Maximum number of virtual CPUs in multi-processor guests. */ +#define MAX_VIRT_CPUS 32 + /* * SEGMENT DESCRIPTOR TABLES */ @@ -71,58 +82,21 @@ DEFINE_GUEST_HANDLE(void); #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) /* - * These flat segments are in the Xen-private section of every GDT. Since these - * are also present in the initial GDT, many OSes will be able to avoid - * installing their own GDT. - */ -#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ -#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ -#define FLAT_RING1_SS 0xe021 /* GDT index 260 */ -#define FLAT_RING3_CS 0xe02b /* GDT index 261 */ -#define FLAT_RING3_DS 0xe033 /* GDT index 262 */ -#define FLAT_RING3_SS 0xe033 /* GDT index 262 */ - -#define FLAT_KERNEL_CS FLAT_RING1_CS -#define FLAT_KERNEL_DS FLAT_RING1_DS -#define FLAT_KERNEL_SS FLAT_RING1_SS -#define FLAT_USER_CS FLAT_RING3_CS -#define FLAT_USER_DS FLAT_RING3_DS -#define FLAT_USER_SS FLAT_RING3_SS - -/* And the trap vector is... */ -#define TRAP_INSTR "int $0x82" - -/* - * Virtual addresses beyond this are not modifiable by guest OSes. The - * machine->physical mapping table starts at this address, read-only. - */ -#ifdef CONFIG_X86_PAE -#define __HYPERVISOR_VIRT_START 0xF5800000 -#else -#define __HYPERVISOR_VIRT_START 0xFC000000 -#endif - -#ifndef HYPERVISOR_VIRT_START -#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) -#endif - -#ifndef machine_to_phys_mapping -#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) -#endif - -/* Maximum number of virtual CPUs in multi-processor guests. */ -#define MAX_VIRT_CPUS 32 - -#ifndef __ASSEMBLY__ - -/* * Send an array of these to HYPERVISOR_set_trap_table() + * The privilege level specifies which modes may enter a trap via a software + * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate + * privilege levels as follows: + * Level == 0: Noone may enter + * Level == 1: Kernel may enter + * Level == 2: Kernel may enter + * Level == 3: Everyone may enter */ #define TI_GET_DPL(_ti) ((_ti)->flags & 3) #define TI_GET_IF(_ti) ((_ti)->flags & 4) #define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl)) #define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2)) +#ifndef __ASSEMBLY__ struct trap_info { uint8_t vector; /* exception vector */ uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */ @@ -131,32 +105,21 @@ struct trap_info { }; DEFINE_GUEST_HANDLE_STRUCT(trap_info); -struct cpu_user_regs { - uint32_t ebx; - uint32_t ecx; - uint32_t edx; - uint32_t esi; - uint32_t edi; - uint32_t ebp; - uint32_t eax; - uint16_t error_code; /* private */ - uint16_t entry_vector; /* private */ - uint32_t eip; - uint16_t cs; - uint8_t saved_upcall_mask; - uint8_t _pad0; - uint32_t eflags; /* eflags.IF == !saved_upcall_mask */ - uint32_t esp; - uint16_t ss, _pad1; - uint16_t es, _pad2; - uint16_t ds, _pad3; - uint16_t fs, _pad4; - uint16_t gs, _pad5; +struct arch_shared_info { + unsigned long max_pfn; /* max pfn that appears in table */ + /* Frame containing list of mfns containing list of mfns containing p2m. */ + unsigned long pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; }; -DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs); +#endif /* !__ASSEMBLY__ */ -typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ +#ifdef CONFIG_X86_32 +#include "interface_32.h" +#else +#include "interface_64.h" +#endif +#ifndef __ASSEMBLY__ /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. @@ -173,33 +136,29 @@ struct vcpu_guest_context { unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */ unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */ unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */ + /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */ unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */ unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */ +#ifdef __i386__ unsigned long event_callback_cs; /* CS:EIP of event callback */ unsigned long event_callback_eip; unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */ unsigned long failsafe_callback_eip; +#else + unsigned long event_callback_eip; + unsigned long failsafe_callback_eip; + unsigned long syscall_callback_eip; +#endif unsigned long vm_assist; /* VMASST_TYPE_* bitmap */ +#ifdef __x86_64__ + /* Segment base addresses. */ + uint64_t fs_base; + uint64_t gs_base_kernel; + uint64_t gs_base_user; +#endif }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); - -struct arch_shared_info { - unsigned long max_pfn; /* max pfn that appears in table */ - /* Frame containing list of mfns containing list of mfns containing p2m. */ - unsigned long pfn_to_mfn_frame_list_list; - unsigned long nmi_reason; -}; - -struct arch_vcpu_info { - unsigned long cr2; - unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */ -}; - -struct xen_callback { - unsigned long cs; - unsigned long eip; -}; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLY__ */ /* * Prefix forces emulation of some non-trapping instructions. @@ -213,4 +172,4 @@ struct xen_callback { #define XEN_CPUID XEN_EMULATE_PREFIX "cpuid" #endif -#endif +#endif /* __ASM_X86_XEN_INTERFACE_H */ diff --git a/include/asm-x86/xen/interface_32.h b/include/asm-x86/xen/interface_32.h new file mode 100644 index 000000000000..d8ac41d5db86 --- /dev/null +++ b/include/asm-x86/xen/interface_32.h @@ -0,0 +1,97 @@ +/****************************************************************************** + * arch-x86_32.h + * + * Guest OS interface to x86 32-bit Xen. + * + * Copyright (c) 2004, K A Fraser + */ + +#ifndef __ASM_X86_XEN_INTERFACE_32_H +#define __ASM_X86_XEN_INTERFACE_32_H + + +/* + * These flat segments are in the Xen-private section of every GDT. Since these + * are also present in the initial GDT, many OSes will be able to avoid + * installing their own GDT. + */ +#define FLAT_RING1_CS 0xe019 /* GDT index 259 */ +#define FLAT_RING1_DS 0xe021 /* GDT index 260 */ +#define FLAT_RING1_SS 0xe021 /* GDT index 260 */ +#define FLAT_RING3_CS 0xe02b /* GDT index 261 */ +#define FLAT_RING3_DS 0xe033 /* GDT index 262 */ +#define FLAT_RING3_SS 0xe033 /* GDT index 262 */ + +#define FLAT_KERNEL_CS FLAT_RING1_CS +#define FLAT_KERNEL_DS FLAT_RING1_DS +#define FLAT_KERNEL_SS FLAT_RING1_SS +#define FLAT_USER_CS FLAT_RING3_CS +#define FLAT_USER_DS FLAT_RING3_DS +#define FLAT_USER_SS FLAT_RING3_SS + +/* And the trap vector is... */ +#define TRAP_INSTR "int $0x82" + +/* + * Virtual addresses beyond this are not modifiable by guest OSes. The + * machine->physical mapping table starts at this address, read-only. + */ +#define __HYPERVISOR_VIRT_START 0xF5800000 + +#ifndef __ASSEMBLY__ + +struct cpu_user_regs { + uint32_t ebx; + uint32_t ecx; + uint32_t edx; + uint32_t esi; + uint32_t edi; + uint32_t ebp; + uint32_t eax; + uint16_t error_code; /* private */ + uint16_t entry_vector; /* private */ + uint32_t eip; + uint16_t cs; + uint8_t saved_upcall_mask; + uint8_t _pad0; + uint32_t eflags; /* eflags.IF == !saved_upcall_mask */ + uint32_t esp; + uint16_t ss, _pad1; + uint16_t es, _pad2; + uint16_t ds, _pad3; + uint16_t fs, _pad4; + uint16_t gs, _pad5; +}; +DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs); + +typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */ + +struct arch_vcpu_info { + unsigned long cr2; + unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */ +}; + +struct xen_callback { + unsigned long cs; + unsigned long eip; +}; +typedef struct xen_callback xen_callback_t; + +#define XEN_CALLBACK(__cs, __eip) \ + ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) }) +#endif /* !__ASSEMBLY__ */ + + +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. + * + * Note that Xen is using the fact that the pagetable base is always + * page-aligned, and putting the 12 MSB of the address into the 12 LSB + * of cr3. + */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) + +#endif /* __ASM_X86_XEN_INTERFACE_32_H */ diff --git a/include/asm-x86/xen/interface_64.h b/include/asm-x86/xen/interface_64.h new file mode 100644 index 000000000000..842266ce96e6 --- /dev/null +++ b/include/asm-x86/xen/interface_64.h @@ -0,0 +1,159 @@ +#ifndef __ASM_X86_XEN_INTERFACE_64_H +#define __ASM_X86_XEN_INTERFACE_64_H + +/* + * 64-bit segment selectors + * These flat segments are in the Xen-private section of every GDT. Since these + * are also present in the initial GDT, many OSes will be able to avoid + * installing their own GDT. + */ + +#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */ +#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */ +#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */ +#define FLAT_RING3_DS64 0x0000 /* NULL selector */ +#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */ +#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */ + +#define FLAT_KERNEL_DS64 FLAT_RING3_DS64 +#define FLAT_KERNEL_DS32 FLAT_RING3_DS32 +#define FLAT_KERNEL_DS FLAT_KERNEL_DS64 +#define FLAT_KERNEL_CS64 FLAT_RING3_CS64 +#define FLAT_KERNEL_CS32 FLAT_RING3_CS32 +#define FLAT_KERNEL_CS FLAT_KERNEL_CS64 +#define FLAT_KERNEL_SS64 FLAT_RING3_SS64 +#define FLAT_KERNEL_SS32 FLAT_RING3_SS32 +#define FLAT_KERNEL_SS FLAT_KERNEL_SS64 + +#define FLAT_USER_DS64 FLAT_RING3_DS64 +#define FLAT_USER_DS32 FLAT_RING3_DS32 +#define FLAT_USER_DS FLAT_USER_DS64 +#define FLAT_USER_CS64 FLAT_RING3_CS64 +#define FLAT_USER_CS32 FLAT_RING3_CS32 +#define FLAT_USER_CS FLAT_USER_CS64 +#define FLAT_USER_SS64 FLAT_RING3_SS64 +#define FLAT_USER_SS32 FLAT_RING3_SS32 +#define FLAT_USER_SS FLAT_USER_SS64 + +#define __HYPERVISOR_VIRT_START 0xFFFF800000000000 +#define __HYPERVISOR_VIRT_END 0xFFFF880000000000 +#define __MACH2PHYS_VIRT_START 0xFFFF800000000000 +#define __MACH2PHYS_VIRT_END 0xFFFF804000000000 + +#ifndef HYPERVISOR_VIRT_START +#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) +#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END) +#endif + +#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START) +#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END) +#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3) +#ifndef machine_to_phys_mapping +#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +#endif + +/* + * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) + * @which == SEGBASE_* ; @base == 64-bit base address + * Returns 0 on success. + */ +#define SEGBASE_FS 0 +#define SEGBASE_GS_USER 1 +#define SEGBASE_GS_KERNEL 2 +#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */ + +/* + * int HYPERVISOR_iret(void) + * All arguments are on the kernel stack, in the following format. + * Never returns if successful. Current kernel context is lost. + * The saved CS is mapped as follows: + * RING0 -> RING3 kernel mode. + * RING1 -> RING3 kernel mode. + * RING2 -> RING3 kernel mode. + * RING3 -> RING3 user mode. + * However RING0 indicates that the guest kernel should return to iteself + * directly with + * orb $3,1*8(%rsp) + * iretq + * If flags contains VGCF_in_syscall: + * Restore RAX, RIP, RFLAGS, RSP. + * Discard R11, RCX, CS, SS. + * Otherwise: + * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP. + * All other registers are saved on hypercall entry and restored to user. + */ +/* Guest exited in SYSCALL context? Return to guest with SYSRET? */ +#define _VGCF_in_syscall 8 +#define VGCF_in_syscall (1<<_VGCF_in_syscall) +#define VGCF_IN_SYSCALL VGCF_in_syscall + +#ifndef __ASSEMBLY__ + +struct iret_context { + /* Top of stack (%rsp at point of hypercall). */ + uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; + /* Bottom of iret stack frame. */ +}; + +#if defined(__GNUC__) && !defined(__STRICT_ANSI__) +/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */ +#define __DECL_REG(name) union { \ + uint64_t r ## name, e ## name; \ + uint32_t _e ## name; \ +} +#else +/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */ +#define __DECL_REG(name) uint64_t r ## name +#endif + +struct cpu_user_regs { + uint64_t r15; + uint64_t r14; + uint64_t r13; + uint64_t r12; + __DECL_REG(bp); + __DECL_REG(bx); + uint64_t r11; + uint64_t r10; + uint64_t r9; + uint64_t r8; + __DECL_REG(ax); + __DECL_REG(cx); + __DECL_REG(dx); + __DECL_REG(si); + __DECL_REG(di); + uint32_t error_code; /* private */ + uint32_t entry_vector; /* private */ + __DECL_REG(ip); + uint16_t cs, _pad0[1]; + uint8_t saved_upcall_mask; + uint8_t _pad1[3]; + __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */ + __DECL_REG(sp); + uint16_t ss, _pad2[3]; + uint16_t es, _pad3[3]; + uint16_t ds, _pad4[3]; + uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */ + uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */ +}; +DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs); + +#undef __DECL_REG + +#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12) +#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12) + +struct arch_vcpu_info { + unsigned long cr2; + unsigned long pad; /* sizeof(vcpu_info_t) == 64 */ +}; + +typedef unsigned long xen_callback_t; + +#define XEN_CALLBACK(__cs, __rip) \ + ((unsigned long)(__rip)) + +#endif /* !__ASSEMBLY__ */ + + +#endif /* __ASM_X86_XEN_INTERFACE_64_H */ diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h index 377c04591c15..05e678a86628 100644 --- a/include/asm-x86/xen/page.h +++ b/include/asm-x86/xen/page.h @@ -148,13 +148,17 @@ static inline pte_t __pte_ma(pteval_t x) } #define pmd_val_ma(v) ((v).pmd) +#ifdef __PAGETABLE_PUD_FOLDED #define pud_val_ma(v) ((v).pgd.pgd) +#else +#define pud_val_ma(v) ((v).pud) +#endif #define __pmd_ma(x) ((pmd_t) { (x) } ) #define pgd_val_ma(x) ((x).pgd) -xmaddr_t arbitrary_virt_to_machine(unsigned long address); +xmaddr_t arbitrary_virt_to_machine(void *address); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h index 98b79bc404dd..c3adde32669b 100644 --- a/include/xen/hvc-console.h +++ b/include/xen/hvc-console.h @@ -5,11 +5,12 @@ extern struct console xenboot_console; #ifdef CONFIG_HVC_XEN void xen_console_resume(void); +void xen_raw_console_write(const char *str); +void xen_raw_printk(const char *fmt, ...); #else static inline void xen_console_resume(void) { } +static inline void xen_raw_console_write(const char *str) { } +static inline void xen_raw_printk(const char *fmt, ...) { } #endif -void xen_raw_console_write(const char *str); -void xen_raw_printk(const char *fmt, ...); - #endif /* XEN_HVC_CONSOLE_H */ diff --git a/include/xen/interface/callback.h b/include/xen/interface/callback.h index 4aadcba31af9..2ae3cd243264 100644 --- a/include/xen/interface/callback.h +++ b/include/xen/interface/callback.h @@ -82,9 +82,9 @@ */ #define CALLBACKOP_register 0 struct callback_register { - uint16_t type; - uint16_t flags; - struct xen_callback address; + uint16_t type; + uint16_t flags; + xen_callback_t address; }; /* diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index a706d6a78960..883a21bba24b 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -11,4 +11,7 @@ void xen_post_suspend(int suspend_cancelled); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); +void xen_timer_resume(void); +void xen_arch_resume(void); + #endif /* INCLUDE_XEN_OPS_H */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index b45da40e8d25..59dfdf1e1d20 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -82,7 +82,7 @@ config PM_SLEEP_SMP config PM_SLEEP bool - depends on SUSPEND || HIBERNATION + depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y config SUSPEND |