14 files changed, 585 insertions, 426 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 2e641be2737e..c2cc99580871 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,9 +5,20 @@
 config XEN
 	bool "Xen guest support"
 	select PARAVIRT
+	select PARAVIRT_CLOCK
 	depends on X86_32
-	depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER)
+	depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
 	help
 	  This is the Linux Xen port.  Enabling this will allow the
 	  kernel to boot in a paravirtualized environment under the
 	  Xen hypervisor.
+
+config XEN_MAX_DOMAIN_MEMORY
+       int "Maximum allowed size of a domain in gigabytes"
+       default 8
+       depends on XEN
+       help
+         The pseudo-physical to machine address array is sized
+         according to the maximum possible memory size of a Xen
+         domain.  This array uses 1 page per gigabyte, so there's no
+         need to be too stingy here.
+\ No newline at end of file
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3d8df981d5fd..2ba2d1649131 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
 obj-y		:= enlighten.o setup.o multicalls.o mmu.o \
-			time.o manage.o xen-asm.o grant-table.o
+			time.o xen-asm.o grant-table.o suspend.o
 
 obj-$(CONFIG_SMP)	+= smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c8a56e457d61..bd74229081c3 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -75,13 +75,13 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
 
-static /* __initdata */ struct shared_info dummy_shared_info;
+struct shared_info xen_dummy_shared_info;
 
 /*
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
  */
-struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
 
 /*
  * Flag to determine whether vcpu info placement is available on all
@@ -98,13 +98,13 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info;
  */
 static int have_vcpu_info_placement = 1;
 
-static void __init xen_vcpu_setup(int cpu)
+static void xen_vcpu_setup(int cpu)
 {
 	struct vcpu_register_vcpu_info info;
 	int err;
 	struct vcpu_info *vcpup;
 
-	BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info);
+	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 	per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
 
 	if (!have_vcpu_info_placement)
@@ -136,11 +136,41 @@ static void __init xen_vcpu_setup(int cpu)
 	}
 }
 
+/*
+ * On restore, set the vcpu placement up again.
+ * If it fails, then we're in a bad state, since
+ * we can't back out from using it...
+ */
+void xen_vcpu_restore(void)
+{
+	if (have_vcpu_info_placement) {
+		int cpu;
+
+		for_each_online_cpu(cpu) {
+			bool other_cpu = (cpu != smp_processor_id());
+
+			if (other_cpu &&
+			    HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
+				BUG();
+
+			xen_vcpu_setup(cpu);
+
+			if (other_cpu &&
+			    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
+				BUG();
+		}
+
+		BUG_ON(!have_vcpu_info_placement);
+	}
+}
+
 static void __init xen_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
 	       pv_info.name);
-	printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+	printk(KERN_INFO "Hypervisor signature: %s%s\n",
+	       xen_start_info->magic,
+	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 }
 
 static void xen_cpuid(unsigned int *ax, unsigned int *bx,
@@ -235,13 +265,13 @@ static void xen_irq_enable(void)
 {
 	struct vcpu_info *vcpu;
 
-	/* There's a one instruction preempt window here.  We need to
-	   make sure we're don't switch CPUs between getting the vcpu
-	   pointer and updating the mask. */
-	preempt_disable();
+	/* We don't need to worry about being preempted here, since
+	   either a) interrupts are disabled, so no preemption, or b)
+	   the caller is confused and is trying to re-enable interrupts
+	   on an indeterminate processor. */
+
 	vcpu = x86_read_percpu(xen_vcpu);
 	vcpu->evtchn_upcall_mask = 0;
-	preempt_enable_no_resched();
 
 	/* Doesn't matter if we get preempted here, because any
 	   pending event will get dealt with anyway. */
@@ -254,7 +284,7 @@ static void xen_irq_enable(void)
 static void xen_safe_halt(void)
 {
 	/* Blocking includes an implicit local_irq_enable(). */
-	if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
+	if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
 		BUG();
 }
 
@@ -607,6 +637,30 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
 
+static void xen_clts(void)
+{
+	struct multicall_space mcs;
+
+	mcs = xen_mc_entry(0);
+
+	MULTI_fpu_taskswitch(mcs.mc, 0);
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
+static void xen_write_cr0(unsigned long cr0)
+{
+	struct multicall_space mcs;
+
+	/* Only pay attention to cr0.TS; everything else is
+	   ignored. */
+	mcs = xen_mc_entry(0);
+
+	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+}
+
 static void xen_write_cr2(unsigned long cr2)
 {
 	x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
@@ -624,8 +678,10 @@ static unsigned long xen_read_cr2_direct(void)
 
 static void xen_write_cr4(unsigned long cr4)
 {
-	/* Just ignore cr4 changes; Xen doesn't allow us to do
-	   anything anyway. */
+	cr4 &= ~X86_CR4_PGE;
+	cr4 &= ~X86_CR4_PSE;
+
+	native_write_cr4(cr4);
 }
 
 static unsigned long xen_read_cr3(void)
@@ -785,38 +841,35 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
 static __init void xen_pagetable_setup_start(pgd_t *base)
 {
 	pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+	int i;
 
 	/* special set_pte for pagetable initialization */
 	pv_mmu_ops.set_pte = xen_set_pte_init;
 
 	init_mm.pgd = base;
 	/*
-	 * copy top-level of Xen-supplied pagetable into place.	 For
-	 * !PAE we can use this as-is, but for PAE it is a stand-in
-	 * while we copy the pmd pages.
+	 * copy top-level of Xen-supplied pagetable into place.  This
+	 * is a stand-in while we copy the pmd pages.
 	 */
 	memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
 
-	if (PTRS_PER_PMD > 1) {
-		int i;
-		/*
-		 * For PAE, need to allocate new pmds, rather than
-		 * share Xen's, since Xen doesn't like pmd's being
-		 * shared between address spaces.
-		 */
-		for (i = 0; i < PTRS_PER_PGD; i++) {
-			if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
-				pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
+	/*
+	 * For PAE, need to allocate new pmds, rather than
+	 * share Xen's, since Xen doesn't like pmd's being
+	 * shared between address spaces.
+	 */
+	for (i = 0; i < PTRS_PER_PGD; i++) {
+		if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+			pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
 
-				memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
-				       PAGE_SIZE);
+			memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+			       PAGE_SIZE);
 
-				make_lowmem_page_readonly(pmd);
+			make_lowmem_page_readonly(pmd);
 
-				set_pgd(&base[i], __pgd(1 + __pa(pmd)));
-			} else
-				pgd_clear(&base[i]);
-		}
+			set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+		} else
+			pgd_clear(&base[i]);
 	}
 
 	/* make sure zero_page is mapped RO so we can use it in pagetables */
@@ -834,7 +887,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
 			  PFN_DOWN(__pa(xen_start_info->pt_base)));
 }
 
-static __init void setup_shared_info(void)
+void xen_setup_shared_info(void)
 {
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
 		unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
@@ -857,6 +910,8 @@ static __init void setup_shared_info(void)
 	/* In UP this is as good a place as any to set up shared info */
 	xen_setup_vcpu_info_placement();
 #endif
+
+	xen_setup_mfn_list_list();
 }
 
 static __init void xen_pagetable_setup_done(pgd_t *base)
@@ -869,25 +924,23 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
 	pv_mmu_ops.release_pmd = xen_release_pmd;
 	pv_mmu_ops.set_pte = xen_set_pte;
 
-	setup_shared_info();
+	xen_setup_shared_info();
 
 	/* Actually pin the pagetable down, but we can't set PG_pinned
 	   yet because the page structures don't exist yet. */
-	{
-		unsigned level;
+	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
+}
 
-#ifdef CONFIG_X86_PAE
-		level = MMUEXT_PIN_L3_TABLE;
-#else
-		level = MMUEXT_PIN_L2_TABLE;
-#endif
+static __init void xen_post_allocator_init(void)
+{
+	pv_mmu_ops.set_pmd = xen_set_pmd;
+	pv_mmu_ops.set_pud = xen_set_pud;
 
-		pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
-	}
+	xen_mark_init_mm_pinned();
 }
 
 /* This is called once we have the cpu_possible_map */
-void __init xen_setup_vcpu_info_placement(void)
+void xen_setup_vcpu_info_placement(void)
 {
 	int cpu;
 
@@ -973,7 +1026,7 @@ static const struct pv_init_ops xen_init_ops __initdata = {
 	.banner = xen_banner,
 	.memory_setup = xen_memory_setup,
 	.arch_setup = xen_arch_setup,
-	.post_allocator_init = xen_mark_init_mm_pinned,
+	.post_allocator_init = xen_post_allocator_init,
 };
 
 static const struct pv_time_ops xen_time_ops __initdata = {
@@ -991,10 +1044,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.set_debugreg = xen_set_debugreg,
 	.get_debugreg = xen_get_debugreg,
 
-	.clts = native_clts,
+	.clts = xen_clts,
 
 	.read_cr0 = native_read_cr0,
-	.write_cr0 = native_write_cr0,
+	.write_cr0 = xen_write_cr0,
 
 	.read_cr4 = native_read_cr4,
 	.read_cr4_safe = native_read_cr4_safe,
@@ -1085,24 +1138,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 	.set_pte = NULL,	/* see xen_pagetable_setup_* */
 	.set_pte_at = xen_set_pte_at,
-	.set_pmd = xen_set_pmd,
+	.set_pmd = xen_set_pmd_hyper,
+
+	.ptep_modify_prot_start = __ptep_modify_prot_start,
+	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
 
 	.pte_val = xen_pte_val,
+	.pte_flags = native_pte_val,
 	.pgd_val = xen_pgd_val,
 
 	.make_pte = xen_make_pte,
 	.make_pgd = xen_make_pgd,
 
-#ifdef CONFIG_X86_PAE
 	.set_pte_atomic = xen_set_pte_atomic,
 	.set_pte_present = xen_set_pte_at,
-	.set_pud = xen_set_pud,
+	.set_pud = xen_set_pud_hyper,
 	.pte_clear = xen_pte_clear,
 	.pmd_clear = xen_pmd_clear,
 
 	.make_pmd = xen_make_pmd,
 	.pmd_val = xen_pmd_val,
-#endif	/* PAE */
 
 	.activate_mm = xen_activate_mm,
 	.dup_mmap = xen_dup_mmap,
@@ -1129,11 +1184,13 @@ static const struct smp_ops xen_smp_ops __initdata = {
 
 static void xen_reboot(int reason)
 {
+	struct sched_shutdown r = { .reason = reason };
+
 #ifdef CONFIG_SMP
 	smp_send_stop();
 #endif
 
-	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason))
+	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
 		BUG();
 }
 
@@ -1188,6 +1245,8 @@ asmlinkage void __init xen_start_kernel(void)
 
 	BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
 
+	xen_setup_features();
+
 	/* Install Xen paravirt ops */
 	pv_info = xen_info;
 	pv_init_ops = xen_init_ops;
@@ -1197,17 +1256,20 @@ asmlinkage void __init xen_start_kernel(void)
 	pv_apic_ops = xen_apic_ops;
 	pv_mmu_ops = xen_mmu_ops;
 
+	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
+		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
+		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
+	}
+
 	machine_ops = xen_machine_ops;
 
 #ifdef CONFIG_SMP
 	smp_ops = xen_smp_ops;
 #endif
 
-	xen_setup_features();
-
 	/* Get mfn list */
 	if (!xen_feature(XENFEAT_auto_translated_physmap))
-		phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list;
+		xen_build_dynamic_phys_to_machine();
 
 	pgd = (pgd_t *)xen_start_info->pt_base;
 
@@ -1228,6 +1290,11 @@ asmlinkage void __init xen_start_kernel(void)
 	if (xen_feature(XENFEAT_supervisor_mode_kernel))
 		pv_info.kernel_rpl = 0;
 
+	/* Prevent unwanted bits from being set in PTEs. */
+	__supported_pte_mask &= ~_PAGE_GLOBAL;
+	if (!is_initial_xendomain())
+		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
+
 	/* set the limit of our address space */
 	xen_reserve_top();
 
@@ -1242,8 +1309,11 @@ asmlinkage void __init xen_start_kernel(void)
 		? __pa(xen_start_info->mod_start) : 0;
 	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
 
-	if (!is_initial_xendomain())
+	if (!is_initial_xendomain()) {
+		add_preferred_console("xenboot", 0, NULL);
+		add_preferred_console("tty", 0, NULL);
 		add_preferred_console("hvc", 0, NULL);
+	}
 
 	/* Start the world */
 	start_kernel();
diff --git a/arch/x86/xen/manage.c b/arch/x86/xen/manage.c
deleted file mode 100644
index aa7af9e6abc0..000000000000
--- a/arch/x86/xen/manage.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Handle extern requests for shutdown, reboot and sysrq
- */
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <linux/reboot.h>
-#include <linux/sysrq.h>
-
-#include <xen/xenbus.h>
-
-#define SHUTDOWN_INVALID  -1
-#define SHUTDOWN_POWEROFF  0
-#define SHUTDOWN_SUSPEND   2
-/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only
- * report a crash, not be instructed to crash!
- * HALT is the same as POWEROFF, as far as we're concerned.  The tools use
- * the distinction when we return the reason code to them.
- */
-#define SHUTDOWN_HALT      4
-
-/* Ignore multiple shutdown requests. */
-static int shutting_down = SHUTDOWN_INVALID;
-
-static void shutdown_handler(struct xenbus_watch *watch,
-			     const char **vec, unsigned int len)
-{
-	char *str;
-	struct xenbus_transaction xbt;
-	int err;
-
-	if (shutting_down != SHUTDOWN_INVALID)
-		return;
-
- again:
-	err = xenbus_transaction_start(&xbt);
-	if (err)
-		return;
-
-	str = (char *)xenbus_read(xbt, "control", "shutdown", NULL);
-	/* Ignore read errors and empty reads. */
-	if (XENBUS_IS_ERR_READ(str)) {
-		xenbus_transaction_end(xbt, 1);
-		return;
-	}
-
-	xenbus_write(xbt, "control", "shutdown", "");
-
-	err = xenbus_transaction_end(xbt, 0);
-	if (err == -EAGAIN) {
-		kfree(str);
-		goto again;
-	}
-
-	if (strcmp(str, "poweroff") == 0 ||
-	    strcmp(str, "halt") == 0)
-		orderly_poweroff(false);
-	else if (strcmp(str, "reboot") == 0)
-		ctrl_alt_del();
-	else {
-		printk(KERN_INFO "Ignoring shutdown request: %s\n", str);
-		shutting_down = SHUTDOWN_INVALID;
-	}
-
-	kfree(str);
-}
-
-static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
-			  unsigned int len)
-{
-	char sysrq_key = '\0';
-	struct xenbus_transaction xbt;
-	int err;
-
- again:
-	err = xenbus_transaction_start(&xbt);
-	if (err)
-		return;
-	if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) {
-		printk(KERN_ERR "Unable to read sysrq code in "
-		       "control/sysrq\n");
-		xenbus_transaction_end(xbt, 1);
-		return;
-	}
-
-	if (sysrq_key != '\0')
-		xenbus_printf(xbt, "control", "sysrq", "%c", '\0');
-
-	err = xenbus_transaction_end(xbt, 0);
-	if (err == -EAGAIN)
-		goto again;
-
-	if (sysrq_key != '\0')
-		handle_sysrq(sysrq_key, NULL);
-}
-
-static struct xenbus_watch shutdown_watch = {
-	.node = "control/shutdown",
-	.callback = shutdown_handler
-};
-
-static struct xenbus_watch sysrq_watch = {
-	.node = "control/sysrq",
-	.callback = sysrq_handler
-};
-
-static int setup_shutdown_watcher(void)
-{
-	int err;
-
-	err = register_xenbus_watch(&shutdown_watch);
-	if (err) {
-		printk(KERN_ERR "Failed to set shutdown watcher\n");
-		return err;
-	}
-
-	err = register_xenbus_watch(&sysrq_watch);
-	if (err) {
-		printk(KERN_ERR "Failed to set sysrq watcher\n");
-		return err;
-	}
-
-	return 0;
-}
-
-static int shutdown_event(struct notifier_block *notifier,
-			  unsigned long event,
-			  void *data)
-{
-	setup_shutdown_watcher();
-	return NOTIFY_DONE;
-}
-
-static int __init setup_shutdown_event(void)
-{
-	static struct notifier_block xenstore_notifier = {
-		.notifier_call = shutdown_event
-	};
-	register_xenstore_notifier(&xenstore_notifier);
-
-	return 0;
-}
-
-subsys_initcall(setup_shutdown_event);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 126766d43aea..42b3b9ed641d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -56,11 +56,136 @@
 #include "multicalls.h"
 #include "mmu.h"
 
+#define P2M_ENTRIES_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long))
+#define TOP_ENTRIES		(MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
+
+/* Placeholder for holes in the address space */
+static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE]
+	__attribute__((section(".data.page_aligned"))) =
+		{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
+
+ /* Array of pointers to pages containing p2m entries */
+static unsigned long *p2m_top[TOP_ENTRIES]
+	__attribute__((section(".data.page_aligned"))) =
+		{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
+
+/* Arrays of p2m arrays expressed in mfns used for save/restore */
+static unsigned long p2m_top_mfn[TOP_ENTRIES]
+	__attribute__((section(".bss.page_aligned")));
+
+static unsigned long p2m_top_mfn_list[
+			PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)]
+	__attribute__((section(".bss.page_aligned")));
+
+static inline unsigned p2m_top_index(unsigned long pfn)
+{
+	BUG_ON(pfn >= MAX_DOMAIN_PAGES);
+	return pfn / P2M_ENTRIES_PER_PAGE;
+}
+
+static inline unsigned p2m_index(unsigned long pfn)
+{
+	return pfn % P2M_ENTRIES_PER_PAGE;
+}
+
+/* Build the parallel p2m_top_mfn structures */
+void xen_setup_mfn_list_list(void)
+{
+	unsigned pfn, idx;
+
+	for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
+		unsigned topidx = p2m_top_index(pfn);
+
+		p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
+	}
+
+	for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
+		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
+		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
+	}
+
+	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+		virt_to_mfn(p2m_top_mfn_list);
+	HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
+}
+
+/* Set up p2m_top to point to the domain-builder provided p2m pages */
+void __init xen_build_dynamic_phys_to_machine(void)
+{
+	unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
+	unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+	unsigned pfn;
+
+	for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+		unsigned topidx = p2m_top_index(pfn);
+
+		p2m_top[topidx] = &mfn_list[pfn];
+	}
+}
+
+unsigned long get_phys_to_machine(unsigned long pfn)
+{
+	unsigned topidx, idx;
+
+	if (unlikely(pfn >= MAX_DOMAIN_PAGES))
+		return INVALID_P2M_ENTRY;
+
+	topidx = p2m_top_index(pfn);
+	idx = p2m_index(pfn);
+	return p2m_top[topidx][idx];
+}
+EXPORT_SYMBOL_GPL(get_phys_to_machine);
+
+static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
+{
+	unsigned long *p;
+	unsigned i;
+
+	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(p == NULL);
+
+	for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
+		p[i] = INVALID_P2M_ENTRY;
+
+	if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
+		free_page((unsigned long)p);
+	else
+		*mfnp = virt_to_mfn(p);
+}
+
+void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	unsigned topidx, idx;
+
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+		return;
+	}
+
+	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
+		BUG_ON(mfn != INVALID_P2M_ENTRY);
+		return;
+	}
+
+	topidx = p2m_top_index(pfn);
+	if (p2m_top[topidx] == p2m_missing) {
+		/* no need to allocate a page to store an invalid entry */
+		if (mfn == INVALID_P2M_ENTRY)
+			return;
+		alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
+	}
+
+	idx = p2m_index(pfn);
+	p2m_top[topidx][idx] = mfn;
+}
+
 xmaddr_t arbitrary_virt_to_machine(unsigned long address)
 {
 	unsigned int level;
 	pte_t *pte = lookup_address(address, &level);
-	unsigned offset = address & PAGE_MASK;
+	unsigned offset = address & ~PAGE_MASK;
 
 	BUG_ON(pte == NULL);
 
@@ -98,24 +223,60 @@ void make_lowmem_page_readwrite(void *vaddr)
 }
 
 
-void xen_set_pmd(pmd_t *ptr, pmd_t val)
+static bool page_pinned(void *ptr)
+{
+	struct page *page = virt_to_page(ptr);
+
+	return PagePinned(page);
+}
+
+static void extend_mmu_update(const struct mmu_update *update)
 {
 	struct multicall_space mcs;
 	struct mmu_update *u;
 
-	preempt_disable();
+	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
+
+	if (mcs.mc != NULL)
+		mcs.mc->args[1]++;
+	else {
+		mcs = __xen_mc_entry(sizeof(*u));
+		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+	}
 
-	mcs = xen_mc_entry(sizeof(*u));
 	u = mcs.args;
-	u->ptr = virt_to_machine(ptr).maddr;
-	u->val = pmd_val_ma(val);
-	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+	*u = *update;
+}
+
+void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
+{
+	struct mmu_update u;
+
+	preempt_disable();
+
+	xen_mc_batch();
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pmd_val_ma(val);
+	extend_mmu_update(&u);
 
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 
 	preempt_enable();
 }
 
+void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+	/* If page is not pinned, we can just update the entry
+	   directly */
+	if (!page_pinned(ptr)) {
+		*ptr = val;
+		return;
+	}
+
+	xen_set_pmd_hyper(ptr, val);
+}
+
 /*
  * Associate a virtual page frame with a given physical page frame
  * and protection flags for that frame.
@@ -179,68 +340,105 @@ out:
 		preempt_enable();
 }
 
-pteval_t xen_pte_val(pte_t pte)
+pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	pteval_t ret = pte.pte;
+	/* Just return the pte as-is.  We preserve the bits on commit */
+	return *ptep;
+}
+
+void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+				 pte_t *ptep, pte_t pte)
+{
+	struct mmu_update u;
+
+	xen_mc_batch();
 
-	if (ret & _PAGE_PRESENT)
-		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
+	u.val = pte_val_ma(pte);
+	extend_mmu_update(&u);
 
-	return ret;
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
 
-pgdval_t xen_pgd_val(pgd_t pgd)
+/* Assume pteval_t is equivalent to all the other *val_t types. */
+static pteval_t pte_mfn_to_pfn(pteval_t val)
 {
-	pgdval_t ret = pgd.pgd;
-	if (ret & _PAGE_PRESENT)
-		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
-	return ret;
+	if (val & _PAGE_PRESENT) {
+		unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT;
+		pteval_t flags = val & ~PTE_MASK;
+		val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
+	}
+
+	return val;
 }
 
-pte_t xen_make_pte(pteval_t pte)
+static pteval_t pte_pfn_to_mfn(pteval_t val)
 {
-	if (pte & _PAGE_PRESENT) {
-		pte = phys_to_machine(XPADDR(pte)).maddr;
-		pte &= ~(_PAGE_PCD | _PAGE_PWT);
+	if (val & _PAGE_PRESENT) {
+		unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT;
+		pteval_t flags = val & ~PTE_MASK;
+		val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
 	}
 
-	return (pte_t){ .pte = pte };
+	return val;
 }
 
-pgd_t xen_make_pgd(pgdval_t pgd)
+pteval_t xen_pte_val(pte_t pte)
 {
-	if (pgd & _PAGE_PRESENT)
-		pgd = phys_to_machine(XPADDR(pgd)).maddr;
+	return pte_mfn_to_pfn(pte.pte);
+}
 
-	return (pgd_t){ pgd };
+pgdval_t xen_pgd_val(pgd_t pgd)
+{
+	return pte_mfn_to_pfn(pgd.pgd);
+}
+
+pte_t xen_make_pte(pteval_t pte)
+{
+	pte = pte_pfn_to_mfn(pte);
+	return native_make_pte(pte);
+}
+
+pgd_t xen_make_pgd(pgdval_t pgd)
+{
+	pgd = pte_pfn_to_mfn(pgd);
+	return native_make_pgd(pgd);
 }
 
 pmdval_t xen_pmd_val(pmd_t pmd)
 {
-	pmdval_t ret = native_pmd_val(pmd);
-	if (ret & _PAGE_PRESENT)
-		ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
-	return ret;
+	return pte_mfn_to_pfn(pmd.pmd);
 }
-#ifdef CONFIG_X86_PAE
-void xen_set_pud(pud_t *ptr, pud_t val)
+
+void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 {
-	struct multicall_space mcs;
-	struct mmu_update *u;
+	struct mmu_update u;
 
 	preempt_disable();
 
-	mcs = xen_mc_entry(sizeof(*u));
-	u = mcs.args;
-	u->ptr = virt_to_machine(ptr).maddr;
-	u->val = pud_val_ma(val);
-	MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF);
+	xen_mc_batch();
+
+	u.ptr = virt_to_machine(ptr).maddr;
+	u.val = pud_val_ma(val);
+	extend_mmu_update(&u);
 
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 
 	preempt_enable();
 }
 
+void xen_set_pud(pud_t *ptr, pud_t val)
+{
+	/* If page is not pinned, we can just update the entry
+	   directly */
+	if (!page_pinned(ptr)) {
+		*ptr = val;
+		return;
+	}
+
+	xen_set_pud_hyper(ptr, val);
+}
+
 void xen_set_pte(pte_t *ptep, pte_t pte)
 {
 	ptep->pte_high = pte.pte_high;
@@ -262,22 +460,14 @@ void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 
 void xen_pmd_clear(pmd_t *pmdp)
 {
-	xen_set_pmd(pmdp, __pmd(0));
+	set_pmd(pmdp, __pmd(0));
 }
 
 pmd_t xen_make_pmd(pmdval_t pmd)
 {
-	if (pmd & _PAGE_PRESENT)
-		pmd = phys_to_machine(XPADDR(pmd)).maddr;
-
+	pmd = pte_pfn_to_mfn(pmd);
 	return native_make_pmd(pmd);
 }
-#else  /* !PAE */
-void xen_set_pte(pte_t *ptep, pte_t pte)
-{
-	*ptep = pte;
-}
-#endif	/* CONFIG_X86_PAE */
 
 /*
   (Yet another) pagetable walker.  This one is intended for pinning a
@@ -430,8 +620,6 @@ static int pin_page(struct page *page, enum pt_level level)
    read-only, and can be pinned. */
 void xen_pgd_pin(pgd_t *pgd)
 {
-	unsigned level;
-
 	xen_mc_batch();
 
 	if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
@@ -441,15 +629,31 @@ void xen_pgd_pin(pgd_t *pgd)
 		xen_mc_batch();
 	}
 
-#ifdef CONFIG_X86_PAE
-	level = MMUEXT_PIN_L3_TABLE;
-#else
-	level = MMUEXT_PIN_L2_TABLE;
-#endif
+	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+	xen_mc_issue(0);
+}
+
+/*
+ * On save, we need to pin all pagetables to make sure they get their
+ * mfns turned into pfns.  Search the list for any unpinned pgds and pin
+ * them (unpinned pgds are not currently in use, probably because the
+ * process is under construction or destruction).
+ */
+void xen_mm_pin_all(void)
+{
+	unsigned long flags;
+	struct page *page;
 
-	xen_do_pin(level, PFN_DOWN(__pa(pgd)));
+	spin_lock_irqsave(&pgd_lock, flags);
 
-	xen_mc_issue(0);
+	list_for_each_entry(page, &pgd_list, lru) {
+		if (!PagePinned(page)) {
+			xen_pgd_pin((pgd_t *)page_address(page));
+			SetPageSavePinned(page);
+		}
+	}
+
+	spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
 /* The init_mm pagetable is really pinned as soon as its created, but
@@ -509,6 +713,29 @@ static void xen_pgd_unpin(pgd_t *pgd)
 	xen_mc_issue(0);
 }
 
+/*
+ * On resume, undo any pinning done at save, so that the rest of the
+ * kernel doesn't see any unexpected pinned pagetables.
+ */
+void xen_mm_unpin_all(void)
+{
+	unsigned long flags;
+	struct page *page;
+
+	spin_lock_irqsave(&pgd_lock, flags);
+
+	list_for_each_entry(page, &pgd_list, lru) {
+		if (PageSavePinned(page)) {
+			BUG_ON(!PagePinned(page));
+			printk("unpinning pinned %p\n", page_address(page));
+			xen_pgd_unpin((pgd_t *)page_address(page));
+			ClearPageSavePinned(page);
+		}
+	}
+
+	spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
 	spin_lock(&next->page_table_lock);
@@ -602,7 +829,7 @@ void xen_exit_mmap(struct mm_struct *mm)
 	spin_lock(&mm->page_table_lock);
 
 	/* pgd may not be pinned in the error exit path of execve */
-	if (PagePinned(virt_to_page(mm->pgd)))
+	if (page_pinned(mm->pgd))
 		xen_pgd_unpin(mm->pgd);
 
 	spin_unlock(&mm->page_table_lock);
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index b5e189b1519d..297bf9f5b8bc 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -25,10 +25,6 @@ enum pt_level {
 
 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
-void xen_set_pte(pte_t *ptep, pte_t pteval);
-void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
-		    pte_t *ptep, pte_t pteval);
-void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
 
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
@@ -37,31 +33,27 @@ void xen_exit_mmap(struct mm_struct *mm);
 void xen_pgd_pin(pgd_t *pgd);
 //void xen_pgd_unpin(pgd_t *pgd);
 
-#ifdef CONFIG_X86_PAE
-unsigned long long xen_pte_val(pte_t);
-unsigned long long xen_pmd_val(pmd_t);
-unsigned long long xen_pgd_val(pgd_t);
+pteval_t xen_pte_val(pte_t);
+pmdval_t xen_pmd_val(pmd_t);
+pgdval_t xen_pgd_val(pgd_t);
 
-pte_t xen_make_pte(unsigned long long);
-pmd_t xen_make_pmd(unsigned long long);
-pgd_t xen_make_pgd(unsigned long long);
+pte_t xen_make_pte(pteval_t);
+pmd_t xen_make_pmd(pmdval_t);
+pgd_t xen_make_pgd(pgdval_t);
 
+void xen_set_pte(pte_t *ptep, pte_t pteval);
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval);
 void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
 void xen_set_pud(pud_t *ptr, pud_t val);
+void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
+void xen_set_pud_hyper(pud_t *ptr, pud_t val);
 void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
 void xen_pmd_clear(pmd_t *pmdp);
 
-
-#else
-unsigned long xen_pte_val(pte_t);
-unsigned long xen_pmd_val(pmd_t);
-unsigned long xen_pgd_val(pgd_t);
-
-pte_t xen_make_pte(unsigned long);
-pmd_t xen_make_pmd(unsigned long);
-pgd_t xen_make_pgd(unsigned long);
-#endif
+pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+				  pte_t *ptep, pte_t pte);
 
 #endif	/* _XEN_MMU_H */
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 5791eb2e3750..3c63c4da7ed1 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -29,14 +29,14 @@
 #define MC_DEBUG	1
 
 #define MC_BATCH	32
-#define MC_ARGS		(MC_BATCH * 16 / sizeof(u64))
+#define MC_ARGS		(MC_BATCH * 16)
 
 struct mc_buffer {
 	struct multicall_entry entries[MC_BATCH];
 #if MC_DEBUG
 	struct multicall_entry debug[MC_BATCH];
 #endif
-	u64 args[MC_ARGS];
+	unsigned char args[MC_ARGS];
 	struct callback {
 		void (*fn)(void *);
 		void *data;
@@ -107,20 +107,48 @@ struct multicall_space __xen_mc_entry(size_t args)
 {
 	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
 	struct multicall_space ret;
-	unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
+	unsigned argidx = roundup(b->argidx, sizeof(u64));
 
 	BUG_ON(preemptible());
-	BUG_ON(argspace > MC_ARGS);
+	BUG_ON(b->argidx > MC_ARGS);
 
 	if (b->mcidx == MC_BATCH ||
-	    (b->argidx + argspace) > MC_ARGS)
+	    (argidx + args) > MC_ARGS) {
 		xen_mc_flush();
+		argidx = roundup(b->argidx, sizeof(u64));
+	}
 
 	ret.mc = &b->entries[b->mcidx];
 	b->mcidx++;
+	ret.args = &b->args[argidx];
+	b->argidx = argidx + args;
+
+	BUG_ON(b->argidx > MC_ARGS);
+	return ret;
+}
+
+struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
+{
+	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+	struct multicall_space ret = { NULL, NULL };
+
+	BUG_ON(preemptible());
+	BUG_ON(b->argidx > MC_ARGS);
+
+	if (b->mcidx == 0)
+		return ret;
+
+	if (b->entries[b->mcidx - 1].op != op)
+		return ret;
+
+	if ((b->argidx + size) > MC_ARGS)
+		return ret;
+
+	ret.mc = &b->entries[b->mcidx - 1];
 	ret.args = &b->args[b->argidx];
-	b->argidx += argspace;
+	b->argidx += size;
 
+	BUG_ON(b->argidx > MC_ARGS);
 	return ret;
 }
 
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 8bae996d99a3..858938241616 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -45,4 +45,16 @@ static inline void xen_mc_issue(unsigned mode)
 /* Set up a callback to be called when the current batch is flushed */
 void xen_mc_callback(void (*fn)(void *), void *data);
 
+/*
+ * Try to extend the arguments of the previous multicall command.  The
+ * previous command's op must match.  If it does, then it attempts to
+ * extend the argument space allocated to the multicall entry by
+ * arg_size bytes.
+ *
+ * The returned multicall_space will return with mc pointing to the
+ * command on success, or NULL on failure, and args pointing to the
+ * newly allocated space.
+ */
+struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
+
 #endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 82517e4a752a..488447878a9d 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -16,6 +16,7 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
+#include <xen/page.h>
 #include <xen/interface/callback.h>
 #include <xen/interface/physdev.h>
 #include <xen/features.h>
@@ -27,8 +28,6 @@
 extern const char xen_hypervisor_callback[];
 extern const char xen_failsafe_callback[];
 
-unsigned long *phys_to_machine_mapping;
-EXPORT_SYMBOL(phys_to_machine_mapping);
 
 /**
  * machine_specific_memory_setup - Hook for machine specific memory setup.
@@ -38,6 +37,8 @@ char * __init xen_memory_setup(void)
 {
 	unsigned long max_pfn = xen_start_info->nr_pages;
 
+	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+
 	e820.nr_map = 0;
 	add_memory_region(0, LOWMEMSIZE(), E820_RAM);
 	add_memory_region(HIGH_MEMORY, PFN_PHYS(max_pfn)-HIGH_MEMORY, E820_RAM);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 94e69000f982..d2e3c20127d7 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -35,7 +35,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
-static cpumask_t xen_cpu_initialized_map;
+cpumask_t xen_cpu_initialized_map;
 static DEFINE_PER_CPU(int, resched_irq) = -1;
 static DEFINE_PER_CPU(int, callfunc_irq) = -1;
 static DEFINE_PER_CPU(int, debug_irq) = -1;
@@ -65,6 +65,12 @@ static struct call_data_struct *call_data;
  */
 static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
 {
+#ifdef CONFIG_X86_32
+	__get_cpu_var(irq_stat).irq_resched_count++;
+#else
+	add_pda(irq_resched_count, 1);
+#endif
+
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
new file mode 100644
index 000000000000..251669a932d4
--- /dev/null
+++ b/arch/x86/xen/suspend.c
@@ -0,0 +1,45 @@
+#include <linux/types.h>
+
+#include <xen/interface/xen.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+
+void xen_pre_suspend(void)
+{
+	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
+	xen_start_info->console.domU.mfn =
+		mfn_to_pfn(xen_start_info->console.domU.mfn);
+
+	BUG_ON(!irqs_disabled());
+
+	HYPERVISOR_shared_info = &xen_dummy_shared_info;
+	if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
+					 __pte_ma(0), 0))
+		BUG();
+}
+
+void xen_post_suspend(int suspend_cancelled)
+{
+	xen_setup_shared_info();
+
+	if (suspend_cancelled) {
+		xen_start_info->store_mfn =
+			pfn_to_mfn(xen_start_info->store_mfn);
+		xen_start_info->console.domU.mfn =
+			pfn_to_mfn(xen_start_info->console.domU.mfn);
+	} else {
+#ifdef CONFIG_SMP
+		xen_cpu_initialized_map = cpu_online_map;
+#endif
+		xen_vcpu_restore();
+		xen_timer_resume();
+	}
+
+}
+
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index c39e1a5aa241..64f0038b9558 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -12,7 +12,9 @@
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
 #include <linux/kernel_stat.h>
+#include <linux/math64.h>
 
+#include <asm/pvclock.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/xen/hypercall.h>
 
@@ -30,17 +32,6 @@
 
 static cycle_t xen_clocksource_read(void);
 
-/* These are perodically updated in shared_info, and then copied here. */
-struct shadow_time_info {
-	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
-	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
-	u32 tsc_to_nsec_mul;
-	int tsc_shift;
-	u32 version;
-};
-
-static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
-
 /* runstate info updated by Xen */
 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
 
@@ -150,11 +141,7 @@ static void do_stolen_accounting(void)
 	if (stolen < 0)
 		stolen = 0;
 
-	ticks = 0;
-	while (stolen >= NS_PER_TICK) {
-		ticks++;
-		stolen -= NS_PER_TICK;
-	}
+	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
 	__get_cpu_var(residual_stolen) = stolen;
 	account_steal_time(NULL, ticks);
 
@@ -166,11 +153,7 @@ static void do_stolen_accounting(void)
 	if (blocked < 0)
 		blocked = 0;
 
-	ticks = 0;
-	while (blocked >= NS_PER_TICK) {
-		ticks++;
-		blocked -= NS_PER_TICK;
-	}
+	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
 	__get_cpu_var(residual_blocked) = blocked;
 	account_steal_time(idle_task(smp_processor_id()), ticks);
 }
@@ -218,7 +201,7 @@ unsigned long long xen_sched_clock(void)
 unsigned long xen_cpu_khz(void)
 {
 	u64 xen_khz = 1000000ULL << 32;
-	const struct vcpu_time_info *info =
+	const struct pvclock_vcpu_time_info *info =
 		&HYPERVISOR_shared_info->vcpu_info[0].time;
 
 	do_div(xen_khz, info->tsc_to_system_mul);
@@ -230,121 +213,26 @@ unsigned long xen_cpu_khz(void)
 	return xen_khz;
 }
 
-/*
- * Reads a consistent set of time-base values from Xen, into a shadow data
- * area.
- */
-static unsigned get_time_values_from_xen(void)
-{
-	struct vcpu_time_info   *src;
-	struct shadow_time_info *dst;
-
-	/* src is shared memory with the hypervisor, so we need to
-	   make sure we get a consistent snapshot, even in the face of
-	   being preempted. */
-	src = &__get_cpu_var(xen_vcpu)->time;
-	dst = &__get_cpu_var(shadow_time);
-
-	do {
-		dst->version = src->version;
-		rmb();		/* fetch version before data */
-		dst->tsc_timestamp     = src->tsc_timestamp;
-		dst->system_timestamp  = src->system_time;
-		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
-		dst->tsc_shift         = src->tsc_shift;
-		rmb();		/* test version after fetching data */
-	} while ((src->version & 1) | (dst->version ^ src->version));
-
-	return dst->version;
-}
-
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
-	u64 product;
-#ifdef __i386__
-	u32 tmp1, tmp2;
-#endif
-
-	if (shift < 0)
-		delta >>= -shift;
-	else
-		delta <<= shift;
-
-#ifdef __i386__
-	__asm__ (
-		"mul  %5       ; "
-		"mov  %4,%%eax ; "
-		"mov  %%edx,%4 ; "
-		"mul  %5       ; "
-		"xor  %5,%5    ; "
-		"add  %4,%%eax ; "
-		"adc  %5,%%edx ; "
-		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
-		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif __x86_64__
-	__asm__ (
-		"mul %%rdx ; shrd $32,%%rdx,%%rax"
-		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
-	return product;
-}
-
-static u64 get_nsec_offset(struct shadow_time_info *shadow)
-{
-	u64 now, delta;
-	now = native_read_tsc();
-	delta = now - shadow->tsc_timestamp;
-	return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
-}
-
 static cycle_t xen_clocksource_read(void)
 {
-	struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+        struct pvclock_vcpu_time_info *src;
 	cycle_t ret;
-	unsigned version;
-
-	do {
-		version = get_time_values_from_xen();
-		barrier();
-		ret = shadow->system_timestamp + get_nsec_offset(shadow);
-		barrier();
-	} while (version != __get_cpu_var(xen_vcpu)->time.version);
-
-	put_cpu_var(shadow_time);
 
+	src = &get_cpu_var(xen_vcpu)->time;
+	ret = pvclock_clocksource_read(src);
+	put_cpu_var(xen_vcpu);
 	return ret;
 }
 
 static void xen_read_wallclock(struct timespec *ts)
 {
-	const struct shared_info *s = HYPERVISOR_shared_info;
-	u32 version;
-	u64 delta;
-	struct timespec now;
-
-	/* get wallclock at system boot */
-	do {
-		version = s->wc_version;
-		rmb();		/* fetch version before time */
-		now.tv_sec  = s->wc_sec;
-		now.tv_nsec = s->wc_nsec;
-		rmb();		/* fetch time before checking version */
-	} while ((s->wc_version & 1) | (version ^ s->wc_version));
-
-	delta = xen_clocksource_read();	/* time since system boot */
-	delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
-
-	now.tv_nsec = do_div(delta, NSEC_PER_SEC);
-	now.tv_sec = delta;
+	struct shared_info *s = HYPERVISOR_shared_info;
+	struct pvclock_wall_clock *wall_clock = &(s->wc);
+        struct pvclock_vcpu_time_info *vcpu_time;
 
-	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
+	vcpu_time = &get_cpu_var(xen_vcpu)->time;
+	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
+	put_cpu_var(xen_vcpu);
 }
 
 unsigned long xen_get_wallclock(void)
@@ -352,7 +240,6 @@ unsigned long xen_get_wallclock(void)
 	struct timespec ts;
 
 	xen_read_wallclock(&ts);
-
 	return ts.tv_sec;
 }
 
@@ -572,12 +459,23 @@ void xen_setup_cpu_clockevents(void)
 	clockevents_register_device(&__get_cpu_var(xen_clock_events));
 }
 
+void xen_timer_resume(void)
+{
+	int cpu;
+
+	if (xen_clockevent != &xen_vcpuop_clockevent)
+		return;
+
+	for_each_online_cpu(cpu) {
+		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+			BUG();
+	}
+}
+
 __init void xen_time_init(void)
 {
 	int cpu = smp_processor_id();
 
-	get_time_values_from_xen();
-
 	clocksource_register(&xen_clocksource);
 
 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 288d587ce73c..7c0cf6320a0a 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <asm/boot.h>
 #include <xen/interface/elfnote.h>
+#include <asm/xen/interface.h>
 
 	__INIT
 ENTRY(startup_xen)
@@ -17,7 +18,7 @@ ENTRY(startup_xen)
 
 	__FINIT
 
-.pushsection .bss.page_aligned
+.pushsection .text
 	.align PAGE_SIZE_asm
 ENTRY(hypercall_page)
 	.skip 0x1000
@@ -30,11 +31,11 @@ ENTRY(hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long  startup_xen)
 	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long  hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz "!writable_page_tables|pae_pgdir_above_4gb")
-#ifdef CONFIG_X86_PAE
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "yes")
-#else
-	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz "no")
-#endif
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz "generic")
+	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
+		.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
+	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
+	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   .long __HYPERVISOR_VIRT_START)
 
 #endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f1063ae08037..9a055592a307 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -9,18 +9,26 @@
 extern const char xen_hypervisor_callback[];
 extern const char xen_failsafe_callback[];
 
+struct trap_info;
 void xen_copy_trap_info(struct trap_info *traps);
 
 DECLARE_PER_CPU(unsigned long, xen_cr3);
 DECLARE_PER_CPU(unsigned long, xen_current_cr3);
 
 extern struct start_info *xen_start_info;
+extern struct shared_info xen_dummy_shared_info;
 extern struct shared_info *HYPERVISOR_shared_info;
 
+void xen_setup_mfn_list_list(void);
+void xen_setup_shared_info(void);
+
 char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_init_IRQ(void);
 void xen_enable_sysenter(void);
+void xen_vcpu_restore(void);
+
+void __init xen_build_dynamic_phys_to_machine(void);
 
 void xen_setup_timer(int cpu);
 void xen_setup_cpu_clockevents(void);
@@ -29,6 +37,7 @@ void __init xen_time_init(void);
 unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);
+void xen_timer_resume(void);
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
@@ -54,6 +63,8 @@ int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
 			       void *info, int wait);
 
+extern cpumask_t xen_cpu_initialized_map;
+
 
 /* Declare an asm function, along with symbols needed to make it
    inlineable */