Diffstat (limited to 'arch/x86/kernel')
34 files changed, 801 insertions, 520 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c3920ea8ac56..90e092d0af0c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,13 +22,14 @@ obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o -obj-y += pci-dma_$(BITS).o bootflag.o e820_$(BITS).o -obj-y += quirks.o i8237.o topology.o kdebugfs.o -obj-y += alternative.o i8253.o -obj-$(CONFIG_X86_64) += pci-nommu_64.o bugs_64.o +obj-y += bootflag.o e820_$(BITS).o +obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o +obj-y += alternative.o i8253.o pci-nommu.o +obj-$(CONFIG_X86_64) += bugs_64.o obj-y += tsc_$(BITS).o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-y += process.o obj-y += i387.o obj-y += ptrace.o obj-y += ds.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8ca3557a6d59..9366fb68d8d8 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -1,6 +1,4 @@ /* - * arch/i386/kernel/acpi/cstate.c - * * Copyright (C) 2005 Intel Corporation * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> * - Added _PDC for SMP C-states on Intel CPUs diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index 324eb0cab19c..de2d2e4ebad9 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -1,6 +1,4 @@ /* - * arch/i386/kernel/acpi/processor.c - * * Copyright (C) 2005 Intel Corporation * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> * - Added _PDC for platforms with Intel CPUs diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 14791ec55cfd..199e4e05e5dc 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -289,8 +289,8 @@ static int __init cpufreq_p4_init(void) if (c->x86_vendor != X86_VENDOR_INTEL) return -ENODEV; - if (!test_bit(X86_FEATURE_ACPI, c->x86_capability) || - !test_bit(X86_FEATURE_ACC, c->x86_capability)) + if (!test_cpu_cap(c, X86_FEATURE_ACPI) || + !test_cpu_cap(c, X86_FEATURE_ACC)) return -ENODEV; ret = cpufreq_register_driver(&p4clockmod_driver); diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 9b7e01daa1ca..1f4cc48c14c6 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -1,5 +1,4 @@ /* - * linux/arch/i386/kernel/cpu/mcheck/therm_throt.c * * Thermal throttle event support code (such as syslog messaging and rate * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 0240cd778365..ed733e7cf4e6 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -475,7 +475,7 @@ int __init copy_e820_map(struct e820entry *biosmap, int nr_map) /* * Find the highest page frame number we have available */ -void __init find_max_pfn(void) +void __init propagate_e820_map(void) { int i; @@ -704,7 +704,7 @@ static int __init parse_memmap(char *arg) * size before original memory map is * reset. 
*/ - find_max_pfn(); + propagate_e820_map(); saved_max_pfn = max_pfn; #endif e820.nr_map = 0; diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 7f6c0c85c8f6..cbd42e51cb08 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -96,7 +96,7 @@ void __init early_res_to_bootmem(void) } /* Check for already reserved areas */ -static inline int +static inline int __init bad_addr(unsigned long *addrp, unsigned long size, unsigned long align) { int i; @@ -116,7 +116,7 @@ again: } /* Check for already reserved areas */ -static inline int +static inline int __init bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align) { int i; diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 759e02bec070..77d424cf68b3 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -383,6 +383,7 @@ static void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; void *p; + u64 addr, npages; /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -391,7 +392,10 @@ static void __init runtime_code_page_mkexec(void) if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; - set_memory_x(md->virt_addr, md->num_pages); + addr = md->virt_addr; + npages = md->num_pages; + memrange_efi_to_native(&addr, &npages); + set_memory_x(addr, npages); } } @@ -408,7 +412,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md; efi_status_t status; unsigned long size; - u64 end, systab; + u64 end, systab, addr, npages; void *p, *va; efi.systab = NULL; @@ -420,7 +424,7 @@ void __init efi_enter_virtual_mode(void) size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; - if ((end >> PAGE_SHIFT) <= max_pfn_mapped) + if (PFN_UP(end) <= max_pfn_mapped) va = __va(md->phys_addr); else va = efi_ioremap(md->phys_addr, size); @@ -433,8 +437,12 @@ void __init efi_enter_virtual_mode(void) continue; } - if (!(md->attribute & EFI_MEMORY_WB)) - set_memory_uc(md->virt_addr, md->num_pages); + if (!(md->attribute & EFI_MEMORY_WB)) { + addr = md->virt_addr; + npages = md->num_pages; + memrange_efi_to_native(&addr, &npages); + set_memory_uc(addr, npages); + } systab = (u64) (unsigned long) efi_phys.systab; if (md->phys_addr <= systab && systab < end) { diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index d143a1e76b30..d0060fdcccac 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -105,14 +105,14 @@ void __init efi_reserve_bootmem(void) void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size) { - static unsigned pages_mapped; + static unsigned pages_mapped __initdata; unsigned i, pages; + unsigned long offset; - /* phys_addr and size must be page aligned */ - if ((phys_addr & ~PAGE_MASK) || (size & ~PAGE_MASK)) - return NULL; + pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr); + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; - pages = size >> PAGE_SHIFT; if (pages_mapped + pages > MAX_EFI_IO_PAGES) return NULL; @@ -124,5 +124,5 @@ void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size) } return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ - (pages_mapped - pages)); + (pages_mapped - pages)) + offset; } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9ba49a26dff8..f0f8934fc303 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1,5 +1,4 @@ /* - * linux/arch/i386/entry.S * * Copyright (C) 1991, 1992 Linus Torvalds */ 
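The efi_ioremap() hunk above drops the old requirement that phys_addr and size be page aligned: it now rounds the physical range out to whole pages with PFN_UP()/PFN_DOWN() and returns the fixmap mapping plus the in-page offset. A minimal standalone sketch of that arithmetic, not kernel code, with an assumed PAGE_SHIFT and made-up addresses:

	/*
	 * Sketch of the page-rounding done by the new efi_ioremap():
	 * an unaligned physical range is mapped as whole pages and the
	 * caller gets the base mapping plus the in-page offset.
	 * PAGE_SHIFT and the demo addresses below are assumptions.
	 */
	#include <stdio.h>

	#define PAGE_SHIFT 12
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)
	#define PAGE_MASK  (~(PAGE_SIZE - 1))
	#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
	#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

	int main(void)
	{
		unsigned long phys_addr = 0x7fe12340; /* hypothetical EFI runtime region */
		unsigned long size      = 0x2000;

		unsigned long pages  = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr);
		unsigned long offset = phys_addr & ~PAGE_MASK;

		phys_addr &= PAGE_MASK;

		/* 0x7fe12340 + 0x2000 spans three pages; offset into the first is 0x340 */
		printf("map %lu page(s) starting at %#lx, return mapping + %#lx\n",
		       pages, phys_addr, offset);
		return 0;
	}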
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 5d77c9cd8e15..ebf13908a743 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -61,26 +61,31 @@ int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip) val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | - (6 << UVH_IPI_INT_DELIVERY_MODE_SHFT); + APIC_DM_INIT; + uv_write_global_mmr64(nasid, UVH_IPI_INT, val); + mdelay(10); + + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | + (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | + APIC_DM_STARTUP; uv_write_global_mmr64(nasid, UVH_IPI_INT, val); return 0; } static void uv_send_IPI_one(int cpu, int vector) { - unsigned long val, apicid; + unsigned long val, apicid, lapicid; int nasid; apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ + lapicid = apicid & 0x3f; /* ZZZ macro needed */ nasid = uv_apicid_to_nasid(apicid); val = - (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << + (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid << UVH_IPI_INT_APIC_ID_SHFT) | (vector << UVH_IPI_INT_VECTOR_SHFT); uv_write_global_mmr64(nasid, UVH_IPI_INT, val); - printk(KERN_DEBUG - "UV: IPI to cpu %d, apicid 0x%lx, vec %d, nasid%d, val 0x%lx\n", - cpu, apicid, vector, nasid, val); } static void uv_send_IPI_mask(cpumask_t mask, int vector) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index d6d54faa84df..993c76773256 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -146,6 +146,7 @@ void __init x86_64_start_kernel(char * real_mode_data) reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS"); +#ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; @@ -153,6 +154,7 @@ void __init x86_64_start_kernel(char * real_mode_data) unsigned long ramdisk_end = ramdisk_image + ramdisk_size; reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); } +#endif reserve_ebda_region(); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 826988a6e964..90f038af3adc 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -1,5 +1,4 @@ /* - * linux/arch/i386/kernel/head.S -- the 32-bit startup code. 
* * Copyright (C) 1991, 1992 Linus Torvalds * diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 8f8102d967b3..db6839b53195 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -35,17 +35,18 @@ #endif static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int xstate_size; +static struct i387_fxsave_struct fx_scratch __cpuinitdata; -void mxcsr_feature_mask_init(void) +void __cpuinit mxcsr_feature_mask_init(void) { unsigned long mask = 0; clts(); if (cpu_has_fxsr) { - memset(¤t->thread.i387.fxsave, 0, - sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); - mask = current->thread.i387.fxsave.mxcsr_mask; + memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : : "m" (fx_scratch)); + mask = fx_scratch.mxcsr_mask; if (mask == 0) mask = 0x0000ffbf; } @@ -53,6 +54,16 @@ void mxcsr_feature_mask_init(void) stts(); } +void __init init_thread_xstate(void) +{ + if (cpu_has_fxsr) + xstate_size = sizeof(struct i387_fxsave_struct); +#ifdef CONFIG_X86_32 + else + xstate_size = sizeof(struct i387_fsave_struct); +#endif +} + #ifdef CONFIG_X86_64 /* * Called at bootup to set up the initial FPU state that is later cloned @@ -61,10 +72,6 @@ void mxcsr_feature_mask_init(void) void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); - extern void __bad_fxsave_alignment(void); - - if (offsetof(struct task_struct, thread.i387.fxsave) & 15) - __bad_fxsave_alignment(); set_in_cr4(X86_CR4_OSFXSR); set_in_cr4(X86_CR4_OSXMMEXCPT); @@ -84,32 +91,44 @@ void __cpuinit fpu_init(void) * value at reset if we support XMM instructions and then * remeber the current task has used the FPU. */ -void init_fpu(struct task_struct *tsk) +int init_fpu(struct task_struct *tsk) { if (tsk_used_math(tsk)) { if (tsk == current) unlazy_fpu(tsk); - return; + return 0; + } + + /* + * Memory allocation at the first usage of the FPU and other state. + */ + if (!tsk->thread.xstate) { + tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep, + GFP_KERNEL); + if (!tsk->thread.xstate) + return -ENOMEM; } if (cpu_has_fxsr) { - memset(&tsk->thread.i387.fxsave, 0, - sizeof(struct i387_fxsave_struct)); - tsk->thread.i387.fxsave.cwd = 0x37f; + struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; + + memset(fx, 0, xstate_size); + fx->cwd = 0x37f; if (cpu_has_xmm) - tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT; + fx->mxcsr = MXCSR_DEFAULT; } else { - memset(&tsk->thread.i387.fsave, 0, - sizeof(struct i387_fsave_struct)); - tsk->thread.i387.fsave.cwd = 0xffff037fu; - tsk->thread.i387.fsave.swd = 0xffff0000u; - tsk->thread.i387.fsave.twd = 0xffffffffu; - tsk->thread.i387.fsave.fos = 0xffff0000u; + struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; + memset(fp, 0, xstate_size); + fp->cwd = 0xffff037fu; + fp->swd = 0xffff0000u; + fp->twd = 0xffffffffu; + fp->fos = 0xffff0000u; } /* * Only the device not available exception or ptrace can call init_fpu. 
*/ set_stopped_child_used_math(tsk); + return 0; } int fpregs_active(struct task_struct *target, const struct user_regset *regset) @@ -126,13 +145,17 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + int ret; + if (!cpu_has_fxsr) return -ENODEV; - init_fpu(target); + ret = init_fpu(target); + if (ret) + return ret; return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.i387.fxsave, 0, -1); + &target->thread.xstate->fxsave, 0, -1); } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -144,16 +167,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) return -ENODEV; - init_fpu(target); + ret = init_fpu(target); + if (ret) + return ret; + set_stopped_child_used_math(target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.i387.fxsave, 0, -1); + &target->thread.xstate->fxsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. */ - target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; return ret; } @@ -233,7 +259,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) static void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave; + struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -273,7 +299,7 @@ static void convert_to_fxsr(struct task_struct *tsk, const struct user_i387_ia32_struct *env) { - struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave; + struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -302,15 +328,19 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, void *kbuf, void __user *ubuf) { struct user_i387_ia32_struct env; + int ret; if (!HAVE_HWFP) return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - init_fpu(target); + ret = init_fpu(target); + if (ret) + return ret; if (!cpu_has_fxsr) { return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.i387.fsave, 0, -1); + &target->thread.xstate->fsave, 0, + -1); } if (kbuf && pos == 0 && count == sizeof(env)) { @@ -333,12 +363,15 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - init_fpu(target); + ret = init_fpu(target); + if (ret) + return ret; + set_stopped_child_used_math(target); if (!cpu_has_fxsr) { return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.i387.fsave, 0, -1); + &target->thread.xstate->fsave, 0, -1); } if (pos > 0 || count < sizeof(env)) @@ -358,11 +391,11 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) { struct task_struct *tsk = current; + struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; unlazy_fpu(tsk); - tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; - if (__copy_to_user(buf, &tsk->thread.i387.fsave, - sizeof(struct i387_fsave_struct))) + fp->status = fp->swd; + if (__copy_to_user(buf, fp, sizeof(struct 
i387_fsave_struct))) return -1; return 1; } @@ -370,6 +403,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) { struct task_struct *tsk = current; + struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; struct user_i387_ia32_struct env; int err = 0; @@ -379,12 +413,12 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) if (__copy_to_user(buf, &env, sizeof(env))) return -1; - err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); + err |= __put_user(fx->swd, &buf->status); err |= __put_user(X86_FXSR_MAGIC, &buf->magic); if (err) return -1; - if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, + if (__copy_to_user(&buf->_fxsr_env[0], fx, sizeof(struct i387_fxsave_struct))) return -1; return 1; @@ -417,7 +451,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) struct task_struct *tsk = current; clear_fpu(tsk); - return __copy_from_user(&tsk->thread.i387.fsave, buf, + return __copy_from_user(&tsk->thread.xstate->fsave, buf, sizeof(struct i387_fsave_struct)); } @@ -428,10 +462,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) int err; clear_fpu(tsk); - err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0], + err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], sizeof(struct i387_fxsave_struct)); /* mxcsr reserved bits must be masked to zero for security reasons */ - tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; if (err || __copy_from_user(&env, buf, sizeof(env))) return 1; convert_to_fxsr(tsk, &env); diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 24362ecf5f9a..f47f0eb886b8 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -46,11 +46,7 @@ #include <asm/apicdef.h> #include <asm/system.h> -#ifdef CONFIG_X86_32 -# include <mach_ipi.h> -#else -# include <asm/mach_apic.h> -#endif +#include <mach_ipi.h> /* * Put the error code here just in case the user cares: diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 8421d0ac6f22..11b14bbaa61e 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -321,7 +321,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +notrace __kprobes int +nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { /* diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 11f9130ac513..5a29ded994fa 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -313,7 +313,8 @@ void touch_nmi_watchdog(void) } EXPORT_SYMBOL(touch_nmi_watchdog); -int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +notrace __kprobes int +nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { int sum; int touched = 0; @@ -384,7 +385,8 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) static unsigned ignore_nmis; -asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) +asmlinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); add_pda(__nmi_count,1); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 1b5464c2434f..adb91e4b62da 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -470,10 +470,11 @@ error: return 0; } -static dma_addr_t 
calgary_map_single(struct device *dev, void *vaddr, +static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, size_t size, int direction) { dma_addr_t dma_handle = bad_dma_address; + void *vaddr = phys_to_virt(paddr); unsigned long uaddr; unsigned int npages; struct iommu_table *tbl = find_iommu_table(dev); diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma.c index ada5a0604992..388b113a7d88 100644 --- a/arch/x86/kernel/pci-dma_64.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,61 +1,370 @@ -/* - * Dynamic DMA mapping support. - */ - -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/module.h> +#include <linux/dma-mapping.h> #include <linux/dmar.h> -#include <asm/io.h> +#include <linux/bootmem.h> +#include <linux/pci.h> + +#include <asm/proto.h> +#include <asm/dma.h> #include <asm/gart.h> #include <asm/calgary.h> -int iommu_merge __read_mostly = 0; - -dma_addr_t bad_dma_address __read_mostly; -EXPORT_SYMBOL(bad_dma_address); +int forbid_dac __read_mostly; +EXPORT_SYMBOL(forbid_dac); -/* This tells the BIO block layer to assume merging. Default to off - because we cannot guarantee merging later. */ -int iommu_bio_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_bio_merge); +const struct dma_mapping_ops *dma_ops; +EXPORT_SYMBOL(dma_ops); -static int iommu_sac_force __read_mostly = 0; +int iommu_sac_force __read_mostly = 0; -int no_iommu __read_mostly; #ifdef CONFIG_IOMMU_DEBUG int panic_on_overflow __read_mostly = 1; int force_iommu __read_mostly = 1; #else int panic_on_overflow __read_mostly = 0; -int force_iommu __read_mostly= 0; +int force_iommu __read_mostly = 0; #endif +int iommu_merge __read_mostly = 0; + +int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; +/* This tells the BIO block layer to assume merging. Default to off + because we cannot guarantee merging later. */ +int iommu_bio_merge __read_mostly = 0; +EXPORT_SYMBOL(iommu_bio_merge); + +dma_addr_t bad_dma_address __read_mostly = 0; +EXPORT_SYMBOL(bad_dma_address); + /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible - to i386. */ + to older i386. 
*/ struct device fallback_dev = { .bus_id = "fallback device", .coherent_dma_mask = DMA_32BIT_MASK, .dma_mask = &fallback_dev.coherent_dma_mask, }; +int dma_set_mask(struct device *dev, u64 mask) +{ + if (!dev->dma_mask || !dma_supported(dev, mask)) + return -EIO; + + *dev->dma_mask = mask; + + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + +#ifdef CONFIG_X86_64 +static __initdata void *dma32_bootmem_ptr; +static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); + +static int __init parse_dma32_size_opt(char *p) +{ + if (!p) + return -EINVAL; + dma32_bootmem_size = memparse(p, &p); + return 0; +} +early_param("dma32_size", parse_dma32_size_opt); + +void __init dma32_reserve_bootmem(void) +{ + unsigned long size, align; + if (end_pfn <= MAX_DMA32_PFN) + return; + + align = 64ULL<<20; + size = round_up(dma32_bootmem_size, align); + dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, + __pa(MAX_DMA_ADDRESS)); + if (dma32_bootmem_ptr) + dma32_bootmem_size = size; + else + dma32_bootmem_size = 0; +} +static void __init dma32_free_bootmem(void) +{ + int node; + + if (end_pfn <= MAX_DMA32_PFN) + return; + + if (!dma32_bootmem_ptr) + return; + + for_each_online_node(node) + free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr), + dma32_bootmem_size); + + dma32_bootmem_ptr = NULL; + dma32_bootmem_size = 0; +} + +void __init pci_iommu_alloc(void) +{ + /* free the range so iommu could get some range less than 4G */ + dma32_free_bootmem(); + /* + * The order of these functions is important for + * fall-back/fail-over reasons + */ +#ifdef CONFIG_GART_IOMMU + gart_iommu_hole_init(); +#endif + +#ifdef CONFIG_CALGARY_IOMMU + detect_calgary(); +#endif + + detect_intel_iommu(); + +#ifdef CONFIG_SWIOTLB + pci_swiotlb_init(); +#endif +} +#endif + +/* + * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter + * documentation. 
+ */ +static __init int iommu_setup(char *p) +{ + iommu_merge = 1; + + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p, "off", 3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p, "force", 5)) + force_iommu = 1; + if (!strncmp(p, "noforce", 7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge", 8)) { + iommu_bio_merge = 4096; + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic", 5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic", 7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge", 5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge", 7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac", 8)) + iommu_sac_force = 1; + if (!strncmp(p, "allowdac", 8)) + forbid_dac = 0; + if (!strncmp(p, "nodac", 5)) + forbid_dac = -1; + if (!strncmp(p, "usedac", 6)) { + forbid_dac = -1; + return 1; + } +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft", 4)) + swiotlb = 1; +#endif + +#ifdef CONFIG_GART_IOMMU + gart_parse_options(p); +#endif + +#ifdef CONFIG_CALGARY_IOMMU + if (!strncmp(p, "calgary", 7)) + use_calgary = 1; +#endif /* CONFIG_CALGARY_IOMMU */ + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 0; +} +early_param("iommu", iommu_setup); + +#ifdef CONFIG_X86_32 +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void __iomem *mem_base = NULL; + int pages = size >> PAGE_SHIFT; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) + goto out; + if (!size) + goto out; + if (dev->dma_mem) + goto out; + + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ + + mem_base = ioremap(bus_addr, size); + if (!mem_base) + goto out; + + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dev->dma_mem) + goto out; + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dev->dma_mem->bitmap) + goto free1_out; + + dev->dma_mem->virt_base = mem_base; + dev->dma_mem->device_base = device_addr; + dev->dma_mem->size = pages; + dev->dma_mem->flags = flags; + + if (flags & DMA_MEMORY_MAP) + return DMA_MEMORY_MAP; + + return DMA_MEMORY_IO; + + free1_out: + kfree(dev->dma_mem); + out: + if (mem_base) + iounmap(mem_base); + return 0; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if (!mem) + return; + dev->dma_mem = NULL; + iounmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + int pos, err; + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); + + pages >>= PAGE_SHIFT; + + if (!mem) + return ERR_PTR(-EINVAL); + + pos = (device_addr - mem->device_base) >> PAGE_SHIFT; + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret) +{ + struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; + int order = get_order(size); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + *ret = mem->virt_base + (page << PAGE_SHIFT); + memset(*ret, 0, size); + } + if (mem->flags & DMA_MEMORY_EXCLUSIVE) + *ret = NULL; + } + return (mem != NULL); +} + +static int dma_release_coherent(struct device *dev, int order, void *vaddr) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + + if (mem && vaddr >= mem->virt_base && vaddr < + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + + bitmap_release_region(mem->bitmap, page, order); + return 1; + } + return 0; +} +#else +#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0) +#define dma_release_coherent(dev, order, vaddr) (0) +#endif /* CONFIG_X86_32 */ + +int dma_supported(struct device *dev, u64 mask) +{ +#ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { + printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", + dev->bus_id); + return 0; + } +#endif + + if (dma_ops->dma_supported) + return dma_ops->dma_supported(dev, mask); + + /* Copied from i386. Doesn't make much sense, because it will + only work for pci_alloc_coherent. + The caller just has to use GFP_DMA in this case. */ + if (mask < DMA_24BIT_MASK) + return 0; + + /* Tell the device to use SAC when IOMMU force is on. This + allows the driver to use cheaper accesses in some cases. + + Problem with this is that if we overflow the IOMMU area and + return DAC as fallback address the device may not handle it + correctly. + + As a special case some controllers have a 39bit address + mode that is as efficient as 32bit (aic79xx). Don't force + SAC for these. Assume all masks <= 40 bits are of this + type. Normally this doesn't make any difference, but gives + more gentle handling of IOMMU overflow. */ + if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { + printk(KERN_INFO "%s: Force SAC with mask %Lx\n", + dev->bus_id, mask); + return 0; + } + + return 1; +} +EXPORT_SYMBOL(dma_supported); + /* Allocate DMA memory on node near device */ -noinline static void * +noinline struct page * dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) { - struct page *page; int node; node = dev_to_node(dev); - page = alloc_pages_node(node, gfp, order); - return page ? page_address(page) : NULL; + return alloc_pages_node(node, gfp, order); } /* @@ -65,9 +374,16 @@ void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { - void *memory; + void *memory = NULL; + struct page *page; unsigned long dma_mask = 0; - u64 bus; + dma_addr_t bus; + + /* ignore region specifiers */ + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + + if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) + return memory; if (!dev) dev = &fallback_dev; @@ -82,26 +398,25 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* Don't invoke OOM killer */ gfp |= __GFP_NORETRY; - /* Kludge to make it bug-to-bug compatible with i386. i386 - uses the normal dma_mask for alloc_coherent. */ - dma_mask &= *dev->dma_mask; - +#ifdef CONFIG_X86_64 /* Why <=? Even when the mask is smaller than 4GB it is often larger than 16MB and in this case we have a chance of finding fitting memory in the next higher zone first. If not retry with true GFP_DMA. 
-AK */ if (dma_mask <= DMA_32BIT_MASK) gfp |= GFP_DMA32; +#endif again: - memory = dma_alloc_pages(dev, gfp, get_order(size)); - if (memory == NULL) + page = dma_alloc_pages(dev, gfp, get_order(size)); + if (page == NULL) return NULL; { int high, mmu; - bus = virt_to_bus(memory); - high = (bus + size) >= dma_mask; + bus = page_to_phys(page); + memory = page_address(page); + high = (bus + size) >= dma_mask; mmu = high; if (force_iommu && !(gfp & GFP_DMA)) mmu = 1; @@ -127,7 +442,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, memset(memory, 0, size); if (!mmu) { - *dma_handle = virt_to_bus(memory); + *dma_handle = bus; return memory; } } @@ -139,7 +454,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, } if (dma_ops->map_simple) { - *dma_handle = dma_ops->map_simple(dev, memory, + *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), size, PCI_DMA_BIDIRECTIONAL); if (*dma_handle != bad_dma_address) @@ -147,7 +462,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, } if (panic_on_overflow) - panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size); + panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", + (unsigned long)size); free_pages((unsigned long)memory, get_order(size)); return NULL; } @@ -160,153 +476,16 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t bus) { + int order = get_order(size); WARN_ON(irqs_disabled()); /* for portability */ + if (dma_release_coherent(dev, order, vaddr)) + return; if (dma_ops->unmap_single) dma_ops->unmap_single(dev, bus, size, 0); - free_pages((unsigned long)vaddr, get_order(size)); + free_pages((unsigned long)vaddr, order); } EXPORT_SYMBOL(dma_free_coherent); -static int forbid_dac __read_mostly; - -int dma_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_PCI - if (mask > 0xffffffff && forbid_dac > 0) { - - - - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id); - return 0; - } -#endif - - if (dma_ops->dma_supported) - return dma_ops->dma_supported(dev, mask); - - /* Copied from i386. Doesn't make much sense, because it will - only work for pci_alloc_coherent. - The caller just has to use GFP_DMA in this case. */ - if (mask < DMA_24BIT_MASK) - return 0; - - /* Tell the device to use SAC when IOMMU force is on. This - allows the driver to use cheaper accesses in some cases. - - Problem with this is that if we overflow the IOMMU area and - return DAC as fallback address the device may not handle it - correctly. - - As a special case some controllers have a 39bit address - mode that is as efficient as 32bit (aic79xx). Don't force - SAC for these. Assume all masks <= 40 bits are of this - type. Normally this doesn't make any difference, but gives - more gentle handling of IOMMU overflow. */ - if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { - printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); - return 0; - } - - return 1; -} -EXPORT_SYMBOL(dma_supported); - -int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - *dev->dma_mask = mask; - return 0; -} -EXPORT_SYMBOL(dma_set_mask); - -/* - * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter - * documentation. 
- */ -static __init int iommu_setup(char *p) -{ - iommu_merge = 1; - - if (!p) - return -EINVAL; - - while (*p) { - if (!strncmp(p, "off", 3)) - no_iommu = 1; - /* gart_parse_options has more force support */ - if (!strncmp(p, "force", 5)) - force_iommu = 1; - if (!strncmp(p, "noforce", 7)) { - iommu_merge = 0; - force_iommu = 0; - } - - if (!strncmp(p, "biomerge", 8)) { - iommu_bio_merge = 4096; - iommu_merge = 1; - force_iommu = 1; - } - if (!strncmp(p, "panic", 5)) - panic_on_overflow = 1; - if (!strncmp(p, "nopanic", 7)) - panic_on_overflow = 0; - if (!strncmp(p, "merge", 5)) { - iommu_merge = 1; - force_iommu = 1; - } - if (!strncmp(p, "nomerge", 7)) - iommu_merge = 0; - if (!strncmp(p, "forcesac", 8)) - iommu_sac_force = 1; - if (!strncmp(p, "allowdac", 8)) - forbid_dac = 0; - if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; - -#ifdef CONFIG_SWIOTLB - if (!strncmp(p, "soft", 4)) - swiotlb = 1; -#endif - -#ifdef CONFIG_GART_IOMMU - gart_parse_options(p); -#endif - -#ifdef CONFIG_CALGARY_IOMMU - if (!strncmp(p, "calgary", 7)) - use_calgary = 1; -#endif /* CONFIG_CALGARY_IOMMU */ - - p += strcspn(p, ","); - if (*p == ',') - ++p; - } - return 0; -} -early_param("iommu", iommu_setup); - -void __init pci_iommu_alloc(void) -{ - /* - * The order of these functions is important for - * fall-back/fail-over reasons - */ -#ifdef CONFIG_GART_IOMMU - gart_iommu_hole_init(); -#endif - -#ifdef CONFIG_CALGARY_IOMMU - detect_calgary(); -#endif - - detect_intel_iommu(); - -#ifdef CONFIG_SWIOTLB - pci_swiotlb_init(); -#endif -} - static int __init pci_iommu_init(void) { #ifdef CONFIG_CALGARY_IOMMU @@ -327,6 +506,8 @@ void pci_iommu_shutdown(void) { gart_iommu_shutdown(); } +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ @@ -334,11 +515,10 @@ void pci_iommu_shutdown(void) static __devinit void via_no_dac(struct pci_dev *dev) { if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n"); + printk(KERN_INFO "PCI: VIA PCI bridge detected." + "Disabling DAC.\n"); forbid_dac = 1; } } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); #endif -/* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); diff --git a/arch/x86/kernel/pci-dma_32.c b/arch/x86/kernel/pci-dma_32.c deleted file mode 100644 index 51330321a5d3..000000000000 --- a/arch/x86/kernel/pci-dma_32.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Dynamic DMA mapping support. - * - * On i386 there is no hardware dynamic DMA address translation, - * so consistent alloc/free are merely page allocation/freeing. - * The rest of the dynamic DMA mapping interface is implemented - * in asm/pci.h. - */ - -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/module.h> -#include <asm/io.h> - -struct dma_coherent_mem { - void *virt_base; - u32 device_base; - int size; - int flags; - unsigned long *bitmap; -}; - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - void *ret; - struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; - int order = get_order(size); - /* ignore region specifiers */ - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); - - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - ret = mem->virt_base + (page << PAGE_SHIFT); - memset(ret, 0, size); - return ret; - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) - return NULL; - } - - if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) - gfp |= GFP_DMA; - - ret = (void *)__get_free_pages(gfp, order); - - if (ret != NULL) { - memset(ret, 0, size); - *dma_handle = virt_to_phys(ret); - } - return ret; -} -EXPORT_SYMBOL(dma_alloc_coherent); - -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - int order = get_order(size); - - WARN_ON(irqs_disabled()); /* for portability */ - if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - } else - free_pages((unsigned long)vaddr, order); -} -EXPORT_SYMBOL(dma_free_coherent); - -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if(!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; - int pos, err; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -#ifdef CONFIG_PCI -/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ - -int forbid_dac; -EXPORT_SYMBOL(forbid_dac); - -static __devinit void via_no_dac(struct pci_dev *dev) -{ - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected. 
Disabling DAC.\n"); - forbid_dac = 1; - } -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); - -static int check_iommu(char *s) -{ - if (!strcmp(s, "usedac")) { - forbid_dac = -1; - return 1; - } - return 0; -} -__setup("iommu=", check_iommu); -#endif diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 700e4647dd30..c07455d1695f 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -264,9 +264,9 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, } static dma_addr_t -gart_map_simple(struct device *dev, char *buf, size_t size, int dir) +gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir) { - dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); + dma_addr_t map = dma_map_area(dev, paddr, size, dir); flush_gart(); @@ -275,18 +275,17 @@ gart_map_simple(struct device *dev, char *buf, size_t size, int dir) /* Map a single area into the IOMMU */ static dma_addr_t -gart_map_single(struct device *dev, void *addr, size_t size, int dir) +gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) { - unsigned long phys_mem, bus; + unsigned long bus; if (!dev) dev = &fallback_dev; - phys_mem = virt_to_phys(addr); - if (!need_iommu(dev, phys_mem, size)) - return phys_mem; + if (!need_iommu(dev, paddr, size)) + return paddr; - bus = gart_map_simple(dev, addr, size, dir); + bus = gart_map_simple(dev, paddr, size, dir); return bus; } diff --git a/arch/x86/kernel/pci-nommu_64.c b/arch/x86/kernel/pci-nommu.c index ab08e1832228..aec43d56f49c 100644 --- a/arch/x86/kernel/pci-nommu_64.c +++ b/arch/x86/kernel/pci-nommu.c @@ -14,7 +14,7 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { - if (hwdev && bus + size > *hwdev->dma_mask) { + if (hwdev && bus + size > *hwdev->dma_mask) { if (*hwdev->dma_mask >= DMA_32BIT_MASK) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", @@ -26,19 +26,17 @@ check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) } static dma_addr_t -nommu_map_single(struct device *hwdev, void *ptr, size_t size, +nommu_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int direction) { - dma_addr_t bus = virt_to_bus(ptr); + dma_addr_t bus = paddr; + WARN_ON(size == 0); if (!check_addr("map_single", hwdev, bus, size)) return bad_dma_address; + flush_write_buffers(); return bus; } -static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, - int direction) -{ -} /* Map a set of buffers described by scatterlist in streaming * mode for DMA. This is the scatter-gather version of the @@ -61,30 +59,34 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, struct scatterlist *s; int i; + WARN_ON(nents == 0 || sg[0].length == 0); + for_each_sg(sg, s, nents, i) { BUG_ON(!sg_page(s)); - s->dma_address = virt_to_bus(sg_virt(s)); + s->dma_address = sg_phys(s); if (!check_addr("map_sg", hwdev, s->dma_address, s->length)) return 0; s->dma_length = s->length; } + flush_write_buffers(); return nents; } -/* Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. 
- */ -static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, int dir) +/* Make sure we keep the same behaviour */ +static int nommu_mapping_error(dma_addr_t dma_addr) { +#ifdef CONFIG_X86_32 + return 0; +#else + return (dma_addr == bad_dma_address); +#endif } + const struct dma_mapping_ops nommu_dma_ops = { .map_single = nommu_map_single, - .unmap_single = nommu_unmap_single, .map_sg = nommu_map_sg, - .unmap_sg = nommu_unmap_sg, + .mapping_error = nommu_mapping_error, .is_phys = 1, }; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 82a0a674a003..490da7f4b8d0 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -11,11 +11,18 @@ int swiotlb __read_mostly; +static dma_addr_t +swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, + int direction) +{ + return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); +} + const struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, - .map_single = swiotlb_map_single, + .map_single = swiotlb_map_single_phys, .unmap_single = swiotlb_unmap_single, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c new file mode 100644 index 000000000000..3004d716539d --- /dev/null +++ b/arch/x86/kernel/process.c @@ -0,0 +1,44 @@ +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/slab.h> +#include <linux/sched.h> + +struct kmem_cache *task_xstate_cachep; + +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + *dst = *src; + if (src->thread.xstate) { + dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, + GFP_KERNEL); + if (!dst->thread.xstate) + return -ENOMEM; + WARN_ON((unsigned long)dst->thread.xstate & 15); + memcpy(dst->thread.xstate, src->thread.xstate, xstate_size); + } + return 0; +} + +void free_thread_xstate(struct task_struct *tsk) +{ + if (tsk->thread.xstate) { + kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); + tsk->thread.xstate = NULL; + } +} + +void free_thread_info(struct thread_info *ti) +{ + free_thread_xstate(ti->task); + free_pages((unsigned long)ti, get_order(THREAD_SIZE)); +} + +void arch_task_cache_init(void) +{ + task_xstate_cachep = + kmem_cache_create("task_xstate", xstate_size, + __alignof__(union thread_xstate), + SLAB_PANIC, NULL); +} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 3903a8f2eb97..7adad088e373 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -36,6 +36,7 @@ #include <linux/personality.h> #include <linux/tick.h> #include <linux/percpu.h> +#include <linux/prctl.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -45,7 +46,6 @@ #include <asm/processor.h> #include <asm/i387.h> #include <asm/desc.h> -#include <asm/vm86.h> #ifdef CONFIG_MATH_EMULATION #include <asm/math_emu.h> #endif @@ -521,14 +521,18 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->cs = __USER_CS; regs->ip = new_ip; regs->sp = new_sp; + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); } EXPORT_SYMBOL_GPL(start_thread); -#ifdef CONFIG_SECCOMP static void hard_disable_TSC(void) { write_cr4(read_cr4() | X86_CR4_TSD); } + void disable_TSC(void) { 
preempt_disable(); @@ -540,11 +544,47 @@ void disable_TSC(void) hard_disable_TSC(); preempt_enable(); } + static void hard_enable_TSC(void) { write_cr4(read_cr4() & ~X86_CR4_TSD); } -#endif /* CONFIG_SECCOMP */ + +void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_enable_TSC(); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, @@ -578,7 +618,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, set_debugreg(next->debugreg7, 7); } -#ifdef CONFIG_SECCOMP if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ @@ -587,7 +626,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, else hard_enable_TSC(); } -#endif #ifdef X86_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) @@ -669,7 +707,7 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct /* we're going to use this soon, after a few expensive things */ if (next_p->fpu_counter > 5) - prefetch(&next->i387.fxsave); + prefetch(next->xstate); /* * Reload esp0. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e75ccc8a2b87..891af1a1b48a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -36,6 +36,7 @@ #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/tick.h> +#include <linux/prctl.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -532,9 +533,71 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->ss = __USER_DS; regs->flags = 0x200; set_fs(USER_DS); + /* + * Free the old FP and other extended state + */ + free_thread_xstate(current); } EXPORT_SYMBOL_GPL(start_thread); +static void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} + +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} + +static void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} + +void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. 
+ */ + hard_enable_TSC(); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} + /* * This special macro can be used to load a debugging register */ @@ -572,6 +635,15 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, loaddebug(next, 7); } + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } + if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Copy the relevant range of the IO bitmap. @@ -614,7 +686,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* we're going to use this soon, after a few expensive things */ if (next_p->fpu_counter>5) - prefetch(&next->i387.fxsave); + prefetch(next->xstate); /* * Reload esp0, LDT and the page table pointer: diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index 9042fb0e36f5..aee0e8200777 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -74,8 +74,8 @@ int force_personality32 = 0; Control non executable heap for 32bit processes. To control the stack too use noexec=off -on PROT_READ does not imply PROT_EXEC for 32bit processes -off PROT_READ implies PROT_EXEC (default) +on PROT_READ does not imply PROT_EXEC for 32bit processes (default) +off PROT_READ implies PROT_EXEC */ static int __init nonx32_setup(char *str) { diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 5b0bffb7fcc9..1c4799e68718 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -812,10 +812,10 @@ void __init setup_arch(char **cmdline_p) efi_init(); /* update e820 for memory not covered by WB MTRRs */ - find_max_pfn(); + propagate_e820_map(); mtrr_bp_init(); if (mtrr_trim_uncached_memory(max_pfn)) - find_max_pfn(); + propagate_e820_map(); max_low_pfn = setup_memory(); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 674ef3510cdf..6b8e11f0c15d 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -398,6 +398,8 @@ void __init setup_arch(char **cmdline_p) early_res_to_bootmem(); + dma32_reserve_bootmem(); + #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. 
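The get_tsc_mode()/set_tsc_mode() helpers added to process_32.c and process_64.c above take a PR_TSC_ENABLE or PR_TSC_SIGSEGV value; assuming they are wired up to PR_GET_TSC/PR_SET_TSC prctl commands (the PR_TSC_* constants come from <linux/prctl.h>), a userspace process could opt out of direct RDTSC use roughly as below. This is an illustrative sketch, not part of the patch.

	/*
	 * Hedged userspace sketch: query the task's TSC mode, then ask for
	 * RDTSC to raise SIGSEGV (the kernel side sets TIF_NOTSC and flips
	 * CR4.TSD on context switch, per the hunks above).
	 */
	#include <stdio.h>
	#include <sys/prctl.h>		/* pulls in <linux/prctl.h> on Linux */

	int main(void)
	{
		unsigned int mode = 0;

		if (prctl(PR_GET_TSC, &mode, 0, 0, 0) == 0)
			printf("TSC mode: %s\n", mode == PR_TSC_ENABLE ?
			       "enabled" : "traps with SIGSEGV");

		/* After this, RDTSC in this task delivers SIGSEGV */
		if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0) != 0)
			perror("PR_SET_TSC");

		return 0;
	}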
@@ -420,11 +422,14 @@ void __init setup_arch(char **cmdline_p) unsigned long end_of_mem = end_pfn << PAGE_SHIFT; if (ramdisk_end <= end_of_mem) { - reserve_bootmem_generic(ramdisk_image, ramdisk_size); + /* + * don't need to reserve again, already reserved early + * in x86_64_start_kernel, and early_res_to_bootmem + * convert that to reserved in bootmem + */ initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; } else { - /* Assumes everything on node 0 */ free_bootmem(ramdisk_image, ramdisk_size); printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e6abe8a49b1f..6a925394bc7e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -61,6 +61,7 @@ #include <asm/mtrr.h> #include <asm/nmi.h> #include <asm/vmi.h> +#include <asm/genapic.h> #include <linux/mc146818rtc.h> #include <mach_apic.h> @@ -677,6 +678,12 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) unsigned long send_status, accept_status = 0; int maxlvt, num_starts, j; + if (get_uv_system_type() == UV_NON_UNIQUE_APIC) { + send_status = uv_wakeup_secondary(phys_apicid, start_eip); + atomic_set(&init_deasserted, 1); + return send_status; + } + /* * Be paranoid about clearing APIC errors. */ @@ -918,16 +925,19 @@ do_rest: atomic_set(&init_deasserted, 0); - Dprintk("Setting warm reset code and vector.\n"); + if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { - store_NMI_vector(&nmi_high, &nmi_low); + Dprintk("Setting warm reset code and vector.\n"); - smpboot_setup_warm_reset_vector(start_ip); - /* - * Be paranoid about clearing APIC errors. - */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); + store_NMI_vector(&nmi_high, &nmi_low); + + smpboot_setup_warm_reset_vector(start_ip); + /* + * Be paranoid about clearing APIC errors. + */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } /* * Starting actual IPI sequence... 
@@ -966,7 +976,8 @@ do_rest: else /* trampoline code not run */ printk(KERN_ERR "Not responding.\n"); - inquire_remote_apic(apicid); + if (get_uv_system_type() != UV_NON_UNIQUE_APIC) + inquire_remote_apic(apicid); } } diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 65791ca2824a..471e694d6713 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -681,7 +681,7 @@ gp_in_kernel: } } -static __kprobes void +static notrace __kprobes void mem_parity_error(unsigned char reason, struct pt_regs *regs) { printk(KERN_EMERG @@ -707,7 +707,7 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) clear_mem_error(reason); } -static __kprobes void +static notrace __kprobes void io_check_error(unsigned char reason, struct pt_regs *regs) { unsigned long i; @@ -727,7 +727,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) outb(reason, 0x61); } -static __kprobes void +static notrace __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -755,7 +755,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) static DEFINE_SPINLOCK(nmi_print_lock); -void __kprobes die_nmi(struct pt_regs *regs, const char *msg) +void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg) { if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP) return; @@ -786,7 +786,7 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg) do_exit(SIGSEGV); } -static __kprobes void default_do_nmi(struct pt_regs *regs) +static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; @@ -828,7 +828,7 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) static int ignore_nmis; -__kprobes void do_nmi(struct pt_regs *regs, long error_code) +notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { int cpu; @@ -1148,9 +1148,22 @@ asmlinkage void math_state_restore(void) struct thread_info *thread = current_thread_info(); struct task_struct *tsk = thread->task; + if (!tsk_used_math(tsk)) { + local_irq_enable(); + /* + * does a slab alloc which can sleep + */ + if (init_fpu(tsk)) { + /* + * ran out of memory! + */ + do_group_exit(SIGKILL); + return; + } + local_irq_disable(); + } + clts(); /* Allow maths ops (or we recurse) */ - if (!tsk_used_math(tsk)) - init_fpu(tsk); restore_fpu(tsk); thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; @@ -1208,11 +1221,6 @@ void __init trap_init(void) #endif set_trap_gate(19, &simd_coprocessor_error); - /* - * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. - * Generate a build-time error if the alignment is wrong. - */ - BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15); if (cpu_has_fxsr) { printk(KERN_INFO "Enabling fast FPU save and restore... 
"); set_in_cr4(X86_CR4_OSFXSR); @@ -1233,6 +1241,7 @@ void __init trap_init(void) set_bit(SYSCALL_VECTOR, used_vectors); + init_thread_xstate(); /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 79aa6fc0815c..adff76ea97c4 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -600,7 +600,8 @@ void die(const char * str, struct pt_regs * regs, long err) oops_end(flags, regs, SIGSEGV); } -void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) +notrace __kprobes void +die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags; @@ -772,7 +773,7 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, die("general protection fault", regs, error_code); } -static __kprobes void +static notrace __kprobes void mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", @@ -796,7 +797,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) outb(reason, 0x61); } -static __kprobes void +static notrace __kprobes void io_check_error(unsigned char reason, struct pt_regs * regs) { printk("NMI: IOCK error (debug interrupt?)\n"); @@ -810,7 +811,7 @@ io_check_error(unsigned char reason, struct pt_regs * regs) outb(reason, 0x61); } -static __kprobes void +static notrace __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -827,7 +828,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs) /* Runs on IST stack. This code must keep interrupts off all the time. Nested NMIs are prevented by the CPU. */ -asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) +asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int cpu; @@ -1123,11 +1124,24 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) asmlinkage void math_state_restore(void) { struct task_struct *me = current; - clts(); /* Allow maths ops (or we recurse) */ - if (!used_math()) - init_fpu(me); - restore_fpu_checking(&me->thread.i387.fxsave); + if (!used_math()) { + local_irq_enable(); + /* + * does a slab alloc which can sleep + */ + if (init_fpu(me)) { + /* + * ran out of memory! + */ + do_group_exit(SIGKILL); + return; + } + local_irq_disable(); + } + + clts(); /* Allow maths ops (or we recurse) */ + restore_fpu_checking(&me->thread.xstate->fxsave); task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; } @@ -1163,6 +1177,10 @@ void __init trap_init(void) #endif /* + * initialize the per thread extended state: + */ + init_thread_xstate(); + /* * Should be a barrier for any external CPU state. */ cpu_init(); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 3d7e6e9fa6c2..e4790728b224 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -221,9 +221,9 @@ EXPORT_SYMBOL(recalibrate_cpu_khz); * if the CPU frequency is scaled, TSC-based delays will need a different * loops_per_jiffy value to function properly. 
*/ -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; -static unsigned long cpu_khz_ref = 0; +static unsigned int ref_freq; +static unsigned long loops_per_jiffy_ref; +static unsigned long cpu_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -283,15 +283,28 @@ core_initcall(cpufreq_tsc); /* clock source code */ -static unsigned long current_tsc_khz = 0; +static unsigned long current_tsc_khz; +static struct clocksource clocksource_tsc; +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp issue. This can be observed in + * a very small window right after one CPU updated cycle_last under + * xtime lock and the other CPU reads a TSC value which is smaller + * than the cycle_last reference value due to a TSC which is slighty + * behind. This delta is nowhere else observable, but in that case it + * results in a forward time jump in the range of hours due to the + * unsigned delta calculation of the time keeping core code, which is + * necessary to support wrapping clocksources like pm timer. + */ static cycle_t read_tsc(void) { cycle_t ret; rdtscll(ret); - return ret; + return ret >= clocksource_tsc.cycle_last ? + ret : clocksource_tsc.cycle_last; } static struct clocksource clocksource_tsc = { diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index ceeba01e7f47..fcc16e58609e 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -11,6 +11,7 @@ #include <asm/hpet.h> #include <asm/timex.h> #include <asm/timer.h> +#include <asm/vgtod.h> static int notsc __initdata = 0; @@ -287,18 +288,34 @@ int __init notsc_setup(char *s) __setup("notsc", notsc_setup); +static struct clocksource clocksource_tsc; -/* clock source code: */ +/* + * We compare the TSC to the cycle_last value in the clocksource + * structure to avoid a nasty time-warp. This can be observed in a + * very small window right after one CPU updated cycle_last under + * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which + * is smaller than the cycle_last reference value due to a TSC which + * is slighty behind. This delta is nowhere else observable, but in + * that case it results in a forward time jump in the range of hours + * due to the unsigned delta calculation of the time keeping core + * code, which is necessary to support wrapping clocksources like pm + * timer. + */ static cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles(); - return ret; + + return ret >= clocksource_tsc.cycle_last ? + ret : clocksource_tsc.cycle_last; } static cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)vget_cycles(); - return ret; + + return ret >= __vsyscall_gtod_data.clock.cycle_last ? + ret : __vsyscall_gtod_data.clock.cycle_last; } static struct clocksource clocksource_tsc = { |
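The clamp added to read_tsc()/vread_tsc() above guards against the time-warp described in the new comment: a TSC read that is only a few cycles behind cycle_last becomes an enormous unsigned delta in the timekeeping core. A small standalone illustration with made-up cycle counts, showing the wrap and how clamping to cycle_last hides the small negative skew:

	/*
	 * Sketch (not kernel code) of the unsigned-delta wrap the clocksource
	 * comment describes, and the clamp that the patch applies.
	 */
	#include <stdio.h>
	#include <stdint.h>

	typedef uint64_t cycle_t;

	static cycle_t clamped_read(cycle_t tsc, cycle_t cycle_last)
	{
		/* Same idea as the new read_tsc(): never report a value behind cycle_last */
		return tsc >= cycle_last ? tsc : cycle_last;
	}

	int main(void)
	{
		cycle_t cycle_last = 1000000;	/* hypothetical last xtime update */
		cycle_t tsc        = 999990;	/* this CPU's TSC is 10 cycles behind */

		cycle_t raw_delta     = tsc - cycle_last;	/* wraps to a huge value */
		cycle_t clamped_delta = clamped_read(tsc, cycle_last) - cycle_last;

		printf("raw delta:     %llu cycles (a huge spurious forward jump)\n",
		       (unsigned long long)raw_delta);
		printf("clamped delta: %llu cycles\n",
		       (unsigned long long)clamped_delta);
		return 0;
	}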