Diffstat (limited to 'arch/x86_64')
38 files changed, 716 insertions, 266 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 45efe0ca88f8..6527a368fb9c 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -136,6 +136,11 @@ config X86_L1_CACHE_SHIFT default "7" if GENERIC_CPU || MPSC default "6" if MK8 +config X86_INTERNODE_CACHE_BYTES + int + default "4096" if X86_VSMP + default X86_L1_CACHE_BYTES if !X86_VSMP + config X86_TSC bool default y @@ -250,6 +255,15 @@ config SCHED_SMT cost of slightly increased overhead in some places. If unsure say N here. +config SCHED_MC + bool "Multi-core scheduler support" + depends on SMP + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. If unsure say N here. + source "kernel/Kconfig.preempt" config NUMA @@ -274,12 +288,18 @@ config K8_NUMA Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA instead, which also takes priority if both are compiled in. +config NODES_SHIFT + int + default "6" + depends on NEED_MULTIPLE_NODES + # Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig. config X86_64_ACPI_NUMA bool "ACPI NUMA detection" depends on NUMA select ACPI + select PCI select ACPI_NUMA default y help @@ -325,6 +345,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID def_bool y depends on NUMA +config OUT_OF_LINE_PFN_TO_PAGE + def_bool y + depends on DISCONTIGMEM + config NR_CPUS int "Maximum number of CPUs (2-256)" range 2 255 diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 585fd4a559c8..e573e2ab5510 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -24,37 +24,37 @@ LDFLAGS := -m elf_x86_64 OBJCOPYFLAGS := -O binary -R .note -R .comment -S LDFLAGS_vmlinux := - CHECKFLAGS += -D__x86_64__ -m64 +cflags-y := cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) -CFLAGS += $(cflags-y) -CFLAGS += -m64 -CFLAGS += -mno-red-zone -CFLAGS += -mcmodel=kernel -CFLAGS += -pipe +cflags-y += -m64 +cflags-y += -mno-red-zone +cflags-y += -mcmodel=kernel +cflags-y += -pipe cflags-$(CONFIG_REORDER) += -ffunction-sections # this makes reading assembly source easier, but produces worse code # actually it makes the kernel smaller too. -CFLAGS += -fno-reorder-blocks -CFLAGS += -Wno-sign-compare +cflags-y += -fno-reorder-blocks +cflags-y += -Wno-sign-compare ifneq ($(CONFIG_UNWIND_INFO),y) -CFLAGS += -fno-asynchronous-unwind-tables +cflags-y += -fno-asynchronous-unwind-tables endif ifneq ($(CONFIG_DEBUG_INFO),y) # -fweb shrinks the kernel a bit, but the difference is very small # it also messes up debugging, so don't use it for now. -#CFLAGS += $(call cc-option,-fweb) +#cflags-y += $(call cc-option,-fweb) endif # -funit-at-a-time shrinks the kernel .text considerably # unfortunately it makes reading oopses harder. 
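The new X86_INTERNODE_CACHE_BYTES option above (4096 bytes on X86_VSMP, the L1 line size otherwise) exists so data can be padded to the coherence unit between nodes rather than the per-CPU cache line. A minimal out-of-tree sketch of the idea, with the value hard-coded in place of the Kconfig symbol:

/* Hard-coded stand-in for CONFIG_X86_INTERNODE_CACHE_BYTES on vSMP. */
#define INTERNODE_CACHE_BYTES 4096

/* Padding a frequently-read object to the internode coherence unit
 * keeps writes to neighbouring data on one node from invalidating
 * this object's line on every other node (false sharing). */
struct read_mostly_flags {
	unsigned long flags;
} __attribute__((__aligned__(INTERNODE_CACHE_BYTES)));

The vmlinux.lds.S hunk later in this patch applies the same constant to align the .data.read_mostly section.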
-CFLAGS += $(call cc-option,-funit-at-a-time) +cflags-y += $(call cc-option,-funit-at-a-time) # prevent gcc from generating any FP code by mistake -CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +CFLAGS += $(cflags-y) AFLAGS += -m64 head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S index 0587477c99f2..32327bb37aff 100644 --- a/arch/x86_64/boot/video.S +++ b/arch/x86_64/boot/video.S @@ -97,6 +97,7 @@ #define PARAM_VESAPM_OFF 0x30 #define PARAM_LFB_PAGES 0x32 #define PARAM_VESA_ATTRIB 0x34 +#define PARAM_CAPABILITIES 0x36 /* Define DO_STORE according to CONFIG_VIDEO_RETAIN */ #ifdef CONFIG_VIDEO_RETAIN @@ -233,6 +234,10 @@ mopar_gr: movw 18(%di), %ax movl %eax, %fs:(PARAM_LFB_SIZE) +# store mode capabilities + movl 10(%di), %eax + movl %eax, %fs:(PARAM_CAPABILITIES) + # switching the DAC to 8-bit is for <= 8 bpp only movw %fs:(PARAM_LFB_DEPTH), %ax cmpw $8, %ax diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 566ecc97ee5a..69db0c0721d1 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.16-git9 -# Sat Mar 25 15:18:40 2006 +# Linux kernel version: 2.6.17-rc1-git11 +# Sun Apr 16 07:22:36 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -9,6 +9,7 @@ CONFIG_X86=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y CONFIG_RWSEM_GENERIC_SPINLOCK=y +CONFIG_GENERIC_HWEIGHT=y CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_X86_CMPXCHG=y CONFIG_EARLY_PRINTK=y @@ -55,11 +56,8 @@ CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y -CONFIG_CC_ALIGN_FUNCTIONS=0 -CONFIG_CC_ALIGN_LABELS=0 -CONFIG_CC_ALIGN_LOOPS=0 -CONFIG_CC_ALIGN_JUMPS=0 CONFIG_SLAB=y +CONFIG_DOUBLEFAULT=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 # CONFIG_SLOB is not set @@ -70,7 +68,6 @@ CONFIG_BASE_SMALL=0 CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_OBSOLETE_MODPARM=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set # CONFIG_KMOD is not set @@ -81,6 +78,7 @@ CONFIG_STOP_MACHINE=y # CONFIG_LBD=y # CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_LSF is not set # # IO Schedulers @@ -105,6 +103,7 @@ CONFIG_X86_PC=y CONFIG_GENERIC_CPU=y CONFIG_X86_L1_CACHE_BYTES=128 CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_INTERNODE_CACHE_BYTES=128 CONFIG_X86_TSC=y CONFIG_X86_GOOD_APIC=y # CONFIG_MICROCODE is not set @@ -116,12 +115,14 @@ CONFIG_X86_LOCAL_APIC=y CONFIG_MTRR=y CONFIG_SMP=y CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y # CONFIG_PREEMPT_NONE is not set CONFIG_PREEMPT_VOLUNTARY=y # CONFIG_PREEMPT is not set CONFIG_PREEMPT_BKL=y CONFIG_NUMA=y CONFIG_K8_NUMA=y +CONFIG_NODES_SHIFT=6 CONFIG_X86_64_ACPI_NUMA=y CONFIG_NUMA_EMU=y CONFIG_ARCH_DISCONTIGMEM_ENABLE=y @@ -138,6 +139,7 @@ CONFIG_NEED_MULTIPLE_NODES=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y +CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y CONFIG_NR_CPUS=32 CONFIG_HOTPLUG_CPU=y CONFIG_HPET_TIMER=y @@ -289,6 +291,7 @@ CONFIG_IP_PNP_DHCP=y # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set +# CONFIG_INET_XFRM_TUNNEL is not set # CONFIG_INET_TUNNEL is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y @@ -300,6 +303,7 @@ CONFIG_IPV6=y # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set +# CONFIG_INET6_XFRM_TUNNEL is not set # 
CONFIG_INET6_TUNNEL is not set # CONFIG_IPV6_TUNNEL is not set # CONFIG_NETFILTER is not set @@ -542,7 +546,6 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_SYM53C8XX_2 is not set # CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_QLOGIC_FC is not set # CONFIG_SCSI_QLOGIC_1280 is not set # CONFIG_SCSI_QLA_FC is not set # CONFIG_SCSI_LPFC is not set @@ -704,7 +707,6 @@ CONFIG_S2IO=m # Wireless LAN (non-hamradio) # # CONFIG_NET_RADIO is not set -# CONFIG_NET_WIRELESS_RTNETLINK is not set # # Wan interfaces @@ -791,7 +793,7 @@ CONFIG_HW_CONSOLE=y # CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y -# CONFIG_SERIAL_8250_ACPI is not set +CONFIG_SERIAL_8250_PCI=y CONFIG_SERIAL_8250_NR_UARTS=4 CONFIG_SERIAL_8250_RUNTIME_UARTS=4 # CONFIG_SERIAL_8250_EXTENDED is not set @@ -921,6 +923,7 @@ CONFIG_HWMON=y # Digital Video Broadcasting Devices # # CONFIG_DVB is not set +# CONFIG_USB_DABUSB is not set # # Graphics support @@ -932,6 +935,8 @@ CONFIG_VIDEO_SELECT=y # Console display driver support # CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256 CONFIG_DUMMY_CONSOLE=y # @@ -1041,9 +1046,7 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_ACECAD is not set # CONFIG_USB_KBTAB is not set # CONFIG_USB_POWERMATE is not set -# CONFIG_USB_MTOUCH is not set -# CONFIG_USB_ITMTOUCH is not set -# CONFIG_USB_EGALAX is not set +# CONFIG_USB_TOUCHSCREEN is not set # CONFIG_USB_YEALINK is not set # CONFIG_USB_XPAD is not set # CONFIG_USB_ATI_REMOTE is not set @@ -1058,15 +1061,6 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_MICROTEK is not set # -# USB Multimedia devices -# -# CONFIG_USB_DABUSB is not set - -# -# Video4Linux support is needed for USB Multimedia device support -# - -# # USB Network Adapters # # CONFIG_USB_CATC is not set @@ -1118,9 +1112,23 @@ CONFIG_USB_MON=y # CONFIG_MMC is not set # +# LED devices +# +# CONFIG_NEW_LEDS is not set + +# +# LED drivers +# + +# +# LED Triggers +# + +# # InfiniBand support # # CONFIG_INFINIBAND is not set +# CONFIG_IPATH_CORE is not set # # EDAC - error detection and reporting (RAS) (EXPERIMENTAL) @@ -1128,6 +1136,11 @@ CONFIG_USB_MON=y # CONFIG_EDAC is not set # +# Real Time Clock +# +# CONFIG_RTC_CLASS is not set + +# # Firmware Drivers # # CONFIG_EDD is not set diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile index 929e6b0771f8..e9263b4975e0 100644 --- a/arch/x86_64/ia32/Makefile +++ b/arch/x86_64/ia32/Makefile @@ -27,5 +27,5 @@ $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \ $(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE $(call if_changed,syscall) -AFLAGS_vsyscall-sysenter.o = -m32 -AFLAGS_vsyscall-syscall.o = -m32 +AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 +AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index e776139afb20..926c4743d13b 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -339,7 +339,7 @@ int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, struct mm_struct *mm = current->mm; int i, ret; - stack_base = IA32_STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE; + stack_base = stack_top - MAX_ARG_PAGES * PAGE_SIZE; mm->arg_start = bprm->p + stack_base; bprm->p += stack_base; @@ -357,7 +357,7 @@ int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, { mpnt->vm_mm = mm; mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; - mpnt->vm_end = IA32_STACK_TOP; + mpnt->vm_end = stack_top; if (executable_stack == 
EXSTACK_ENABLE_X) mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 7549a4389fbf..5a92fed2d1d5 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -15,6 +15,8 @@ #include <asm/vsyscall32.h> #include <linux/linkage.h> +#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) + .macro IA32_ARG_FIXUP noebp=0 movl %edi,%r8d .if \noebp @@ -109,8 +111,8 @@ ENTRY(ia32_sysenter_target) CFI_REMEMBER_STATE jnz sysenter_tracesys sysenter_do_call: - cmpl $(IA32_NR_syscalls),%eax - jae ia32_badsys + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys IA32_ARG_FIXUP 1 call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -210,8 +212,8 @@ ENTRY(ia32_cstar_target) CFI_REMEMBER_STATE jnz cstar_tracesys cstar_do_call: - cmpl $IA32_NR_syscalls,%eax - jae ia32_badsys + cmpl $IA32_NR_syscalls-1,%eax + ja ia32_badsys IA32_ARG_FIXUP 1 call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -296,8 +298,8 @@ ENTRY(ia32_syscall) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) jnz ia32_tracesys ia32_do_syscall: - cmpl $(IA32_NR_syscalls),%eax - jae ia32_badsys + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys IA32_ARG_FIXUP call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: @@ -685,10 +687,13 @@ ia32_sys_call_table: .quad sys_readlinkat /* 305 */ .quad sys_fchmodat .quad sys_faccessat - .quad sys_ni_syscall /* pselect6 for now */ - .quad sys_ni_syscall /* ppoll for now */ + .quad quiet_ni_syscall /* pselect6 for now */ + .quad quiet_ni_syscall /* ppoll for now */ .quad sys_unshare /* 310 */ + .quad compat_sys_set_robust_list + .quad compat_sys_get_robust_list + .quad sys_splice + .quad sys_sync_file_range + .quad sys_tee + .quad compat_sys_vmsplice ia32_syscall_end: - .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 - .quad ni_syscall - .endr diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S index d90321fe9bba..1384367cdbe1 100644 --- a/arch/x86_64/ia32/vsyscall-sigreturn.S +++ b/arch/x86_64/ia32/vsyscall-sigreturn.S @@ -32,9 +32,28 @@ __kernel_rt_sigreturn: .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn .section .eh_frame,"a",@progbits +.LSTARTFRAMES: + .long .LENDCIES-.LSTARTCIES +.LSTARTCIES: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zRS" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIES: + .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */ .LSTARTFDE2: - .long .LSTARTFDE2-.LSTARTFRAME /* CIE pointer */ + .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */ /* HACK: The dwarf2 unwind routines will subtract 1 from the return address to get an address in the middle of the presumed call instruction. Since we didn't get here via @@ -97,7 +116,7 @@ __kernel_rt_sigreturn: .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */ .LSTARTFDE3: - .long .LSTARTFDE3-.LSTARTFRAME /* CIE pointer */ + .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */ /* HACK: See above wrt unwind library assumptions. */ .long .LSTART_rt_sigreturn-1-. 
/* PC-relative start address */ .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index a098a11e7755..059c88313f4e 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - dmi_scan.o pci-dma.o pci-nommu.o + pci-dma.o pci-nommu.o obj-$(CONFIG_X86_MCE) += mce.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o @@ -49,5 +49,3 @@ intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o -dmi_scan-y += ../../i386/kernel/dmi_scan.o - diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index fffd6b0a2fab..70b9d21ed675 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -80,7 +80,7 @@ static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) printk("Aperture from %s beyond 4GB. Ignoring.\n",name); return 0; } - if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) { + if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); return 0; } diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index d54620147e8e..100a30c40044 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -615,7 +615,7 @@ static int __init apic_set_verbosity(char *str) printk(KERN_WARNING "APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug", str); - return 0; + return 1; } __setup("apic=", apic_set_verbosity); @@ -1137,35 +1137,35 @@ int __init APIC_init_uniprocessor (void) static __init int setup_disableapic(char *str) { disable_apic = 1; - return 0; + return 1; } static __init int setup_nolapic(char *str) { disable_apic = 1; - return 0; + return 1; } static __init int setup_noapictimer(char *str) { if (str[0] != ' ' && str[0] != 0) - return -1; + return 0; disable_apic_timer = 1; - return 0; + return 1; } static __init int setup_apicmaintimer(char *str) { apic_runs_main_timer = 1; nohpet = 1; - return 0; + return 1; } __setup("apicmaintimer", setup_apicmaintimer); static __init int setup_noapicmaintimer(char *str) { apic_runs_main_timer = -1; - return 0; + return 1; } __setup("noapicmaintimer", setup_noapicmaintimer); diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 293cd71a266a..1ef6028f721e 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -76,11 +76,22 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) *addrp = __pa_symbol(&_end); return 1; } + + if (last >= ebda_addr && addr < ebda_addr + ebda_size) { + *addrp = ebda_addr + ebda_size; + return 1; + } + /* XXX ramdisk image here? */ return 0; } -int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) +/* + * This function checks if any part of the range <start,end> is mapped + * with type. + */ +int __meminit +e820_any_mapped(unsigned long start, unsigned long end, unsigned type) { int i; for (i = 0; i < e820.nr_map; i++) { @@ -94,6 +105,35 @@ int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) return 0; } +/* + * This function checks if the entire range <start,end> is mapped with type. 
+ * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + + /* if the region is at the beginning of <start,end> we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + /* * Find a free area in a specific range. */ @@ -109,7 +149,7 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi addr = start; if (addr > ei->addr + ei->size) continue; - while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size) + while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) ; last = addr + size; if (last > ei->addr + ei->size) diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 13af920b6594..b93ef5b51980 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -221,7 +221,7 @@ int __init setup_early_printk(char *opt) char buf[256]; if (early_console_initialized) - return -1; + return 1; strlcpy(buf,opt,sizeof(buf)); space = strchr(buf, ' '); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 8538bfea30e6..586b34c00c48 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -180,6 +180,10 @@ rff_trace: * * XXX if we had a free scratch register we could save the RSP into the stack frame * and report it properly in ps. Unfortunately we haven't. + * + * When user can change the frames always force IRET. That is because + * it deals with uncanonical addresses better. SYSRET has trouble + * with them due to bugs in both AMD and Intel CPUs. */ ENTRY(system_call) @@ -254,7 +258,10 @@ sysret_signal: xorl %esi,%esi # oldset -> arg2 call ptregscall_common 1: movl $_TIF_NEED_RESCHED,%edi - jmp sysret_check + /* Use IRET because user could have changed frame. This + works because ptregscall_common has called FIXUP_TOP_OF_STACK. 
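The e820_all_mapped() walk added above relies on the table being sorted and non-overlapping: it slides start forward through every region that covers it and declares full coverage once start reaches end. A standalone sketch of the same scan (struct range and its layout are invented for illustration):

struct range { unsigned long start, end; };	/* [start, end) */

static int all_mapped(const struct range *map, int n,
		      unsigned long start, unsigned long end)
{
	for (int i = 0; i < n; i++) {
		if (map[i].end <= start || map[i].start >= end)
			continue;		/* no overlap with <start,end> */
		if (map[i].start <= start)
			start = map[i].end;	/* covered up to here */
		if (start >= end)
			return 1;		/* full coverage */
	}
	return 0;
}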
*/ + cli + jmp int_with_check badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -274,13 +281,9 @@ tracesys: ja 1f movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) -1: SAVE_REST - movq %rsp,%rdi - call syscall_trace_leave - RESTORE_TOP_OF_STACK %rbx - RESTORE_REST - jmp ret_from_sys_call +1: movq %rax,RAX-ARGOFFSET(%rsp) + /* Use IRET because user could have changed frame */ + jmp int_ret_from_sys_call CFI_ENDPROC /* @@ -408,25 +411,9 @@ ENTRY(stub_execve) CFI_ADJUST_CFA_OFFSET -8 CFI_REGISTER rip, r11 SAVE_REST - movq %r11, %r15 - CFI_REGISTER rip, r15 FIXUP_TOP_OF_STACK %r11 call sys_execve - GET_THREAD_INFO(%rcx) - bt $TIF_IA32,threadinfo_flags(%rcx) - CFI_REMEMBER_STATE - jc exec_32bit RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - CFI_REGISTER rip, r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET rip, 0 - ret - -exec_32bit: - CFI_RESTORE_STATE movq %rax,RAX(%rsp) RESTORE_REST jmp int_ret_from_sys_call diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 77b4c608cca0..9cc7031b7151 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -271,6 +271,18 @@ __setup("enable_8254_timer", setup_enable_8254_timer); #include <linux/pci_ids.h> #include <linux/pci.h> + +#ifdef CONFIG_ACPI + +static int nvidia_hpet_detected __initdata; + +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) +{ + nvidia_hpet_detected = 1; + return 0; +} +#endif + /* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC off. Check for an Nvidia or VIA PCI bridge and turn it off. Use pci direct infrastructure because this runs before the PCI subsystem. @@ -317,11 +329,19 @@ void __init check_ioapic(void) return; case PCI_VENDOR_ID_NVIDIA: #ifdef CONFIG_ACPI - /* All timer overrides on Nvidia - seem to be wrong. Skip them. */ - acpi_skip_timer_override = 1; - printk(KERN_INFO - "Nvidia board detected. Ignoring ACPI timer override.\n"); + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + */ + nvidia_hpet_detected = 0; + acpi_table_parse(ACPI_HPET, + nvidia_hpet_check); + if (nvidia_hpet_detected == 0) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + } #endif /* RED-PEN skip them on mptables too? */ return; @@ -1777,6 +1797,8 @@ static inline void unlock_ExtINT_logic(void) spin_unlock_irqrestore(&ioapic_lock, flags); } +int timer_uses_ioapic_pin_0; + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -1814,6 +1836,9 @@ static inline void check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; + if (pin1 == 0) + timer_uses_ioapic_pin_0 = 1; + apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index accbff3fec49..fa1d19ca700a 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -53,7 +53,7 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); /* * returns non-zero if opcode modifies the interrupt flag. 
*/ -static inline int is_IF_modifier(kprobe_opcode_t *insn) +static __always_inline int is_IF_modifier(kprobe_opcode_t *insn) { switch (*insn) { case 0xfa: /* cli */ @@ -84,7 +84,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) * If it does, return the address of the 32-bit displacement word. * If not, return null. */ -static inline s32 *is_riprel(u8 *insn) +static s32 __kprobes *is_riprel(u8 *insn) { #define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -229,7 +229,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) mutex_unlock(&kprobe_mutex); } -static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) { kcb->prev_kprobe.kp = kprobe_running(); kcb->prev_kprobe.status = kcb->kprobe_status; @@ -237,7 +237,7 @@ static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb) kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; } -static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) { __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; kcb->kprobe_status = kcb->prev_kprobe.status; @@ -245,7 +245,7 @@ static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb) kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; } -static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs, +static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { __get_cpu_var(current_kprobe) = p; @@ -514,13 +514,13 @@ static void __kprobes resume_execution(struct kprobe *p, *tos = orig_rip + (*tos - copy_rip); break; case 0xff: - if ((*insn & 0x30) == 0x10) { + if ((insn[1] & 0x30) == 0x10) { /* call absolute, indirect */ /* Fix return addr; rip is correct. */ next_rip = regs->rip; *tos = orig_rip + (*tos - copy_rip); - } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */ + } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ + ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ /* rip is correct. */ next_rip = regs->rip; } diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 04282ef9fbd4..c69fc43cee7b 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -29,6 +29,8 @@ #define MISC_MCELOG_MINOR 227 #define NR_BANKS 6 +atomic_t mce_entry; + static int mce_dont_init; /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, @@ -172,10 +174,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) int i; int panicm_found = 0; + atomic_inc(&mce_entry); + if (regs) notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL); if (!banks) - return; + goto out2; memset(&m, 0, sizeof(struct mce)); m.cpu = safe_smp_processor_id(); @@ -266,6 +270,8 @@ void do_machine_check(struct pt_regs * regs, long error_code) out: /* Last thing done in the machine check exception to clear state. */ wrmsrl(MSR_IA32_MCG_STATUS, 0); + out2: + atomic_dec(&mce_entry); } /* @@ -501,7 +507,7 @@ static struct miscdevice mce_log_device = { static int __init mcheck_disable(char *str) { mce_dont_init = 1; - return 0; + return 1; } /* mce=off disables machine check. 
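The new mce_entry counter above is held for the whole lifetime of do_machine_check() so that other NMI-context code (the nmi.c hunk further down) can tell a machine check is in flight. The pattern in miniature, using C11 atomics in place of the kernel's atomic_t:

#include <stdatomic.h>

static atomic_int mce_entry;

void machine_check(void)
{
	atomic_fetch_add(&mce_entry, 1);
	/* ... bank scanning, logging, possible panic ... */
	atomic_fetch_sub(&mce_entry, 1);
}

/* Watchdog side: a live handler counts as CPU activity, so the
 * NMI watchdog must not declare this CPU stuck. */
int mce_in_progress(void)
{
	return atomic_load(&mce_entry) > 0;
}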
Note you can reenable it later @@ -521,7 +527,7 @@ static int __init mcheck_enable(char *str) get_option(&str, &tolerant); else printk("mce= argument %s ignored. Please use /sys", str); - return 0; + return 1; } __setup("nomce", mcheck_disable); @@ -623,7 +629,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) #endif /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static __cpuinit int +static int mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index d3ad7d81266d..d13b241ad094 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -482,7 +482,7 @@ static void threshold_remove_device(unsigned int cpu) #endif /* get notified when a cpu comes on/off */ -static __cpuinit int threshold_cpu_callback(struct notifier_block *nfb, +static int threshold_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { /* cpu was unsigned int to begin with */ diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index b17cf3eba359..083da7e606b1 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -968,7 +968,17 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) */ int irq = gsi; if (gsi < MAX_GSI_NUM) { - if (gsi > 15) + /* + * Retain the VIA chipset work-around (gsi > 15), but + * avoid a problem where the 8254 timer (IRQ0) is setup + * via an override (so it's not on pin 0 of the ioapic), + * and at the same time, the pin 0 interrupt is a PCI + * type. The gsi > 15 test could cause these two pins + * to be shared as IRQ0, and they are not shareable. + * So test for this condition, and if necessary, avoid + * the pin collision. + */ + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) gsi = pci_irq++; /* * Don't assign IRQ used by ACPI SCI diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index d9e4067faf05..4e6357fe0ec3 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -34,6 +34,7 @@ #include <asm/proto.h> #include <asm/kdebug.h> #include <asm/local.h> +#include <asm/mce.h> /* * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: @@ -480,6 +481,12 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) __get_cpu_var(nmi_touch) = 0; touched = 1; } +#ifdef CONFIG_X86_MCE + /* Could check oops_in_progress here too, but it's safer + not too */ + if (atomic_read(&mce_entry) > 0) + touched = 1; +#endif if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 03c9eeedb0f3..a9275c9557cf 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -48,10 +48,16 @@ dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) { struct page *page; int node; +#ifdef CONFIG_PCI if (dev->bus == &pci_bus_type) node = pcibus_to_node(to_pci_dev(dev)->bus); else +#endif node = numa_node_id(); + + if (node < first_node(node_online_map)) + node = first_node(node_online_map); + page = alloc_pages_node(node, gfp, order); return page ? 
page_address(page) : NULL; } diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index a6c01e121266..82a7c9bfdfa0 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -112,10 +112,6 @@ static unsigned long alloc_iommu(int size) static void free_iommu(unsigned long offset, int size) { unsigned long flags; - if (size == 1) { - clear_bit(offset, iommu_gart_bitmap); - return; - } spin_lock_irqsave(&iommu_bitmap_lock, flags); __clear_bit_string(iommu_gart_bitmap, offset, size); spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -635,14 +631,20 @@ static int __init pci_iommu_init(void) printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); if (end_pfn > MAX_DMA32_PFN) { printk(KERN_ERR "WARNING more than 4GB of memory " - "but IOMMU not compiled in.\n" - KERN_ERR "WARNING 32bit PCI may malfunction.\n" - KERN_ERR "You might want to enable " - "CONFIG_GART_IOMMU\n"); + "but IOMMU not available.\n" + KERN_ERR "WARNING 32bit PCI may malfunction.\n"); } return -1; } + i = 0; + for_all_nb(dev) + i++; + if (i > MAX_NB) { + printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). Disabled\n", i); + return -1; + } + printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); aper_size = info.aper_size * 1024 * 1024; iommu_size = check_iommu_size(info.aper_base, aper_size); diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 44adcc2d5e5b..1f6ecc62061d 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -12,9 +12,10 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { if (hwdev && bus + size > *hwdev->dma_mask) { - printk(KERN_ERR - "nommu_%s: overflow %Lx+%lu of device mask %Lx\n", - name, (long long)bus, size, (long long)*hwdev->dma_mask); + if (*hwdev->dma_mask >= 0xffffffffULL) + printk(KERN_ERR + "nommu_%s: overflow %Lx+%lu of device mask %Lx\n", + name, (long long)bus, size, (long long)*hwdev->dma_mask); return 0; } return 1; diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c index ee5ee4891f3d..bf421ed26808 100644 --- a/arch/x86_64/kernel/pmtimer.c +++ b/arch/x86_64/kernel/pmtimer.c @@ -68,7 +68,7 @@ int pmtimer_mark_offset(void) offset_delay = delta % (USEC_PER_SEC / HZ); rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * cpu_khz; + vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000; /* don't calculate delay for first run, or if we've got less then a tick */ @@ -121,7 +121,7 @@ unsigned int do_gettimeoffset_pm(void) static int __init nopmtimer_setup(char *s) { pmtmr_ioport = 0; - return 0; + return 1; } __setup("nopmtimer", nopmtimer_setup); diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 0370720515f1..fb903e65e079 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -66,24 +66,17 @@ EXPORT_SYMBOL(boot_option_idle_override); void (*pm_idle)(void); static DEFINE_PER_CPU(unsigned int, cpu_idle_state); -static struct notifier_block *idle_notifier; -static DEFINE_SPINLOCK(idle_notifier_lock); +static ATOMIC_NOTIFIER_HEAD(idle_notifier); void idle_notifier_register(struct notifier_block *n) { - unsigned long flags; - spin_lock_irqsave(&idle_notifier_lock, flags); - notifier_chain_register(&idle_notifier, n); - spin_unlock_irqrestore(&idle_notifier_lock, flags); + atomic_notifier_chain_register(&idle_notifier, n); } EXPORT_SYMBOL_GPL(idle_notifier_register); void idle_notifier_unregister(struct notifier_block *n) { - unsigned long flags; - spin_lock_irqsave(&idle_notifier_lock, 
flags); - notifier_chain_unregister(&idle_notifier, n); - spin_unlock_irqrestore(&idle_notifier_lock, flags); + atomic_notifier_chain_unregister(&idle_notifier, n); } EXPORT_SYMBOL(idle_notifier_unregister); @@ -93,13 +86,13 @@ static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; void enter_idle(void) { __get_cpu_var(idle_state) = CPU_IDLE; - notifier_call_chain(&idle_notifier, IDLE_START, NULL); + atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } static void __exit_idle(void) { __get_cpu_var(idle_state) = CPU_NOT_IDLE; - notifier_call_chain(&idle_notifier, IDLE_END, NULL); + atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); } /* Called from interrupts to signify idle end */ @@ -582,8 +575,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) prev->userrsp = read_pda(oldrsp); write_pda(oldrsp, next->userrsp); write_pda(pcurrent, next_p); + /* This must be here to ensure both math_state_restore() and - kernel_fpu_begin() work consistently. */ + kernel_fpu_begin() work consistently. + And the AMD workaround requires it to be after DS reload. */ unlazy_fpu(prev_p); write_pda(kernelstack, task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); @@ -788,10 +783,16 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } case ARCH_GET_GS: { unsigned long base; + unsigned gsindex; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); - else if (doit) - rdmsrl(MSR_KERNEL_GS_BASE, base); + else if (doit) { + asm("movl %%gs,%0" : "=r" (gsindex)); + if (gsindex) + rdmsrl(MSR_KERNEL_GS_BASE, base); + else + base = task->thread.gs; + } else base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index d44b2c1e63a6..2d50024c9f30 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -274,11 +274,6 @@ static int putreg(struct task_struct *child, return -EIO; value &= 0xffff; break; - case offsetof(struct user_regs_struct, rip): - /* Check if the new RIP address is canonical */ - if (value >= TASK_SIZE_OF(child)) - return -EIO; - break; } put_stack_long(child, regno - sizeof(struct pt_regs), value); return 0; @@ -605,12 +600,12 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) if (unlikely(current->audit_context)) { if (test_thread_flag(TIF_IA32)) { - audit_syscall_entry(current, AUDIT_ARCH_I386, + audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_rax, regs->rbx, regs->rcx, regs->rdx, regs->rsi); } else { - audit_syscall_entry(current, AUDIT_ARCH_X86_64, + audit_syscall_entry(AUDIT_ARCH_X86_64, regs->orig_rax, regs->rdi, regs->rsi, regs->rdx, regs->r10); @@ -621,7 +616,7 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) asmlinkage void syscall_trace_leave(struct pt_regs *regs) { if (unlikely(current->audit_context)) - audit_syscall_exit(current, AUDITSC_RESULT(regs->rax), regs->rax); + audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); if ((test_thread_flag(TIF_SYSCALL_TRACE) || test_thread_flag(TIF_SINGLESTEP)) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index a57eec8311a7..655b9192eeb3 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -353,8 +353,10 @@ static __init void parse_cmdline_early (char ** cmdline_p) if (fullarg(from, "enable_timer_pin_1")) disable_timer_pin_1 = -1; - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) + if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) { + 
clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); disable_apic = 1; + } if (fullarg(from, "noapic")) skip_ioapic_setup = 1; @@ -540,7 +542,7 @@ void __init alternative_instructions(void) static int __init noreplacement_setup(char *s) { no_replacement = 1; - return 0; + return 1; } __setup("noreplacement", noreplacement_setup); @@ -569,17 +571,28 @@ static inline void copy_edd(void) #endif #define EBDA_ADDR_POINTER 0x40E -static void __init reserve_ebda_region(void) + +unsigned __initdata ebda_addr; +unsigned __initdata ebda_size; + +static void discover_ebda(void) { - unsigned int addr; - /** + /* * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E */ - addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER); - addr <<= 4; - if (addr) - reserve_bootmem_generic(addr, PAGE_SIZE); + ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; + ebda_addr <<= 4; + + ebda_size = *(unsigned short *)(unsigned long)ebda_addr; + + /* Round EBDA up to pages */ + if (ebda_size == 0) + ebda_size = 1; + ebda_size <<= 10; + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); + if (ebda_size > 64*1024) + ebda_size = 64*1024; } void __init setup_arch(char **cmdline_p) @@ -625,6 +638,8 @@ void __init setup_arch(char **cmdline_p) check_efer(); + discover_ebda(); + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); dmi_scan_machine(); @@ -667,7 +682,8 @@ void __init setup_arch(char **cmdline_p) reserve_bootmem_generic(0, PAGE_SIZE); /* reserve ebda region */ - reserve_ebda_region(); + if (ebda_addr) + reserve_bootmem_generic(ebda_addr, ebda_size); #ifdef CONFIG_SMP /* @@ -928,6 +944,10 @@ static int __init init_amd(struct cpuinfo_x86 *c) if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + /* Enable workaround for FXSAVE leak */ + if (c->x86 >= 6) + set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); + r = get_model_name(c); if (!r) { switch (c->x86) { @@ -962,7 +982,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) cpuid(1, &eax, &ebx, &ecx, &edx); - c->apicid = phys_pkg_id(0); if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; @@ -1032,7 +1051,7 @@ static void srat_detect_node(void) for now. */ node = apicid_to_node[hard_smp_processor_id()]; if (node == NUMA_NO_NODE) - node = 0; + node = first_node(node_online_map); numa_set_node(cpu, node); if (acpi_numa > 0) @@ -1171,6 +1190,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } + c->apicid = phys_pkg_id(0); + /* * Vendor-specific initialization. 
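discover_ebda() above turns the BIOS's two real-mode words (segment at 0x40E, size in KiB in the EBDA's first word) into a page-rounded reservation. The arithmetic in isolation; PAGE_SIZE and round_up() are redefined here only so the sketch compiles on its own:

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

unsigned long ebda_reserve_bytes(unsigned long ebda_addr,
				 unsigned long size_in_kb)
{
	unsigned long size = size_in_kb ? size_in_kb : 1; /* 0 => assume 1K */
	size <<= 10;					  /* KiB -> bytes */
	/* include the partial page before ebda_addr, round to pages */
	size = round_up(size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
	if (size > 64 * 1024)		/* the EBDA is at most 64K */
		size = 64 * 1024;
	return size;
}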
In this section we * canonicalize the feature flags, meaning if there are @@ -1419,3 +1440,22 @@ struct seq_operations cpuinfo_op = { .show = show_cpuinfo, }; +#ifdef CONFIG_INPUT_PCSPKR +#include <linux/platform_device.h> +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + int ret; + + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); +#endif diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index eabdb63fec31..8a691fa6d393 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -55,7 +55,7 @@ int __init nonx_setup(char *str) do_not_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } - return 0; + return 1; } __setup("noexec=", nonx_setup); /* parsed early actually */ @@ -74,7 +74,7 @@ static int __init nonx32_setup(char *str) force_personality32 &= ~READ_IMPLIES_EXEC; else if (!strcmp(str, "off")) force_personality32 |= READ_IMPLIES_EXEC; - return 0; + return 1; } __setup("noexec32=", nonx32_setup); diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 66e98659d077..71a7222cf9ce 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; /* core ID of each logical CPU */ u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +/* Last level cache ID of each logical CPU */ +u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; + /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -350,7 +353,7 @@ static void __cpuinit tsc_sync_wait(void) static __init int notscsync_setup(char *s) { notscsync = 1; - return 0; + return 1; } __setup("notscsync", notscsync_setup); @@ -445,6 +448,18 @@ void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +/* maps the cpu to the sched domain representing multi-core */ +cpumask_t cpu_coregroup_map(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + /* + * For perf, we return last level cache shared map. 
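cpu_coregroup_map() above and the llc_shared_map updates below it group CPUs by last-level-cache ID for the new SCHED_MC scheduling domain. A toy version of the grouping; NR_CPUS and the sample IDs are made up for illustration:

#define NR_CPUS 4

static unsigned char cpu_llc_id[NR_CPUS] = { 0, 0, 1, 1 };
static unsigned long llc_shared_map[NR_CPUS]; /* bit i: shares LLC with i */

void set_llc_sibling_map(int cpu)
{
	for (int i = 0; i < NR_CPUS; i++)
		if (cpu_llc_id[cpu] == cpu_llc_id[i]) {
			llc_shared_map[cpu] |= 1UL << i;
			llc_shared_map[i]   |= 1UL << cpu;
		}
}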
+ * TBD: when power saving sched policy is added, we will return + * cpu_core_map when power saving policy is enabled + */ + return c->llc_shared_map; +} + /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; @@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); } } } else { cpu_set(cpu, cpu_sibling_map[cpu]); } + cpu_set(cpu, c[cpu].llc_shared_map); + if (current_cpu_data.x86_max_cores == 1) { cpu_core_map[cpu] = cpu_sibling_map[cpu]; c[cpu].booted_cores = 1; @@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu) } for_each_cpu_mask(i, cpu_sibling_setup_map) { + if (cpu_llc_id[cpu] != BAD_APICID && + cpu_llc_id[cpu] == cpu_llc_id[i]) { + cpu_set(i, c[cpu].llc_shared_map); + cpu_set(cpu, c[i].llc_shared_map); + } if (phys_proc_id[cpu] == phys_proc_id[i]) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7f58fa682491..7392570f975d 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -504,42 +504,25 @@ unsigned long long sched_clock(void) static unsigned long get_cmos_time(void) { - unsigned int timeout = 1000000, year, mon, day, hour, min, sec; - unsigned char uip = 0, this = 0; + unsigned int year, mon, day, hour, min, sec; unsigned long flags; unsigned extyear = 0; -/* - * The Linux interpretation of the CMOS clock register contents: When the - * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the - * second which has precisely just started. Waiting for this can take up to 1 - * second, we timeout approximately after 2.4 seconds on a machine with - * standard 8.3 MHz ISA bus. - */ - spin_lock_irqsave(&rtc_lock, flags); - while (timeout && (!uip || this)) { - uip |= this; - this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP; - timeout--; - } - - /* - * Here we are safe to assume the registers won't change for a whole - * second, so we just go ahead and read them. 
- */ - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); - + do { + sec = CMOS_READ(RTC_SECONDS); + min = CMOS_READ(RTC_MINUTES); + hour = CMOS_READ(RTC_HOURS); + day = CMOS_READ(RTC_DAY_OF_MONTH); + mon = CMOS_READ(RTC_MONTH); + year = CMOS_READ(RTC_YEAR); #ifdef CONFIG_ACPI - if (acpi_fadt.revision >= FADT2_REVISION_ID && acpi_fadt.century) - extyear = CMOS_READ(acpi_fadt.century); + if (acpi_fadt.revision >= FADT2_REVISION_ID && + acpi_fadt.century) + extyear = CMOS_READ(acpi_fadt.century); #endif + } while (sec != CMOS_READ(RTC_SECONDS)); spin_unlock_irqrestore(&rtc_lock, flags); @@ -743,7 +726,7 @@ static __init int late_hpet_init(void) unsigned int ntimer; if (!vxtime.hpet_address) - return -1; + return 0; memset(&hd, 0, sizeof (hd)); @@ -934,6 +917,8 @@ void __init time_init(void) vxtime.hpet_address = 0; if (hpet_use_timer) { + /* set tick_nsec to use the proper rate for HPET */ + tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; #ifdef CONFIG_X86_PM_TIMER @@ -1323,7 +1308,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) static int __init nohpet_setup(char *s) { nohpet = 1; - return 0; + return 1; } __setup("nohpet", nohpet_setup); @@ -1331,7 +1316,7 @@ __setup("nohpet", nohpet_setup); int __init notsc_setup(char *s) { notsc = 1; - return 0; + return 1; } __setup("notsc", notsc_setup); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 7b148309c529..cea335e8746c 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -30,6 +30,7 @@ #include <linux/moduleparam.h> #include <linux/nmi.h> #include <linux/kprobes.h> +#include <linux/kexec.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -69,20 +70,20 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); -struct notifier_block *die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); +ATOMIC_NOTIFIER_HEAD(die_chain); int register_die_notifier(struct notifier_block *nb) { - int err = 0; - unsigned long flags; - vmalloc_sync_all(); - spin_lock_irqsave(&die_notifier_lock, flags); - err = notifier_chain_register(&die_chain, nb); - spin_unlock_irqrestore(&die_notifier_lock, flags); - return err; + return atomic_notifier_chain_register(&die_chain, nb); +} +EXPORT_SYMBOL(register_die_notifier); + +int unregister_die_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&die_chain, nb); } +EXPORT_SYMBOL(unregister_die_notifier); static inline void conditional_sti(struct pt_regs *regs) { @@ -101,6 +102,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->eflags & X86_EFLAGS_IF) local_irq_disable(); + /* Make sure to not schedule here because we could be running + on an exception stack. 
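The rewritten get_cmos_time() above drops the UIP polling in favour of a retry loop: read every register, then re-read the seconds; if it changed, an update raced the reads and the whole set is fetched again. The shape of that loop, with read_rtc() as a hypothetical stand-in for CMOS_READ():

enum { RTC_SECONDS, RTC_MINUTES, RTC_HOURS };

extern unsigned int read_rtc(int reg);	/* hypothetical CMOS_READ() */

void read_rtc_consistent(unsigned int *hour, unsigned int *min,
			 unsigned int *sec)
{
	do {
		*sec  = read_rtc(RTC_SECONDS);
		*min  = read_rtc(RTC_MINUTES);
		*hour = read_rtc(RTC_HOURS);
		/* a rollover between the reads changes RTC_SECONDS, so a
		 * stable re-read means all values came from one second */
	} while (*sec != read_rtc(RTC_SECONDS));
}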
*/ preempt_enable_no_resched(); } @@ -384,6 +387,7 @@ void out_of_line_bug(void) static DEFINE_SPINLOCK(die_lock); static int die_owner = -1; +static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { @@ -398,6 +402,7 @@ unsigned __kprobes long oops_begin(void) else spin_lock(&die_lock); } + die_nest_count++; die_owner = cpu; console_verbose(); bust_spinlocks(1); @@ -408,7 +413,13 @@ void __kprobes oops_end(unsigned long flags) { die_owner = -1; bust_spinlocks(0); - spin_unlock_irqrestore(&die_lock, flags); + die_nest_count--; + if (die_nest_count) + /* We still own the lock */ + local_irq_restore(flags); + else + /* Nest count reaches zero, release the lock. */ + spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) panic("Oops"); } @@ -433,6 +444,8 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err) printk(KERN_ALERT "RIP "); printk_address(regs->rip); printk(" RSP <%016lx>\n", regs->rsp); + if (kexec_should_crash(current)) + crash_kexec(regs); } void die(const char * str, struct pt_regs * regs, long err) @@ -455,10 +468,14 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs) */ printk(str, safe_smp_processor_id()); show_registers(regs); + if (kexec_should_crash(current)) + crash_kexec(regs); if (panic_on_timeout || panic_on_oops) panic("nmi watchdog"); printk("console shuts up ...\n"); oops_end(flags); + nmi_exit(); + local_irq_enable(); do_exit(SIGSEGV); } @@ -468,8 +485,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, { struct task_struct *tsk = current; - conditional_sti(regs); - tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; @@ -506,6 +521,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, NULL); \ } @@ -520,6 +536,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, &info); \ } @@ -533,7 +550,17 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) DO_ERROR(18, SIGSEGV, "reserved", reserved) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) + +/* Runs on IST stack */ +asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) { @@ -667,8 +694,9 @@ asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { return; } + preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - return; + preempt_conditional_cli(regs); } /* Help handler running on IST stack to switch back to user stack @@ -973,14 +1001,14 @@ void __init trap_init(void) static int __init oops_dummy(char *s) { panic_on_oops = 1; - return -1; + return 1; } __setup("oops=", oops_dummy); static int __init kstack_setup(char *s) { kstack_depth_to_print = simple_strtoul(s,NULL,0); - 
return 0; + return 1; } __setup("kstack=", kstack_setup); diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 39ff0708f803..b81f473c4a19 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -65,7 +65,7 @@ SECTIONS .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { *(.data.cacheline_aligned) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index d96a9348e5a2..1def21c9f7cd 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -102,8 +102,6 @@ EXPORT_SYMBOL(cpu_callout_map); EXPORT_SYMBOL(screen_info); #endif -EXPORT_SYMBOL(get_wchan); - EXPORT_SYMBOL(rtc_lock); EXPORT_SYMBOL_GPL(set_nmi_callback); @@ -114,7 +112,6 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback); #undef memcpy #undef memset #undef memmove -#undef strlen extern void * memset(void *,int,__kernel_size_t); extern size_t strlen(const char *); @@ -123,8 +120,6 @@ extern void * memcpy(void *,const void *,__kernel_size_t); extern void * __memcpy(void *,const void *,__kernel_size_t); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 316c53de47bd..55250593d8c9 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -623,6 +623,6 @@ void vmalloc_sync_all(void) static int __init enable_pagefaulttrace(char *str) { page_fault_trace = 1; - return 0; + return 1; } __setup("pagefaulttrace", enable_pagefaulttrace); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index b04415625442..4ba34e95d835 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -72,7 +72,7 @@ void show_mem(void) show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pfn_to_page(pgdat->node_start_pfn + i); total++; @@ -305,7 +305,7 @@ static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned if (paddr >= end) break; - if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) { set_pud(pud, __pud(0)); continue; } @@ -507,9 +507,8 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) /* * Memory hotplug specific functions - * These are only for non-NUMA machines right now. */ -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE) void online_page(struct page *page) { @@ -520,6 +519,38 @@ void online_page(struct page *page) num_physpages++; } +#ifndef CONFIG_MEMORY_HOTPLUG +/* + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, + * just online the pages. 
+ */ +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) +{ + int err = -EIO; + unsigned long pfn; + unsigned long total = 0, mem = 0; + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { + if (pfn_valid(pfn)) { + online_page(pfn_to_page(pfn)); + err = 0; + mem++; + } + total++; + } + if (!err) { + z->spanned_pages += total; + z->present_pages += mem; + z->zone_pgdat->node_spanned_pages += total; + z->zone_pgdat->node_present_pages += mem; + } + return err; +} +#endif + +/* + * Memory is added always to NORMAL zone. This means you will never get + * additional DMA/DMA32 memory. + */ int add_memory(u64 start, u64 size) { struct pglist_data *pgdat = NODE_DATA(0); diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 63c72641b737..b2fac14baac0 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -100,11 +100,30 @@ int early_pfn_to_nid(unsigned long pfn) } #endif +static void * __init +early_node_mem(int nodeid, unsigned long start, unsigned long end, + unsigned long size) +{ + unsigned long mem = find_e820_area(start, end, size); + void *ptr; + if (mem != -1L) + return __va(mem); + ptr = __alloc_bootmem_nopanic(size, + SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); + if (ptr == 0) { + printk(KERN_ERR "Cannot find %lu bytes in node %d\n", + size, nodeid); + return NULL; + } + return ptr; +} + /* Initialize bootmem allocator for a node */ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; unsigned long nodedata_phys; + void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); start = round_up(start, ZONE_ALIGN); @@ -114,13 +133,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; - nodedata_phys = find_e820_area(start, end, pgdat_size); - if (nodedata_phys == -1L) - panic("Cannot find memory pgdat in node %d\n", nodeid); - - Dprintk("nodedata_phys %lx\n", nodedata_phys); + node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size); + if (node_data[nodeid] == NULL) + return; + nodedata_phys = __pa(node_data[nodeid]); - node_data[nodeid] = phys_to_virt(nodedata_phys); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; @@ -129,9 +146,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en /* Find a place for the bootmem map */ bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); - bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT); - if (bootmap_start == -1L) - panic("Not enough continuous space for bootmap on node %d", nodeid); + bootmap = early_node_mem(nodeid, bootmap_start, end, + bootmap_pages<<PAGE_SHIFT); + if (bootmap == NULL) { + if (nodedata_phys < start || nodedata_phys >= end) + free_bootmem((unsigned long)node_data[nodeid],pgdat_size); + node_data[nodeid] = NULL; + return; + } + bootmap_start = __pa(bootmap); Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); bootmap_size = init_bootmem_node(NODE_DATA(nodeid), @@ -142,6 +165,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 
bootmap_pages<<PAGE_SHIFT); +#ifdef CONFIG_ACPI_NUMA + srat_reserve_add_area(nodeid); +#endif node_set_online(nodeid); } @@ -162,11 +188,13 @@ void __init setup_node_zones(int nodeid) memory. */ memmapsize = sizeof(struct page) * (end_pfn-start_pfn); limit = end_pfn << PAGE_SHIFT; +#ifdef CONFIG_FLAT_NODE_MEM_MAP NODE_DATA(nodeid)->node_mem_map = __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, memmapsize, SMP_CACHE_BYTES, round_down(limit - memmapsize, PAGE_SIZE), limit); +#endif size_zones(zones, holes, start_pfn, end_pfn); free_area_init_node(nodeid, NODE_DATA(nodeid), zones, @@ -335,6 +363,8 @@ __init int numa_setup(char *opt) #ifdef CONFIG_ACPI_NUMA if (!strncmp(opt,"noacpi",6)) acpi_numa = -1; + if (!strncmp(opt,"hotadd=", 7)) + hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif return 1; } @@ -377,21 +407,6 @@ EXPORT_SYMBOL(node_data); * Should do that. */ -/* Requires pfn_valid(pfn) to be true */ -struct page *pfn_to_page(unsigned long pfn) -{ - int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); - return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map; -} -EXPORT_SYMBOL(pfn_to_page); - -unsigned long page_to_pfn(struct page *page) -{ - return (long)(((page) - page_zone(page)->zone_mem_map) + - page_zone(page)->zone_start_pfn); -} -EXPORT_SYMBOL(page_to_pfn); - int pfn_valid(unsigned long pfn) { unsigned nid; diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 2eb879590dc4..474df22c6ed2 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -15,15 +15,29 @@ #include <linux/bitmap.h> #include <linux/module.h> #include <linux/topology.h> +#include <linux/bootmem.h> +#include <linux/mm.h> #include <asm/proto.h> #include <asm/numa.h> #include <asm/e820.h> +#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ + defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \ + && !defined(CONFIG_MEMORY_HOTPLUG) +#define RESERVE_HOTADD 1 +#endif + static struct acpi_table_slit *acpi_slit; static nodemask_t nodes_parsed __initdata; static nodemask_t nodes_found __initdata; static struct bootnode nodes[MAX_NUMNODES] __initdata; +static struct bootnode nodes_add[MAX_NUMNODES] __initdata; +static int found_add_area __initdata; +int hotadd_percent __initdata = 0; +#ifndef RESERVE_HOTADD +#define hotadd_percent 0 /* Ignore all settings */ +#endif static u8 pxm2node[256] = { [0 ... 255] = 0xff }; /* Too small nodes confuse the VM badly. 
 
 /* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +85,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end)
 static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 {
 	struct bootnode *nd = &nodes[i];
+
+	if (found_add_area)
+		return;
+
 	if (nd->start < start) {
 		nd->start = start;
 		if (nd->end < nd->start)
@@ -88,8 +106,11 @@ static __init void bad_srat(void)
 	int i;
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
+	found_add_area = 0;
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		apicid_to_node[i] = NUMA_NO_NODE;
+	for (i = 0; i < MAX_NUMNODES; i++)
+		nodes_add[i].start = nodes_add[i].end = 0;
 }
 
 static __init inline int srat_disabled(void)
@@ -137,7 +158,8 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
 	int pxm, node;
 	if (srat_disabled())
 		return;
-	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {	bad_srat();
+	if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
+		bad_srat();
 		return;
 	}
 	if (pa->flags.enabled == 0)
@@ -155,11 +177,116 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
 	       pxm, pa->apic_id, node);
 }
 
+#ifdef RESERVE_HOTADD
+/*
+ * Protect against too large hotadd areas that would fill up memory.
+ */
+static int hotadd_enough_memory(struct bootnode *nd)
+{
+	static unsigned long allocated;
+	static unsigned long last_area_end;
+	unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
+	long mem = pages * sizeof(struct page);
+	unsigned long addr;
+	unsigned long allowed;
+	unsigned long oldpages = pages;
+
+	if (mem < 0)
+		return 0;
+	allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
+	allowed = (allowed / 100) * hotadd_percent;
+	if (allocated + mem > allowed) {
+		unsigned long range;
+		/* Give them at least part of their hotadd memory, up to
+		   hotadd_percent. It would be better to spread the limit out
+		   over multiple hotplug areas, but that is too complicated
+		   right now. */
+		if (allocated >= allowed)
+			return 0;
+		range = allowed - allocated;
+		pages = (range / PAGE_SIZE);
+		mem = pages * sizeof(struct page);
+		nd->end = nd->start + range;
+	}
+	/* Not completely foolproof, but a good sanity check */
+	addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
+	if (addr == -1UL)
+		return 0;
+	if (pages != oldpages)
+		printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
+			pages << PAGE_SHIFT);
+	last_area_end = addr + mem;
+	allocated += mem;
+	return 1;
+}
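hotadd_enough_memory() caps the struct page arrays pre-allocated for hot-add areas at hotadd_percent of usable RAM. A stand-alone rerun of that arithmetic (PAGE_SHIFT, a 56-byte struct page, and the 4 GB/64 GB figures are assumed values for the example; e820 holes are taken as zero):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE (1UL << PAGE_SHIFT)
    #define STRUCT_PAGE_SIZE 56UL   /* assumed sizeof(struct page) */

    int main(void)
    {
            unsigned long end_pfn = 4UL << (30 - PAGE_SHIFT); /* 4 GB of RAM */
            int hotadd_percent = 10;

            /* Same shape as hotadd_enough_memory(): cap the memory spent
               on struct page arrays at hotadd_percent of usable RAM
               (holes assumed zero here). */
            unsigned long allowed = (end_pfn * PAGE_SIZE / 100) * hotadd_percent;

            /* mem_map cost of a 64 GB hot-add area */
            unsigned long pages = 64UL << (30 - PAGE_SHIFT);
            unsigned long mem = pages * STRUCT_PAGE_SIZE;

            printf("budget: %lu MB, requested mem_map: %lu MB -> %s\n",
                   allowed >> 20, mem >> 20,
                   mem > allowed ? "clamp area" : "accept area");
            return 0;
    }

With these numbers the budget is about 409 MB while the requested mem_map costs about 896 MB, so the kernel would shrink the hot-add range, exactly the "Hotadd area limited" path above.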
+
+/*
+ * It is fine to add this area to the node's data; it will be used later.
+ * This code supports one contiguous hot-add area per node.
+ */
+static int reserve_hotadd(int node, unsigned long start, unsigned long end)
+{
+	unsigned long s_pfn = start >> PAGE_SHIFT;
+	unsigned long e_pfn = end >> PAGE_SHIFT;
+	int changed = 0;
+	struct bootnode *nd = &nodes_add[node];
+
+	/* I had some trouble with strange memory hotadd regions breaking
+	   the boot. Be very strict here and reject anything unexpected.
+	   If you want working memory hotadd, write correct SRATs.
+
+	   The node size check is a basic sanity check to guard against
+	   mistakes. */
+	if ((signed long)(end - start) < NODE_MIN_SIZE) {
+		printk(KERN_ERR "SRAT: Hotplug area too small\n");
+		return -1;
+	}
+
+	/* This check might be a bit too strict, but I'm keeping it for now. */
+	if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
+		printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
+		return -1;
+	}
+
+	if (!hotadd_enough_memory(&nodes_add[node])) {
+		printk(KERN_ERR "SRAT: Hotplug area too large\n");
+		return -1;
+	}
+
+	/* Looks good */
+
+	found_add_area = 1;
+	if (nd->start == nd->end) {
+		nd->start = start;
+		nd->end = end;
+		changed = 1;
+	} else {
+		if (nd->start == end) {
+			nd->start = start;
+			changed = 1;
+		}
+		if (nd->end == start) {
+			nd->end = end;
+			changed = 1;
+		}
+		if (!changed)
+			printk(KERN_ERR "SRAT: Hotplug zone not contiguous. Partly ignored\n");
+	}
+
+	if ((nd->end >> PAGE_SHIFT) > end_pfn)
+		end_pfn = nd->end >> PAGE_SHIFT;
+
+	if (changed)
+		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
+	return 0;
+}
+#endif
+
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
 void __init
 acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 {
-	struct bootnode *nd;
+	struct bootnode *nd, oldnode;
 	unsigned long start, end;
 	int node, pxm;
 	int i;
@@ -172,6 +299,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 	}
 	if (ma->flags.enabled == 0)
 		return;
+	if (ma->flags.hot_pluggable && hotadd_percent == 0)
+		return;
 	start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
 	end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
 	pxm = ma->proximity_domain;
@@ -181,10 +310,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		bad_srat();
 		return;
 	}
-	/* It is fine to add this area to the nodes data it will be used later*/
-	if (ma->flags.hot_pluggable == 1)
-		printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
-				start, end);
 	i = conflicting_nodes(start, end);
 	if (i == node) {
 		printk(KERN_WARNING
@@ -199,6 +324,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		return;
 	}
 	nd = &nodes[node];
+	oldnode = *nd;
 	if (!node_test_and_set(node, nodes_parsed)) {
 		nd->start = start;
 		nd->end = end;
@@ -208,8 +334,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
 		if (nd->end < end)
 			nd->end = end;
 	}
+
 	printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
 	       nd->start, nd->end);
+
+#ifdef RESERVE_HOTADD
+	if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
+		/* Ignore hotadd region. Undo damage */
+		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+		*nd = oldnode;
+		if ((nd->start | nd->end) == 0)
+			node_clear(node, nodes_parsed);
+	}
+#endif
 }
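reserve_hotadd() keeps at most one contiguous hot-add range per node and only grows it when a new SRAT entry is exactly adjacent to the existing one. A user-space model of that merge rule (struct range and merge_range are hypothetical names):

    #include <stdio.h>

    struct range { unsigned long start, end; };

    /* Model of the nodes_add[] merging above: accept a new [start,end)
       only into an empty slot or when it touches the current range;
       returns 1 on success, 0 when it would leave a hole. */
    static int merge_range(struct range *nd, unsigned long start,
                           unsigned long end)
    {
            if (nd->start == nd->end) {     /* empty slot: take it */
                    nd->start = start;
                    nd->end = end;
                    return 1;
            }
            if (nd->start == end) {         /* new range ends where we begin */
                    nd->start = start;
                    return 1;
            }
            if (nd->end == start) {         /* new range begins where we end */
                    nd->end = end;
                    return 1;
            }
            return 0;                       /* disjoint: rejected */
    }

    int main(void)
    {
            struct range nd = { 0, 0 };
            merge_range(&nd, 0x1000, 0x2000);
            merge_range(&nd, 0x2000, 0x3000);       /* adjacent: extends end */
            int ok = merge_range(&nd, 0x9000, 0xa000); /* disjoint: 0 */
            printf("range %#lx-%#lx, disjoint merge ok? %d\n",
                   nd.start, nd.end, ok);
            return 0;
    }

This is also why the caller snapshots oldnode before parsing: a rejected hot-add entry must not leave a half-grown node behind.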
 
 /* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +362,9 @@ static int nodes_cover_memory(void)
 		unsigned long e = nodes[i].end >> PAGE_SHIFT;
 		pxmram += e - s;
 		pxmram -= e820_hole_size(s, e);
+		pxmram -= nodes_add[i].end - nodes_add[i].start;
+		if ((long)pxmram < 0)
+			pxmram = 0;
 	}
 
 	e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,9 +398,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 
 	/* First clean up the node list */
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		cutoff_node(i, start, end);
-		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
+		cutoff_node(i, start, end);
+		if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
 			unparse_node(i);
+			node_set_offline(i);
+		}
 	}
 
 	if (acpi_numa <= 0)
@@ -282,6 +424,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	/* Finally register nodes */
 	for_each_node_mask(i, nodes_parsed)
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+	/* Try again in case setup_node_bootmem missed one due
+	   to missing bootmem */
+	for_each_node_mask(i, nodes_parsed)
+		if (!node_online(i))
+			setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_to_node[i] == NUMA_NO_NODE)
 			continue;
@@ -303,6 +451,25 @@ static int node_to_pxm(int n)
 	return 0;
 }
 
+void __init srat_reserve_add_area(int nodeid)
+{
+	if (found_add_area && nodes_add[nodeid].end) {
+		u64 total_mb;
+
+		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
+				"for node %d at %Lx-%Lx\n",
+			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
+		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
+					>> PAGE_SHIFT;
+		total_mb *= sizeof(struct page);
+		total_mb >>= 20;
+		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
+				"pre-allocated memory.\n", (unsigned long long)total_mb);
+		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
+			       nodes_add[nodeid].end - nodes_add[nodeid].start);
+	}
+}
+
 int __node_distance(int a, int b)
 {
 	int index;
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index e616500207e4..a2060e4d5de6 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -9,11 +9,16 @@
 #include <linux/init.h>
 #include <linux/acpi.h>
 #include <linux/bitmap.h>
+#include <asm/e820.h>
+
 #include "pci.h"
 
 #define MMCONFIG_APER_SIZE (256*1024*1024)
+/* Verify the first 16 buses. We assume that systems with more buses
+   get MCFG right. */
+#define MAX_CHECK_BUS 16
 
-static DECLARE_BITMAP(fallback_slots, 32);
+static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
 
 /* Static virtual mapping of the MMCONFIG aperture */
 struct mmcfg_virt {
@@ -55,7 +60,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus)
 static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
 {
 	char __iomem *addr;
-	if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), fallback_slots))
+	if (seg == 0 && bus < MAX_CHECK_BUS &&
+		test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
 		return NULL;
 	addr = get_virt(seg, bus);
 	if (!addr)
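The widened fallback_slots bitmap above indexes one bit per (bus, slot) pair as 32*bus + slot, covering the first MAX_CHECK_BUS buses instead of only bus 0. A stand-alone model of that indexing (set_fallback/is_fallback are hypothetical stand-ins for the kernel's set_bit/test_bit usage):

    #include <stdio.h>

    #define MAX_CHECK_BUS 16
    #define SLOTS_PER_BUS 32
    #define BITS_PER_LONG (8 * sizeof(unsigned long))

    /* One bit per (bus, slot) pair for the first MAX_CHECK_BUS buses. */
    static unsigned long fallback_slots[MAX_CHECK_BUS * SLOTS_PER_BUS /
                                        (8 * sizeof(unsigned long))];

    static void set_fallback(int bus, int slot)
    {
            int bit = SLOTS_PER_BUS * bus + slot;
            fallback_slots[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
    }

    static int is_fallback(int bus, int slot)
    {
            int bit = SLOTS_PER_BUS * bus + slot;
            return !!(fallback_slots[bit / BITS_PER_LONG] &
                      (1UL << (bit % BITS_PER_LONG)));
    }

    int main(void)
    {
            set_fallback(2, 5);     /* device 02:05.0 must use conf1 cycles */
            printf("bus 2 slot 5 -> %d, bus 0 slot 5 -> %d\n",
                   is_fallback(2, 5), is_fallback(0, 5));   /* 1, 0 */
            return 0;
    }

pci_dev_base() consults this bitmap first, so a marked device transparently falls back to legacy type-1 configuration cycles.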
@@ -69,8 +75,10 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
 	char __iomem *addr;
 
 	/* Why do we have this when nobody checks it. How about a BUG()!? -AK */
-	if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095)))
+	if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
+		*value = -1;
 		return -EINVAL;
+	}
 
 	addr = pci_dev_base(seg, bus, devfn);
 	if (!addr)
@@ -129,21 +137,26 @@ static struct pci_raw_ops pci_mmcfg = {
    Normally this can be expressed in the MCFG by not listing them
    and assigning suitable _SEGs, but this isn't implemented in some BIOS.
    Instead try to discover all devices on bus 0 that are unreachable using MM
-   and fallback for them.
-   We only do this for bus 0/seg 0 */
+   and fall back for them. */
 static __init void unreachable_devices(void)
 {
-	int i;
-	for (i = 0; i < 32; i++) {
-		u32 val1;
-		char __iomem *addr;
-
-		pci_conf1_read(0, 0, PCI_DEVFN(i,0), 0, 4, &val1);
-		if (val1 == 0xffffffff)
-			continue;
-		addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0));
-		if (addr == NULL|| readl(addr) != val1) {
-			set_bit(i, fallback_slots);
+	int i, k;
+	/* Use the max bus number from ACPI here? */
+	for (k = 0; k < MAX_CHECK_BUS; k++) {
+		for (i = 0; i < 32; i++) {
+			u32 val1;
+			char __iomem *addr;
+
+			pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
+			if (val1 == 0xffffffff)
+				continue;
+			addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
+			if (addr == NULL || readl(addr) != val1) {
+				set_bit(i + 32*k, fallback_slots);
+				printk(KERN_NOTICE
+					"PCI: No mmconfig possible on device %x:%x\n",
+					k, i);
+			}
 		}
 	}
 }
@@ -161,6 +174,14 @@ void __init pci_mmcfg_init(void)
 	    (pci_mmcfg_config[0].base_address == 0))
 		return;
 
+	if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
+			pci_mmcfg_config[0].base_address + MMCONFIG_APER_SIZE,
+			E820_RESERVED)) {
+		printk(KERN_ERR "PCI: BIOS Bug: MCFG area is not E820-reserved\n");
+		printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
+		return;
+	}
+
 	/* RED-PEN i386 doesn't do _nocache right now */
 	pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
 	if (pci_mmcfg_virt == NULL) {