summaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig24
-rw-r--r--arch/x86_64/Makefile24
-rw-r--r--arch/x86_64/boot/video.S5
-rw-r--r--arch/x86_64/defconfig57
-rw-r--r--arch/x86_64/ia32/Makefile4
-rw-r--r--arch/x86_64/ia32/ia32_binfmt.c4
-rw-r--r--arch/x86_64/ia32/ia32entry.S27
-rw-r--r--arch/x86_64/ia32/vsyscall-sigreturn.S23
-rw-r--r--arch/x86_64/kernel/Makefile4
-rw-r--r--arch/x86_64/kernel/aperture.c2
-rw-r--r--arch/x86_64/kernel/apic.c14
-rw-r--r--arch/x86_64/kernel/e820.c44
-rw-r--r--arch/x86_64/kernel/early_printk.c2
-rw-r--r--arch/x86_64/kernel/entry.S35
-rw-r--r--arch/x86_64/kernel/io_apic.c35
-rw-r--r--arch/x86_64/kernel/kprobes.c16
-rw-r--r--arch/x86_64/kernel/mce.c14
-rw-r--r--arch/x86_64/kernel/mce_amd.c2
-rw-r--r--arch/x86_64/kernel/mpparse.c12
-rw-r--r--arch/x86_64/kernel/nmi.c7
-rw-r--r--arch/x86_64/kernel/pci-dma.c6
-rw-r--r--arch/x86_64/kernel/pci-gart.c18
-rw-r--r--arch/x86_64/kernel/pci-nommu.c7
-rw-r--r--arch/x86_64/kernel/pmtimer.c4
-rw-r--r--arch/x86_64/kernel/process.c31
-rw-r--r--arch/x86_64/kernel/ptrace.c11
-rw-r--r--arch/x86_64/kernel/setup.c64
-rw-r--r--arch/x86_64/kernel/setup64.c4
-rw-r--r--arch/x86_64/kernel/smpboot.c26
-rw-r--r--arch/x86_64/kernel/time.c49
-rw-r--r--arch/x86_64/kernel/traps.c60
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c5
-rw-r--r--arch/x86_64/mm/fault.c2
-rw-r--r--arch/x86_64/mm/init.c39
-rw-r--r--arch/x86_64/mm/numa.c63
-rw-r--r--arch/x86_64/mm/srat.c183
-rw-r--r--arch/x86_64/pci/mmconfig.c53
38 files changed, 716 insertions, 266 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 45efe0ca88f8..6527a368fb9c 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -136,6 +136,11 @@ config X86_L1_CACHE_SHIFT
default "7" if GENERIC_CPU || MPSC
default "6" if MK8
+config X86_INTERNODE_CACHE_BYTES
+ int
+ default "4096" if X86_VSMP
+ default X86_L1_CACHE_BYTES if !X86_VSMP
+
config X86_TSC
bool
default y
@@ -250,6 +255,15 @@ config SCHED_SMT
cost of slightly increased overhead in some places. If unsure say
N here.
+config SCHED_MC
+ bool "Multi-core scheduler support"
+ depends on SMP
+ default y
+ help
+ Multi-core scheduler support improves the CPU scheduler's decision
+ making when dealing with multi-core CPU chips at a cost of slightly
+ increased overhead in some places. If unsure say N here.
+
source "kernel/Kconfig.preempt"
config NUMA
@@ -274,12 +288,18 @@ config K8_NUMA
Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
instead, which also takes priority if both are compiled in.
+config NODES_SHIFT
+ int
+ default "6"
+ depends on NEED_MULTIPLE_NODES
+
# Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig.
config X86_64_ACPI_NUMA
bool "ACPI NUMA detection"
depends on NUMA
select ACPI
+ select PCI
select ACPI_NUMA
default y
help
@@ -325,6 +345,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
def_bool y
depends on NUMA
+config OUT_OF_LINE_PFN_TO_PAGE
+ def_bool y
+ depends on DISCONTIGMEM
+
config NR_CPUS
int "Maximum number of CPUs (2-256)"
range 2 255
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 585fd4a559c8..e573e2ab5510 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -24,37 +24,37 @@
LDFLAGS := -m elf_x86_64
OBJCOPYFLAGS := -O binary -R .note -R .comment -S
LDFLAGS_vmlinux :=
-
CHECKFLAGS += -D__x86_64__ -m64
+cflags-y :=
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
-CFLAGS += $(cflags-y)
-CFLAGS += -m64
-CFLAGS += -mno-red-zone
-CFLAGS += -mcmodel=kernel
-CFLAGS += -pipe
+cflags-y += -m64
+cflags-y += -mno-red-zone
+cflags-y += -mcmodel=kernel
+cflags-y += -pipe
cflags-$(CONFIG_REORDER) += -ffunction-sections
# this makes reading assembly source easier, but produces worse code
# actually it makes the kernel smaller too.
-CFLAGS += -fno-reorder-blocks
-CFLAGS += -Wno-sign-compare
+cflags-y += -fno-reorder-blocks
+cflags-y += -Wno-sign-compare
ifneq ($(CONFIG_UNWIND_INFO),y)
-CFLAGS += -fno-asynchronous-unwind-tables
+cflags-y += -fno-asynchronous-unwind-tables
endif
ifneq ($(CONFIG_DEBUG_INFO),y)
# -fweb shrinks the kernel a bit, but the difference is very small
# it also messes up debugging, so don't use it for now.
-#CFLAGS += $(call cc-option,-fweb)
+#cflags-y += $(call cc-option,-fweb)
endif
# -funit-at-a-time shrinks the kernel .text considerably
# unfortunately it makes reading oopses harder.
-CFLAGS += $(call cc-option,-funit-at-a-time)
+cflags-y += $(call cc-option,-funit-at-a-time)
# prevent gcc from generating any FP code by mistake
-CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
+cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
+CFLAGS += $(cflags-y)
AFLAGS += -m64
head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S
index 0587477c99f2..32327bb37aff 100644
--- a/arch/x86_64/boot/video.S
+++ b/arch/x86_64/boot/video.S
@@ -97,6 +97,7 @@
#define PARAM_VESAPM_OFF 0x30
#define PARAM_LFB_PAGES 0x32
#define PARAM_VESA_ATTRIB 0x34
+#define PARAM_CAPABILITIES 0x36
/* Define DO_STORE according to CONFIG_VIDEO_RETAIN */
#ifdef CONFIG_VIDEO_RETAIN
@@ -233,6 +234,10 @@ mopar_gr:
movw 18(%di), %ax
movl %eax, %fs:(PARAM_LFB_SIZE)
+# store mode capabilities
+ movl 10(%di), %eax
+ movl %eax, %fs:(PARAM_CAPABILITIES)
+
# switching the DAC to 8-bit is for <= 8 bpp only
movw %fs:(PARAM_LFB_DEPTH), %ax
cmpw $8, %ax
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 566ecc97ee5a..69db0c0721d1 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.16-git9
-# Sat Mar 25 15:18:40 2006
+# Linux kernel version: 2.6.17-rc1-git11
+# Sun Apr 16 07:22:36 2006
#
CONFIG_X86_64=y
CONFIG_64BIT=y
@@ -9,6 +9,7 @@ CONFIG_X86=y
CONFIG_SEMAPHORE_SLEEPERS=y
CONFIG_MMU=y
CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_HWEIGHT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_X86_CMPXCHG=y
CONFIG_EARLY_PRINTK=y
@@ -55,11 +56,8 @@ CONFIG_BASE_FULL=y
CONFIG_FUTEX=y
CONFIG_EPOLL=y
CONFIG_SHMEM=y
-CONFIG_CC_ALIGN_FUNCTIONS=0
-CONFIG_CC_ALIGN_LABELS=0
-CONFIG_CC_ALIGN_LOOPS=0
-CONFIG_CC_ALIGN_JUMPS=0
CONFIG_SLAB=y
+CONFIG_DOUBLEFAULT=y
# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0
# CONFIG_SLOB is not set
@@ -70,7 +68,6 @@ CONFIG_BASE_SMALL=0
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
-CONFIG_OBSOLETE_MODPARM=y
# CONFIG_MODVERSIONS is not set
# CONFIG_MODULE_SRCVERSION_ALL is not set
# CONFIG_KMOD is not set
@@ -81,6 +78,7 @@ CONFIG_STOP_MACHINE=y
#
CONFIG_LBD=y
# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_LSF is not set
#
# IO Schedulers
@@ -105,6 +103,7 @@ CONFIG_X86_PC=y
CONFIG_GENERIC_CPU=y
CONFIG_X86_L1_CACHE_BYTES=128
CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_INTERNODE_CACHE_BYTES=128
CONFIG_X86_TSC=y
CONFIG_X86_GOOD_APIC=y
# CONFIG_MICROCODE is not set
@@ -116,12 +115,14 @@ CONFIG_X86_LOCAL_APIC=y
CONFIG_MTRR=y
CONFIG_SMP=y
CONFIG_SCHED_SMT=y
+CONFIG_SCHED_MC=y
# CONFIG_PREEMPT_NONE is not set
CONFIG_PREEMPT_VOLUNTARY=y
# CONFIG_PREEMPT is not set
CONFIG_PREEMPT_BKL=y
CONFIG_NUMA=y
CONFIG_K8_NUMA=y
+CONFIG_NODES_SHIFT=6
CONFIG_X86_64_ACPI_NUMA=y
CONFIG_NUMA_EMU=y
CONFIG_ARCH_DISCONTIGMEM_ENABLE=y
@@ -138,6 +139,7 @@ CONFIG_NEED_MULTIPLE_NODES=y
CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_MIGRATION=y
CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
+CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y
CONFIG_NR_CPUS=32
CONFIG_HOTPLUG_CPU=y
CONFIG_HPET_TIMER=y
@@ -289,6 +291,7 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_AH is not set
# CONFIG_INET_ESP is not set
# CONFIG_INET_IPCOMP is not set
+# CONFIG_INET_XFRM_TUNNEL is not set
# CONFIG_INET_TUNNEL is not set
CONFIG_INET_DIAG=y
CONFIG_INET_TCP_DIAG=y
@@ -300,6 +303,7 @@ CONFIG_IPV6=y
# CONFIG_INET6_AH is not set
# CONFIG_INET6_ESP is not set
# CONFIG_INET6_IPCOMP is not set
+# CONFIG_INET6_XFRM_TUNNEL is not set
# CONFIG_INET6_TUNNEL is not set
# CONFIG_IPV6_TUNNEL is not set
# CONFIG_NETFILTER is not set
@@ -542,7 +546,6 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y
# CONFIG_SCSI_INIA100 is not set
# CONFIG_SCSI_SYM53C8XX_2 is not set
# CONFIG_SCSI_IPR is not set
-# CONFIG_SCSI_QLOGIC_FC is not set
# CONFIG_SCSI_QLOGIC_1280 is not set
# CONFIG_SCSI_QLA_FC is not set
# CONFIG_SCSI_LPFC is not set
@@ -704,7 +707,6 @@ CONFIG_S2IO=m
# Wireless LAN (non-hamradio)
#
# CONFIG_NET_RADIO is not set
-# CONFIG_NET_WIRELESS_RTNETLINK is not set
#
# Wan interfaces
@@ -791,7 +793,7 @@ CONFIG_HW_CONSOLE=y
#
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
-# CONFIG_SERIAL_8250_ACPI is not set
+CONFIG_SERIAL_8250_PCI=y
CONFIG_SERIAL_8250_NR_UARTS=4
CONFIG_SERIAL_8250_RUNTIME_UARTS=4
# CONFIG_SERIAL_8250_EXTENDED is not set
@@ -921,6 +923,7 @@ CONFIG_HWMON=y
# Digital Video Broadcasting Devices
#
# CONFIG_DVB is not set
+# CONFIG_USB_DABUSB is not set
#
# Graphics support
@@ -932,6 +935,8 @@ CONFIG_VIDEO_SELECT=y
# Console display driver support
#
CONFIG_VGA_CONSOLE=y
+CONFIG_VGACON_SOFT_SCROLLBACK=y
+CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256
CONFIG_DUMMY_CONSOLE=y
#
@@ -1041,9 +1046,7 @@ CONFIG_USB_HIDINPUT=y
# CONFIG_USB_ACECAD is not set
# CONFIG_USB_KBTAB is not set
# CONFIG_USB_POWERMATE is not set
-# CONFIG_USB_MTOUCH is not set
-# CONFIG_USB_ITMTOUCH is not set
-# CONFIG_USB_EGALAX is not set
+# CONFIG_USB_TOUCHSCREEN is not set
# CONFIG_USB_YEALINK is not set
# CONFIG_USB_XPAD is not set
# CONFIG_USB_ATI_REMOTE is not set
@@ -1058,15 +1061,6 @@ CONFIG_USB_HIDINPUT=y
# CONFIG_USB_MICROTEK is not set
#
-# USB Multimedia devices
-#
-# CONFIG_USB_DABUSB is not set
-
-#
-# Video4Linux support is needed for USB Multimedia device support
-#
-
-#
# USB Network Adapters
#
# CONFIG_USB_CATC is not set
@@ -1118,9 +1112,23 @@ CONFIG_USB_MON=y
# CONFIG_MMC is not set
#
+# LED devices
+#
+# CONFIG_NEW_LEDS is not set
+
+#
+# LED drivers
+#
+
+#
+# LED Triggers
+#
+
+#
# InfiniBand support
#
# CONFIG_INFINIBAND is not set
+# CONFIG_IPATH_CORE is not set
#
# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
@@ -1128,6 +1136,11 @@ CONFIG_USB_MON=y
# CONFIG_EDAC is not set
#
+# Real Time Clock
+#
+# CONFIG_RTC_CLASS is not set
+
+#
# Firmware Drivers
#
# CONFIG_EDD is not set
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile
index 929e6b0771f8..e9263b4975e0 100644
--- a/arch/x86_64/ia32/Makefile
+++ b/arch/x86_64/ia32/Makefile
@@ -27,5 +27,5 @@ $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \
$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE
$(call if_changed,syscall)
-AFLAGS_vsyscall-sysenter.o = -m32
-AFLAGS_vsyscall-syscall.o = -m32
+AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32
+AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index e776139afb20..926c4743d13b 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -339,7 +339,7 @@ int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top,
struct mm_struct *mm = current->mm;
int i, ret;
- stack_base = IA32_STACK_TOP - MAX_ARG_PAGES * PAGE_SIZE;
+ stack_base = stack_top - MAX_ARG_PAGES * PAGE_SIZE;
mm->arg_start = bprm->p + stack_base;
bprm->p += stack_base;
@@ -357,7 +357,7 @@ int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top,
{
mpnt->vm_mm = mm;
mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
- mpnt->vm_end = IA32_STACK_TOP;
+ mpnt->vm_end = stack_top;
if (executable_stack == EXSTACK_ENABLE_X)
mpnt->vm_flags = VM_STACK_FLAGS | VM_EXEC;
else if (executable_stack == EXSTACK_DISABLE_X)
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 7549a4389fbf..5a92fed2d1d5 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -15,6 +15,8 @@
#include <asm/vsyscall32.h>
#include <linux/linkage.h>
+#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
+
.macro IA32_ARG_FIXUP noebp=0
movl %edi,%r8d
.if \noebp
@@ -109,8 +111,8 @@ ENTRY(ia32_sysenter_target)
CFI_REMEMBER_STATE
jnz sysenter_tracesys
sysenter_do_call:
- cmpl $(IA32_NR_syscalls),%eax
- jae ia32_badsys
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja ia32_badsys
IA32_ARG_FIXUP 1
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
@@ -210,8 +212,8 @@ ENTRY(ia32_cstar_target)
CFI_REMEMBER_STATE
jnz cstar_tracesys
cstar_do_call:
- cmpl $IA32_NR_syscalls,%eax
- jae ia32_badsys
+ cmpl $IA32_NR_syscalls-1,%eax
+ ja ia32_badsys
IA32_ARG_FIXUP 1
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
@@ -296,8 +298,8 @@ ENTRY(ia32_syscall)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10)
jnz ia32_tracesys
ia32_do_syscall:
- cmpl $(IA32_NR_syscalls),%eax
- jae ia32_badsys
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja ia32_badsys
IA32_ARG_FIXUP
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
@@ -685,10 +687,13 @@ ia32_sys_call_table:
.quad sys_readlinkat /* 305 */
.quad sys_fchmodat
.quad sys_faccessat
- .quad sys_ni_syscall /* pselect6 for now */
- .quad sys_ni_syscall /* ppoll for now */
+ .quad quiet_ni_syscall /* pselect6 for now */
+ .quad quiet_ni_syscall /* ppoll for now */
.quad sys_unshare /* 310 */
+ .quad compat_sys_set_robust_list
+ .quad compat_sys_get_robust_list
+ .quad sys_splice
+ .quad sys_sync_file_range
+ .quad sys_tee
+ .quad compat_sys_vmsplice
ia32_syscall_end:
- .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
- .quad ni_syscall
- .endr
diff --git a/arch/x86_64/ia32/vsyscall-sigreturn.S b/arch/x86_64/ia32/vsyscall-sigreturn.S
index d90321fe9bba..1384367cdbe1 100644
--- a/arch/x86_64/ia32/vsyscall-sigreturn.S
+++ b/arch/x86_64/ia32/vsyscall-sigreturn.S
@@ -32,9 +32,28 @@ __kernel_rt_sigreturn:
.size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
.section .eh_frame,"a",@progbits
+.LSTARTFRAMES:
+ .long .LENDCIES-.LSTARTCIES
+.LSTARTCIES:
+ .long 0 /* CIE ID */
+ .byte 1 /* Version number */
+ .string "zRS" /* NUL-terminated augmentation string */
+ .uleb128 1 /* Code alignment factor */
+ .sleb128 -4 /* Data alignment factor */
+ .byte 8 /* Return address register column */
+ .uleb128 1 /* Augmentation value length */
+ .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
+ .byte 0x0c /* DW_CFA_def_cfa */
+ .uleb128 4
+ .uleb128 4
+ .byte 0x88 /* DW_CFA_offset, column 0x8 */
+ .uleb128 1
+ .align 4
+.LENDCIES:
+
.long .LENDFDE2-.LSTARTFDE2 /* Length FDE */
.LSTARTFDE2:
- .long .LSTARTFDE2-.LSTARTFRAME /* CIE pointer */
+ .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */
/* HACK: The dwarf2 unwind routines will subtract 1 from the
return address to get an address in the middle of the
presumed call instruction. Since we didn't get here via
@@ -97,7 +116,7 @@ __kernel_rt_sigreturn:
.long .LENDFDE3-.LSTARTFDE3 /* Length FDE */
.LSTARTFDE3:
- .long .LSTARTFDE3-.LSTARTFRAME /* CIE pointer */
+ .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */
/* HACK: See above wrt unwind library assumptions. */
.long .LSTART_rt_sigreturn-1-. /* PC-relative start address */
.long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index a098a11e7755..059c88313f4e 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
x8664_ksyms.o i387.o syscall.o vsyscall.o \
setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
- dmi_scan.o pci-dma.o pci-nommu.o
+ pci-dma.o pci-nommu.o
obj-$(CONFIG_X86_MCE) += mce.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
@@ -49,5 +49,3 @@ intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
quirks-y += ../../i386/kernel/quirks.o
i8237-y += ../../i386/kernel/i8237.o
msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
-dmi_scan-y += ../../i386/kernel/dmi_scan.o
-
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index fffd6b0a2fab..70b9d21ed675 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -80,7 +80,7 @@ static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size)
printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
return 0;
}
- if (e820_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
+ if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
return 0;
}
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index d54620147e8e..100a30c40044 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -615,7 +615,7 @@ static int __init apic_set_verbosity(char *str)
printk(KERN_WARNING "APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug", str);
- return 0;
+ return 1;
}
__setup("apic=", apic_set_verbosity);
@@ -1137,35 +1137,35 @@ int __init APIC_init_uniprocessor (void)
static __init int setup_disableapic(char *str)
{
disable_apic = 1;
- return 0;
+ return 1;
}
static __init int setup_nolapic(char *str)
{
disable_apic = 1;
- return 0;
+ return 1;
}
static __init int setup_noapictimer(char *str)
{
if (str[0] != ' ' && str[0] != 0)
- return -1;
+ return 0;
disable_apic_timer = 1;
- return 0;
+ return 1;
}
static __init int setup_apicmaintimer(char *str)
{
apic_runs_main_timer = 1;
nohpet = 1;
- return 0;
+ return 1;
}
__setup("apicmaintimer", setup_apicmaintimer);
static __init int setup_noapicmaintimer(char *str)
{
apic_runs_main_timer = -1;
- return 0;
+ return 1;
}
__setup("noapicmaintimer", setup_noapicmaintimer);
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 293cd71a266a..1ef6028f721e 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -76,11 +76,22 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
*addrp = __pa_symbol(&_end);
return 1;
}
+
+ if (last >= ebda_addr && addr < ebda_addr + ebda_size) {
+ *addrp = ebda_addr + ebda_size;
+ return 1;
+ }
+
/* XXX ramdisk image here? */
return 0;
}
-int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
+/*
+ * This function checks if any part of the range <start,end> is mapped
+ * with type.
+ */
+int __meminit
+e820_any_mapped(unsigned long start, unsigned long end, unsigned type)
{
int i;
for (i = 0; i < e820.nr_map; i++) {
@@ -94,6 +105,35 @@ int __init e820_mapped(unsigned long start, unsigned long end, unsigned type)
return 0;
}
+/*
+ * This function checks if the entire range <start,end> is mapped with type.
+ *
+ * Note: this function only works correct if the e820 table is sorted and
+ * not-overlapping, which is the case
+ */
+int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type)
+{
+ int i;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ if (type && ei->type != type)
+ continue;
+ /* is the region (part) in overlap with the current region ?*/
+ if (ei->addr >= end || ei->addr + ei->size <= start)
+ continue;
+
+ /* if the region is at the beginning of <start,end> we move
+ * start to the end of the region since it's ok until there
+ */
+ if (ei->addr <= start)
+ start = ei->addr + ei->size;
+ /* if start is now at or beyond end, we're done, full coverage */
+ if (start >= end)
+ return 1; /* we're done */
+ }
+ return 0;
+}
+
/*
* Find a free area in a specific range.
*/
@@ -109,7 +149,7 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi
addr = start;
if (addr > ei->addr + ei->size)
continue;
- while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size)
+ while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
;
last = addr + size;
if (last > ei->addr + ei->size)
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 13af920b6594..b93ef5b51980 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -221,7 +221,7 @@ int __init setup_early_printk(char *opt)
char buf[256];
if (early_console_initialized)
- return -1;
+ return 1;
strlcpy(buf,opt,sizeof(buf));
space = strchr(buf, ' ');
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 8538bfea30e6..586b34c00c48 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -180,6 +180,10 @@ rff_trace:
*
* XXX if we had a free scratch register we could save the RSP into the stack frame
* and report it properly in ps. Unfortunately we haven't.
+ *
+ * When user can change the frames always force IRET. That is because
+ * it deals with uncanonical addresses better. SYSRET has trouble
+ * with them due to bugs in both AMD and Intel CPUs.
*/
ENTRY(system_call)
@@ -254,7 +258,10 @@ sysret_signal:
xorl %esi,%esi # oldset -> arg2
call ptregscall_common
1: movl $_TIF_NEED_RESCHED,%edi
- jmp sysret_check
+ /* Use IRET because user could have changed frame. This
+ works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
+ cli
+ jmp int_with_check
badsys:
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -274,13 +281,9 @@ tracesys:
ja 1f
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
-1: SAVE_REST
- movq %rsp,%rdi
- call syscall_trace_leave
- RESTORE_TOP_OF_STACK %rbx
- RESTORE_REST
- jmp ret_from_sys_call
+1: movq %rax,RAX-ARGOFFSET(%rsp)
+ /* Use IRET because user could have changed frame */
+ jmp int_ret_from_sys_call
CFI_ENDPROC
/*
@@ -408,25 +411,9 @@ ENTRY(stub_execve)
CFI_ADJUST_CFA_OFFSET -8
CFI_REGISTER rip, r11
SAVE_REST
- movq %r11, %r15
- CFI_REGISTER rip, r15
FIXUP_TOP_OF_STACK %r11
call sys_execve
- GET_THREAD_INFO(%rcx)
- bt $TIF_IA32,threadinfo_flags(%rcx)
- CFI_REMEMBER_STATE
- jc exec_32bit
RESTORE_TOP_OF_STACK %r11
- movq %r15, %r11
- CFI_REGISTER rip, r11
- RESTORE_REST
- pushq %r11
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rip, 0
- ret
-
-exec_32bit:
- CFI_RESTORE_STATE
movq %rax,RAX(%rsp)
RESTORE_REST
jmp int_ret_from_sys_call
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 77b4c608cca0..9cc7031b7151 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -271,6 +271,18 @@ __setup("enable_8254_timer", setup_enable_8254_timer);
#include <linux/pci_ids.h>
#include <linux/pci.h>
+
+#ifdef CONFIG_ACPI
+
+static int nvidia_hpet_detected __initdata;
+
+static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
+{
+ nvidia_hpet_detected = 1;
+ return 0;
+}
+#endif
+
/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
off. Check for an Nvidia or VIA PCI bridge and turn it off.
Use pci direct infrastructure because this runs before the PCI subsystem.
@@ -317,11 +329,19 @@ void __init check_ioapic(void)
return;
case PCI_VENDOR_ID_NVIDIA:
#ifdef CONFIG_ACPI
- /* All timer overrides on Nvidia
- seem to be wrong. Skip them. */
- acpi_skip_timer_override = 1;
- printk(KERN_INFO
- "Nvidia board detected. Ignoring ACPI timer override.\n");
+ /*
+ * All timer overrides on Nvidia are
+ * wrong unless HPET is enabled.
+ */
+ nvidia_hpet_detected = 0;
+ acpi_table_parse(ACPI_HPET,
+ nvidia_hpet_check);
+ if (nvidia_hpet_detected == 0) {
+ acpi_skip_timer_override = 1;
+ printk(KERN_INFO "Nvidia board "
+ "detected. Ignoring ACPI "
+ "timer override.\n");
+ }
#endif
/* RED-PEN skip them on mptables too? */
return;
@@ -1777,6 +1797,8 @@ static inline void unlock_ExtINT_logic(void)
spin_unlock_irqrestore(&ioapic_lock, flags);
}
+int timer_uses_ioapic_pin_0;
+
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
@@ -1814,6 +1836,9 @@ static inline void check_timer(void)
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
+ if (pin1 == 0)
+ timer_uses_ioapic_pin_0 = 1;
+
apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
vector, apic1, pin1, apic2, pin2);
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index accbff3fec49..fa1d19ca700a 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -53,7 +53,7 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
/*
* returns non-zero if opcode modifies the interrupt flag.
*/
-static inline int is_IF_modifier(kprobe_opcode_t *insn)
+static __always_inline int is_IF_modifier(kprobe_opcode_t *insn)
{
switch (*insn) {
case 0xfa: /* cli */
@@ -84,7 +84,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
* If it does, return the address of the 32-bit displacement word.
* If not, return null.
*/
-static inline s32 *is_riprel(u8 *insn)
+static s32 __kprobes *is_riprel(u8 *insn)
{
#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -229,7 +229,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p)
mutex_unlock(&kprobe_mutex);
}
-static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
+static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
kcb->prev_kprobe.kp = kprobe_running();
kcb->prev_kprobe.status = kcb->kprobe_status;
@@ -237,7 +237,7 @@ static inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags;
}
-static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
__get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
kcb->kprobe_status = kcb->prev_kprobe.status;
@@ -245,7 +245,7 @@ static inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags;
}
-static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
{
__get_cpu_var(current_kprobe) = p;
@@ -514,13 +514,13 @@ static void __kprobes resume_execution(struct kprobe *p,
*tos = orig_rip + (*tos - copy_rip);
break;
case 0xff:
- if ((*insn & 0x30) == 0x10) {
+ if ((insn[1] & 0x30) == 0x10) {
/* call absolute, indirect */
/* Fix return addr; rip is correct. */
next_rip = regs->rip;
*tos = orig_rip + (*tos - copy_rip);
- } else if (((*insn & 0x31) == 0x20) || /* jmp near, absolute indirect */
- ((*insn & 0x31) == 0x21)) { /* jmp far, absolute indirect */
+ } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */
+ ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */
/* rip is correct. */
next_rip = regs->rip;
}
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 04282ef9fbd4..c69fc43cee7b 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -29,6 +29,8 @@
#define MISC_MCELOG_MINOR 227
#define NR_BANKS 6
+atomic_t mce_entry;
+
static int mce_dont_init;
/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
@@ -172,10 +174,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
int i;
int panicm_found = 0;
+ atomic_inc(&mce_entry);
+
if (regs)
notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
if (!banks)
- return;
+ goto out2;
memset(&m, 0, sizeof(struct mce));
m.cpu = safe_smp_processor_id();
@@ -266,6 +270,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
out:
/* Last thing done in the machine check exception to clear state. */
wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ out2:
+ atomic_dec(&mce_entry);
}
/*
@@ -501,7 +507,7 @@ static struct miscdevice mce_log_device = {
static int __init mcheck_disable(char *str)
{
mce_dont_init = 1;
- return 0;
+ return 1;
}
/* mce=off disables machine check. Note you can reenable it later
@@ -521,7 +527,7 @@ static int __init mcheck_enable(char *str)
get_option(&str, &tolerant);
else
printk("mce= argument %s ignored. Please use /sys", str);
- return 0;
+ return 1;
}
__setup("nomce", mcheck_disable);
@@ -623,7 +629,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
#endif
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static __cpuinit int
+static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index d3ad7d81266d..d13b241ad094 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -482,7 +482,7 @@ static void threshold_remove_device(unsigned int cpu)
#endif
/* get notified when a cpu comes on/off */
-static __cpuinit int threshold_cpu_callback(struct notifier_block *nfb,
+static int threshold_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
/* cpu was unsigned int to begin with */
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index b17cf3eba359..083da7e606b1 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -968,7 +968,17 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
*/
int irq = gsi;
if (gsi < MAX_GSI_NUM) {
- if (gsi > 15)
+ /*
+ * Retain the VIA chipset work-around (gsi > 15), but
+ * avoid a problem where the 8254 timer (IRQ0) is setup
+ * via an override (so it's not on pin 0 of the ioapic),
+ * and at the same time, the pin 0 interrupt is a PCI
+ * type. The gsi > 15 test could cause these two pins
+ * to be shared as IRQ0, and they are not shareable.
+ * So test for this condition, and if necessary, avoid
+ * the pin collision.
+ */
+ if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
gsi = pci_irq++;
/*
* Don't assign IRQ used by ACPI SCI
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index d9e4067faf05..4e6357fe0ec3 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -34,6 +34,7 @@
#include <asm/proto.h>
#include <asm/kdebug.h>
#include <asm/local.h>
+#include <asm/mce.h>
/*
* lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -480,6 +481,12 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
__get_cpu_var(nmi_touch) = 0;
touched = 1;
}
+#ifdef CONFIG_X86_MCE
+ /* Could check oops_in_progress here too, but it's safer
+ not too */
+ if (atomic_read(&mce_entry) > 0)
+ touched = 1;
+#endif
if (!touched && __get_cpu_var(last_irq_sum) == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index 03c9eeedb0f3..a9275c9557cf 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -48,10 +48,16 @@ dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
{
struct page *page;
int node;
+#ifdef CONFIG_PCI
if (dev->bus == &pci_bus_type)
node = pcibus_to_node(to_pci_dev(dev)->bus);
else
+#endif
node = numa_node_id();
+
+ if (node < first_node(node_online_map))
+ node = first_node(node_online_map);
+
page = alloc_pages_node(node, gfp, order);
return page ? page_address(page) : NULL;
}
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index a6c01e121266..82a7c9bfdfa0 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -112,10 +112,6 @@ static unsigned long alloc_iommu(int size)
static void free_iommu(unsigned long offset, int size)
{
unsigned long flags;
- if (size == 1) {
- clear_bit(offset, iommu_gart_bitmap);
- return;
- }
spin_lock_irqsave(&iommu_bitmap_lock, flags);
__clear_bit_string(iommu_gart_bitmap, offset, size);
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -635,14 +631,20 @@ static int __init pci_iommu_init(void)
printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n");
if (end_pfn > MAX_DMA32_PFN) {
printk(KERN_ERR "WARNING more than 4GB of memory "
- "but IOMMU not compiled in.\n"
- KERN_ERR "WARNING 32bit PCI may malfunction.\n"
- KERN_ERR "You might want to enable "
- "CONFIG_GART_IOMMU\n");
+ "but IOMMU not available.\n"
+ KERN_ERR "WARNING 32bit PCI may malfunction.\n");
}
return -1;
}
+ i = 0;
+ for_all_nb(dev)
+ i++;
+ if (i > MAX_NB) {
+ printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). Disabled\n", i);
+ return -1;
+ }
+
printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
aper_size = info.aper_size * 1024 * 1024;
iommu_size = check_iommu_size(info.aper_base, aper_size);
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index 44adcc2d5e5b..1f6ecc62061d 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -12,9 +12,10 @@ static int
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
{
if (hwdev && bus + size > *hwdev->dma_mask) {
- printk(KERN_ERR
- "nommu_%s: overflow %Lx+%lu of device mask %Lx\n",
- name, (long long)bus, size, (long long)*hwdev->dma_mask);
+ if (*hwdev->dma_mask >= 0xffffffffULL)
+ printk(KERN_ERR
+ "nommu_%s: overflow %Lx+%lu of device mask %Lx\n",
+ name, (long long)bus, size, (long long)*hwdev->dma_mask);
return 0;
}
return 1;
diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c
index ee5ee4891f3d..bf421ed26808 100644
--- a/arch/x86_64/kernel/pmtimer.c
+++ b/arch/x86_64/kernel/pmtimer.c
@@ -68,7 +68,7 @@ int pmtimer_mark_offset(void)
offset_delay = delta % (USEC_PER_SEC / HZ);
rdtscll(tsc);
- vxtime.last_tsc = tsc - offset_delay * cpu_khz;
+ vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000;
/* don't calculate delay for first run,
or if we've got less then a tick */
@@ -121,7 +121,7 @@ unsigned int do_gettimeoffset_pm(void)
static int __init nopmtimer_setup(char *s)
{
pmtmr_ioport = 0;
- return 0;
+ return 1;
}
__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 0370720515f1..fb903e65e079 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -66,24 +66,17 @@ EXPORT_SYMBOL(boot_option_idle_override);
void (*pm_idle)(void);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
-static struct notifier_block *idle_notifier;
-static DEFINE_SPINLOCK(idle_notifier_lock);
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
void idle_notifier_register(struct notifier_block *n)
{
- unsigned long flags;
- spin_lock_irqsave(&idle_notifier_lock, flags);
- notifier_chain_register(&idle_notifier, n);
- spin_unlock_irqrestore(&idle_notifier_lock, flags);
+ atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);
void idle_notifier_unregister(struct notifier_block *n)
{
- unsigned long flags;
- spin_lock_irqsave(&idle_notifier_lock, flags);
- notifier_chain_unregister(&idle_notifier, n);
- spin_unlock_irqrestore(&idle_notifier_lock, flags);
+ atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
@@ -93,13 +86,13 @@ static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
void enter_idle(void)
{
__get_cpu_var(idle_state) = CPU_IDLE;
- notifier_call_chain(&idle_notifier, IDLE_START, NULL);
+ atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
__get_cpu_var(idle_state) = CPU_NOT_IDLE;
- notifier_call_chain(&idle_notifier, IDLE_END, NULL);
+ atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
@@ -582,8 +575,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
prev->userrsp = read_pda(oldrsp);
write_pda(oldrsp, next->userrsp);
write_pda(pcurrent, next_p);
+
/* This must be here to ensure both math_state_restore() and
- kernel_fpu_begin() work consistently. */
+ kernel_fpu_begin() work consistently.
+ And the AMD workaround requires it to be after DS reload. */
unlazy_fpu(prev_p);
write_pda(kernelstack,
task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
@@ -788,10 +783,16 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
}
case ARCH_GET_GS: {
unsigned long base;
+ unsigned gsindex;
if (task->thread.gsindex == GS_TLS_SEL)
base = read_32bit_tls(task, GS_TLS);
- else if (doit)
- rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else if (doit) {
+ asm("movl %%gs,%0" : "=r" (gsindex));
+ if (gsindex)
+ rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else
+ base = task->thread.gs;
+ }
else
base = task->thread.gs;
ret = put_user(base, (unsigned long __user *)addr);
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index d44b2c1e63a6..2d50024c9f30 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -274,11 +274,6 @@ static int putreg(struct task_struct *child,
return -EIO;
value &= 0xffff;
break;
- case offsetof(struct user_regs_struct, rip):
- /* Check if the new RIP address is canonical */
- if (value >= TASK_SIZE_OF(child))
- return -EIO;
- break;
}
put_stack_long(child, regno - sizeof(struct pt_regs), value);
return 0;
@@ -605,12 +600,12 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs)
if (unlikely(current->audit_context)) {
if (test_thread_flag(TIF_IA32)) {
- audit_syscall_entry(current, AUDIT_ARCH_I386,
+ audit_syscall_entry(AUDIT_ARCH_I386,
regs->orig_rax,
regs->rbx, regs->rcx,
regs->rdx, regs->rsi);
} else {
- audit_syscall_entry(current, AUDIT_ARCH_X86_64,
+ audit_syscall_entry(AUDIT_ARCH_X86_64,
regs->orig_rax,
regs->rdi, regs->rsi,
regs->rdx, regs->r10);
@@ -621,7 +616,7 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs)
asmlinkage void syscall_trace_leave(struct pt_regs *regs)
{
if (unlikely(current->audit_context))
- audit_syscall_exit(current, AUDITSC_RESULT(regs->rax), regs->rax);
+ audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
if ((test_thread_flag(TIF_SYSCALL_TRACE)
|| test_thread_flag(TIF_SINGLESTEP))
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index a57eec8311a7..655b9192eeb3 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -353,8 +353,10 @@ static __init void parse_cmdline_early (char ** cmdline_p)
if (fullarg(from, "enable_timer_pin_1"))
disable_timer_pin_1 = -1;
- if (fullarg(from, "nolapic") || fullarg(from, "disableapic"))
+ if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
disable_apic = 1;
+ }
if (fullarg(from, "noapic"))
skip_ioapic_setup = 1;
@@ -540,7 +542,7 @@ void __init alternative_instructions(void)
static int __init noreplacement_setup(char *s)
{
no_replacement = 1;
- return 0;
+ return 1;
}
__setup("noreplacement", noreplacement_setup);
@@ -569,17 +571,28 @@ static inline void copy_edd(void)
#endif
#define EBDA_ADDR_POINTER 0x40E
-static void __init reserve_ebda_region(void)
+
+unsigned __initdata ebda_addr;
+unsigned __initdata ebda_size;
+
+static void discover_ebda(void)
{
- unsigned int addr;
- /**
+ /*
* there is a real-mode segmented pointer pointing to the
* 4K EBDA area at 0x40E
*/
- addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER);
- addr <<= 4;
- if (addr)
- reserve_bootmem_generic(addr, PAGE_SIZE);
+ ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER;
+ ebda_addr <<= 4;
+
+ ebda_size = *(unsigned short *)(unsigned long)ebda_addr;
+
+ /* Round EBDA up to pages */
+ if (ebda_size == 0)
+ ebda_size = 1;
+ ebda_size <<= 10;
+ ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
+ if (ebda_size > 64*1024)
+ ebda_size = 64*1024;
}
void __init setup_arch(char **cmdline_p)
@@ -625,6 +638,8 @@ void __init setup_arch(char **cmdline_p)
check_efer();
+ discover_ebda();
+
init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
dmi_scan_machine();
@@ -667,7 +682,8 @@ void __init setup_arch(char **cmdline_p)
reserve_bootmem_generic(0, PAGE_SIZE);
/* reserve ebda region */
- reserve_ebda_region();
+ if (ebda_addr)
+ reserve_bootmem_generic(ebda_addr, ebda_size);
#ifdef CONFIG_SMP
/*
@@ -928,6 +944,10 @@ static int __init init_amd(struct cpuinfo_x86 *c)
if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58))
set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
+ /* Enable workaround for FXSAVE leak */
+ if (c->x86 >= 6)
+ set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
+
r = get_model_name(c);
if (!r) {
switch (c->x86) {
@@ -962,7 +982,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
cpuid(1, &eax, &ebx, &ecx, &edx);
- c->apicid = phys_pkg_id(0);
if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
return;
@@ -1032,7 +1051,7 @@ static void srat_detect_node(void)
for now. */
node = apicid_to_node[hard_smp_processor_id()];
if (node == NUMA_NO_NODE)
- node = 0;
+ node = first_node(node_online_map);
numa_set_node(cpu, node);
if (acpi_numa > 0)
@@ -1171,6 +1190,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_capability[2] = cpuid_edx(0x80860001);
}
+ c->apicid = phys_pkg_id(0);
+
/*
* Vendor-specific initialization. In this section we
* canonicalize the feature flags, meaning if there are
@@ -1419,3 +1440,22 @@ struct seq_operations cpuinfo_op = {
.show = show_cpuinfo,
};
+#ifdef CONFIG_INPUT_PCSPKR
+#include <linux/platform_device.h>
+static __init int add_pcspkr(void)
+{
+ struct platform_device *pd;
+ int ret;
+
+ pd = platform_device_alloc("pcspkr", -1);
+ if (!pd)
+ return -ENOMEM;
+
+ ret = platform_device_add(pd);
+ if (ret)
+ platform_device_put(pd);
+
+ return ret;
+}
+device_initcall(add_pcspkr);
+#endif
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index eabdb63fec31..8a691fa6d393 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -55,7 +55,7 @@ int __init nonx_setup(char *str)
do_not_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
- return 0;
+ return 1;
}
__setup("noexec=", nonx_setup); /* parsed early actually */
@@ -74,7 +74,7 @@ static int __init nonx32_setup(char *str)
force_personality32 &= ~READ_IMPLIES_EXEC;
else if (!strcmp(str, "off"))
force_personality32 |= READ_IMPLIES_EXEC;
- return 0;
+ return 1;
}
__setup("noexec32=", nonx32_setup);
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 66e98659d077..71a7222cf9ce 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
/* core ID of each logical CPU */
u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+/* Last level cache ID of each logical CPU */
+u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
/* Bitmask of currently online CPUs */
cpumask_t cpu_online_map __read_mostly;
@@ -350,7 +353,7 @@ static void __cpuinit tsc_sync_wait(void)
static __init int notscsync_setup(char *s)
{
notscsync = 1;
- return 0;
+ return 1;
}
__setup("notscsync", notscsync_setup);
@@ -445,6 +448,18 @@ void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+ struct cpuinfo_x86 *c = cpu_data + cpu;
+ /*
+ * For perf, we return last level cache shared map.
+ * TBD: when power saving sched policy is added, we will return
+ * cpu_core_map when power saving policy is enabled
+ */
+ return c->llc_shared_map;
+}
+
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;
@@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu)
cpu_set(cpu, cpu_sibling_map[i]);
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
+ cpu_set(i, c[cpu].llc_shared_map);
+ cpu_set(cpu, c[i].llc_shared_map);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}
+ cpu_set(cpu, c[cpu].llc_shared_map);
+
if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
c[cpu].booted_cores = 1;
@@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu)
}
for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (cpu_llc_id[cpu] != BAD_APICID &&
+ cpu_llc_id[cpu] == cpu_llc_id[i]) {
+ cpu_set(i, c[cpu].llc_shared_map);
+ cpu_set(cpu, c[i].llc_shared_map);
+ }
if (phys_proc_id[cpu] == phys_proc_id[i]) {
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 7f58fa682491..7392570f975d 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -504,42 +504,25 @@ unsigned long long sched_clock(void)
static unsigned long get_cmos_time(void)
{
- unsigned int timeout = 1000000, year, mon, day, hour, min, sec;
- unsigned char uip = 0, this = 0;
+ unsigned int year, mon, day, hour, min, sec;
unsigned long flags;
unsigned extyear = 0;
-/*
- * The Linux interpretation of the CMOS clock register contents: When the
- * Update-In-Progress (UIP) flag goes from 1 to 0, the RTC registers show the
- * second which has precisely just started. Waiting for this can take up to 1
- * second, we timeout approximately after 2.4 seconds on a machine with
- * standard 8.3 MHz ISA bus.
- */
-
spin_lock_irqsave(&rtc_lock, flags);
- while (timeout && (!uip || this)) {
- uip |= this;
- this = CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP;
- timeout--;
- }
-
- /*
- * Here we are safe to assume the registers won't change for a whole
- * second, so we just go ahead and read them.
- */
- sec = CMOS_READ(RTC_SECONDS);
- min = CMOS_READ(RTC_MINUTES);
- hour = CMOS_READ(RTC_HOURS);
- day = CMOS_READ(RTC_DAY_OF_MONTH);
- mon = CMOS_READ(RTC_MONTH);
- year = CMOS_READ(RTC_YEAR);
-
+ do {
+ sec = CMOS_READ(RTC_SECONDS);
+ min = CMOS_READ(RTC_MINUTES);
+ hour = CMOS_READ(RTC_HOURS);
+ day = CMOS_READ(RTC_DAY_OF_MONTH);
+ mon = CMOS_READ(RTC_MONTH);
+ year = CMOS_READ(RTC_YEAR);
#ifdef CONFIG_ACPI
- if (acpi_fadt.revision >= FADT2_REVISION_ID && acpi_fadt.century)
- extyear = CMOS_READ(acpi_fadt.century);
+ if (acpi_fadt.revision >= FADT2_REVISION_ID &&
+ acpi_fadt.century)
+ extyear = CMOS_READ(acpi_fadt.century);
#endif
+ } while (sec != CMOS_READ(RTC_SECONDS));
spin_unlock_irqrestore(&rtc_lock, flags);
@@ -743,7 +726,7 @@ static __init int late_hpet_init(void)
unsigned int ntimer;
if (!vxtime.hpet_address)
- return -1;
+ return 0;
memset(&hd, 0, sizeof (hd));
@@ -934,6 +917,8 @@ void __init time_init(void)
vxtime.hpet_address = 0;
if (hpet_use_timer) {
+ /* set tick_nsec to use the proper rate for HPET */
+ tick_nsec = TICK_NSEC_HPET;
cpu_khz = hpet_calibrate_tsc();
timename = "HPET";
#ifdef CONFIG_X86_PM_TIMER
@@ -1323,7 +1308,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
static int __init nohpet_setup(char *s)
{
nohpet = 1;
- return 0;
+ return 1;
}
__setup("nohpet", nohpet_setup);
@@ -1331,7 +1316,7 @@ __setup("nohpet", nohpet_setup);
int __init notsc_setup(char *s)
{
notsc = 1;
- return 0;
+ return 1;
}
__setup("notsc", notsc_setup);
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 7b148309c529..cea335e8746c 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -30,6 +30,7 @@
#include <linux/moduleparam.h>
#include <linux/nmi.h>
#include <linux/kprobes.h>
+#include <linux/kexec.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -69,20 +70,20 @@ asmlinkage void alignment_check(void);
asmlinkage void machine_check(void);
asmlinkage void spurious_interrupt_bug(void);
-struct notifier_block *die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(die_chain);
int register_die_notifier(struct notifier_block *nb)
{
- int err = 0;
- unsigned long flags;
-
vmalloc_sync_all();
- spin_lock_irqsave(&die_notifier_lock, flags);
- err = notifier_chain_register(&die_chain, nb);
- spin_unlock_irqrestore(&die_notifier_lock, flags);
- return err;
+ return atomic_notifier_chain_register(&die_chain, nb);
+}
+EXPORT_SYMBOL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&die_chain, nb);
}
+EXPORT_SYMBOL(unregister_die_notifier);
static inline void conditional_sti(struct pt_regs *regs)
{
@@ -101,6 +102,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
{
if (regs->eflags & X86_EFLAGS_IF)
local_irq_disable();
+ /* Make sure to not schedule here because we could be running
+ on an exception stack. */
preempt_enable_no_resched();
}
@@ -384,6 +387,7 @@ void out_of_line_bug(void)
static DEFINE_SPINLOCK(die_lock);
static int die_owner = -1;
+static unsigned int die_nest_count;
unsigned __kprobes long oops_begin(void)
{
@@ -398,6 +402,7 @@ unsigned __kprobes long oops_begin(void)
else
spin_lock(&die_lock);
}
+ die_nest_count++;
die_owner = cpu;
console_verbose();
bust_spinlocks(1);
@@ -408,7 +413,13 @@ void __kprobes oops_end(unsigned long flags)
{
die_owner = -1;
bust_spinlocks(0);
- spin_unlock_irqrestore(&die_lock, flags);
+ die_nest_count--;
+ if (die_nest_count)
+ /* We still own the lock */
+ local_irq_restore(flags);
+ else
+ /* Nest count reaches zero, release the lock. */
+ spin_unlock_irqrestore(&die_lock, flags);
if (panic_on_oops)
panic("Oops");
}
@@ -433,6 +444,8 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err)
printk(KERN_ALERT "RIP ");
printk_address(regs->rip);
printk(" RSP <%016lx>\n", regs->rsp);
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
}
void die(const char * str, struct pt_regs * regs, long err)
@@ -455,10 +468,14 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs)
*/
printk(str, safe_smp_processor_id());
show_registers(regs);
+ if (kexec_should_crash(current))
+ crash_kexec(regs);
if (panic_on_timeout || panic_on_oops)
panic("nmi watchdog");
printk("console shuts up ...\n");
oops_end(flags);
+ nmi_exit();
+ local_irq_enable();
do_exit(SIGSEGV);
}
@@ -468,8 +485,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str,
{
struct task_struct *tsk = current;
- conditional_sti(regs);
-
tsk->thread.error_code = error_code;
tsk->thread.trap_no = trapnr;
@@ -506,6 +521,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
== NOTIFY_STOP) \
return; \
+ conditional_sti(regs); \
do_trap(trapnr, signr, str, regs, error_code, NULL); \
}
@@ -520,6 +536,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
== NOTIFY_STOP) \
return; \
+ conditional_sti(regs); \
do_trap(trapnr, signr, str, regs, error_code, &info); \
}
@@ -533,7 +550,17 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
DO_ERROR(18, SIGSEGV, "reserved", reserved)
-DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
+
+/* Runs on IST stack */
+asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
+{
+ if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
+ 12, SIGBUS) == NOTIFY_STOP)
+ return;
+ preempt_conditional_sti(regs);
+ do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
+ preempt_conditional_cli(regs);
+}
asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
{
@@ -667,8 +694,9 @@ asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code)
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) {
return;
}
+ preempt_conditional_sti(regs);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
- return;
+ preempt_conditional_cli(regs);
}
/* Help handler running on IST stack to switch back to user stack
@@ -973,14 +1001,14 @@ void __init trap_init(void)
static int __init oops_dummy(char *s)
{
panic_on_oops = 1;
- return -1;
+ return 1;
}
__setup("oops=", oops_dummy);
static int __init kstack_setup(char *s)
{
kstack_depth_to_print = simple_strtoul(s,NULL,0);
- return 0;
+ return 1;
}
__setup("kstack=", kstack_setup);
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 39ff0708f803..b81f473c4a19 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -65,7 +65,7 @@ SECTIONS
.data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
*(.data.cacheline_aligned)
}
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
*(.data.read_mostly)
}
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index d96a9348e5a2..1def21c9f7cd 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -102,8 +102,6 @@ EXPORT_SYMBOL(cpu_callout_map);
EXPORT_SYMBOL(screen_info);
#endif
-EXPORT_SYMBOL(get_wchan);
-
EXPORT_SYMBOL(rtc_lock);
EXPORT_SYMBOL_GPL(set_nmi_callback);
@@ -114,7 +112,6 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback);
#undef memcpy
#undef memset
#undef memmove
-#undef strlen
extern void * memset(void *,int,__kernel_size_t);
extern size_t strlen(const char *);
@@ -123,8 +120,6 @@ extern void * memcpy(void *,const void *,__kernel_size_t);
extern void * __memcpy(void *,const void *,__kernel_size_t);
EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(strlen);
-EXPORT_SYMBOL(strpbrk);
EXPORT_SYMBOL(memmove);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 316c53de47bd..55250593d8c9 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -623,6 +623,6 @@ void vmalloc_sync_all(void)
static int __init enable_pagefaulttrace(char *str)
{
page_fault_trace = 1;
- return 0;
+ return 1;
}
__setup("pagefaulttrace", enable_pagefaulttrace);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index b04415625442..4ba34e95d835 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -72,7 +72,7 @@ void show_mem(void)
show_free_areas();
printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
- for_each_pgdat(pgdat) {
+ for_each_online_pgdat(pgdat) {
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
page = pfn_to_page(pgdat->node_start_pfn + i);
total++;
@@ -305,7 +305,7 @@ static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned
if (paddr >= end)
break;
- if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
+ if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) {
set_pud(pud, __pud(0));
continue;
}
@@ -507,9 +507,8 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
/*
* Memory hotplug specific functions
- * These are only for non-NUMA machines right now.
*/
-#ifdef CONFIG_MEMORY_HOTPLUG
+#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
void online_page(struct page *page)
{
@@ -520,6 +519,38 @@ void online_page(struct page *page)
num_physpages++;
}
+#ifndef CONFIG_MEMORY_HOTPLUG
+/*
+ * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
+ * just online the pages.
+ */
+int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+{
+ int err = -EIO;
+ unsigned long pfn;
+ unsigned long total = 0, mem = 0;
+ for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+ if (pfn_valid(pfn)) {
+ online_page(pfn_to_page(pfn));
+ err = 0;
+ mem++;
+ }
+ total++;
+ }
+ if (!err) {
+ z->spanned_pages += total;
+ z->present_pages += mem;
+ z->zone_pgdat->node_spanned_pages += total;
+ z->zone_pgdat->node_present_pages += mem;
+ }
+ return err;
+}
+#endif
+
+/*
+ * Memory is added always to NORMAL zone. This means you will never get
+ * additional DMA/DMA32 memory.
+ */
int add_memory(u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(0);
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 63c72641b737..b2fac14baac0 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -100,11 +100,30 @@ int early_pfn_to_nid(unsigned long pfn)
}
#endif
+static void * __init
+early_node_mem(int nodeid, unsigned long start, unsigned long end,
+ unsigned long size)
+{
+ unsigned long mem = find_e820_area(start, end, size);
+ void *ptr;
+ if (mem != -1L)
+ return __va(mem);
+ ptr = __alloc_bootmem_nopanic(size,
+ SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
+ if (ptr == 0) {
+ printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
+ size, nodeid);
+ return NULL;
+ }
+ return ptr;
+}
+
/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
unsigned long nodedata_phys;
+ void *bootmap;
const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
start = round_up(start, ZONE_ALIGN);
@@ -114,13 +133,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
start_pfn = start >> PAGE_SHIFT;
end_pfn = end >> PAGE_SHIFT;
- nodedata_phys = find_e820_area(start, end, pgdat_size);
- if (nodedata_phys == -1L)
- panic("Cannot find memory pgdat in node %d\n", nodeid);
-
- Dprintk("nodedata_phys %lx\n", nodedata_phys);
+ node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
+ if (node_data[nodeid] == NULL)
+ return;
+ nodedata_phys = __pa(node_data[nodeid]);
- node_data[nodeid] = phys_to_virt(nodedata_phys);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
@@ -129,9 +146,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
/* Find a place for the bootmem map */
bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
- bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
- if (bootmap_start == -1L)
- panic("Not enough continuous space for bootmap on node %d", nodeid);
+ bootmap = early_node_mem(nodeid, bootmap_start, end,
+ bootmap_pages<<PAGE_SHIFT);
+ if (bootmap == NULL) {
+ if (nodedata_phys < start || nodedata_phys >= end)
+ free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
+ node_data[nodeid] = NULL;
+ return;
+ }
+ bootmap_start = __pa(bootmap);
Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
@@ -142,6 +165,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
+#ifdef CONFIG_ACPI_NUMA
+ srat_reserve_add_area(nodeid);
+#endif
node_set_online(nodeid);
}
@@ -162,11 +188,13 @@ void __init setup_node_zones(int nodeid)
memory. */
memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
limit = end_pfn << PAGE_SHIFT;
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
NODE_DATA(nodeid)->node_mem_map =
__alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
memmapsize, SMP_CACHE_BYTES,
round_down(limit - memmapsize, PAGE_SIZE),
limit);
+#endif
size_zones(zones, holes, start_pfn, end_pfn);
free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
@@ -335,6 +363,8 @@ __init int numa_setup(char *opt)
#ifdef CONFIG_ACPI_NUMA
if (!strncmp(opt,"noacpi",6))
acpi_numa = -1;
+ if (!strncmp(opt,"hotadd=", 7))
+ hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
return 1;
}
@@ -377,21 +407,6 @@ EXPORT_SYMBOL(node_data);
* Should do that.
*/
-/* Requires pfn_valid(pfn) to be true */
-struct page *pfn_to_page(unsigned long pfn)
-{
- int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT);
- return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;
-}
-EXPORT_SYMBOL(pfn_to_page);
-
-unsigned long page_to_pfn(struct page *page)
-{
- return (long)(((page) - page_zone(page)->zone_mem_map) +
- page_zone(page)->zone_start_pfn);
-}
-EXPORT_SYMBOL(page_to_pfn);
-
int pfn_valid(unsigned long pfn)
{
unsigned nid;
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2eb879590dc4..474df22c6ed2 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -15,15 +15,29 @@
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
+#include <linux/bootmem.h>
+#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
+#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
+ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
+ && !defined(CONFIG_MEMORY_HOTPLUG)
+#define RESERVE_HOTADD 1
+#endif
+
static struct acpi_table_slit *acpi_slit;
static nodemask_t nodes_parsed __initdata;
static nodemask_t nodes_found __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
+static int found_add_area __initdata;
+int hotadd_percent __initdata = 0;
+#ifndef RESERVE_HOTADD
+#define hotadd_percent 0 /* Ignore all settings */
+#endif
static u8 pxm2node[256] = { [0 ... 255] = 0xff };
/* Too small nodes confuse the VM badly. Usually they result
@@ -71,6 +85,10 @@ static __init int conflicting_nodes(unsigned long start, unsigned long end)
static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
struct bootnode *nd = &nodes[i];
+
+ if (found_add_area)
+ return;
+
if (nd->start < start) {
nd->start = start;
if (nd->end < nd->start)
@@ -88,8 +106,11 @@ static __init void bad_srat(void)
int i;
printk(KERN_ERR "SRAT: SRAT not used.\n");
acpi_numa = -1;
+ found_add_area = 0;
for (i = 0; i < MAX_LOCAL_APIC; i++)
apicid_to_node[i] = NUMA_NO_NODE;
+ for (i = 0; i < MAX_NUMNODES; i++)
+ nodes_add[i].start = nodes[i].end = 0;
}
static __init inline int srat_disabled(void)
@@ -137,7 +158,8 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
int pxm, node;
if (srat_disabled())
return;
- if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) { bad_srat();
+ if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
+ bad_srat();
return;
}
if (pa->flags.enabled == 0)
@@ -155,11 +177,116 @@ acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
pxm, pa->apic_id, node);
}
+#ifdef RESERVE_HOTADD
+/*
+ * Protect against too large hotadd areas that would fill up memory.
+ */
+static int hotadd_enough_memory(struct bootnode *nd)
+{
+ static unsigned long allocated;
+ static unsigned long last_area_end;
+ unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
+ long mem = pages * sizeof(struct page);
+ unsigned long addr;
+ unsigned long allowed;
+ unsigned long oldpages = pages;
+
+ if (mem < 0)
+ return 0;
+ allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
+ allowed = (allowed / 100) * hotadd_percent;
+ if (allocated + mem > allowed) {
+ unsigned long range;
+ /* Give them at least part of their hotadd memory upto hotadd_percent
+ It would be better to spread the limit out
+ over multiple hotplug areas, but that is too complicated
+ right now */
+ if (allocated >= allowed)
+ return 0;
+ range = allowed - allocated;
+ pages = (range / PAGE_SIZE);
+ mem = pages * sizeof(struct page);
+ nd->end = nd->start + range;
+ }
+ /* Not completely fool proof, but a good sanity check */
+ addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
+ if (addr == -1UL)
+ return 0;
+ if (pages != oldpages)
+ printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
+ pages << PAGE_SHIFT);
+ last_area_end = addr + mem;
+ allocated += mem;
+ return 1;
+}
+
+/*
+ * It is fine to add this area to the nodes data it will be used later
+ * This code supports one contigious hot add area per node.
+ */
+static int reserve_hotadd(int node, unsigned long start, unsigned long end)
+{
+ unsigned long s_pfn = start >> PAGE_SHIFT;
+ unsigned long e_pfn = end >> PAGE_SHIFT;
+ int changed = 0;
+ struct bootnode *nd = &nodes_add[node];
+
+ /* I had some trouble with strange memory hotadd regions breaking
+ the boot. Be very strict here and reject anything unexpected.
+ If you want working memory hotadd write correct SRATs.
+
+ The node size check is a basic sanity check to guard against
+ mistakes */
+ if ((signed long)(end - start) < NODE_MIN_SIZE) {
+ printk(KERN_ERR "SRAT: Hotplug area too small\n");
+ return -1;
+ }
+
+ /* This check might be a bit too strict, but I'm keeping it for now. */
+ if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
+ printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
+ return -1;
+ }
+
+ if (!hotadd_enough_memory(&nodes_add[node])) {
+ printk(KERN_ERR "SRAT: Hotplug area too large\n");
+ return -1;
+ }
+
+ /* Looks good */
+
+ found_add_area = 1;
+ if (nd->start == nd->end) {
+ nd->start = start;
+ nd->end = end;
+ changed = 1;
+ } else {
+ if (nd->start == end) {
+ nd->start = start;
+ changed = 1;
+ }
+ if (nd->end == start) {
+ nd->end = end;
+ changed = 1;
+ }
+ if (!changed)
+ printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
+ }
+
+ if ((nd->end >> PAGE_SHIFT) > end_pfn)
+ end_pfn = nd->end >> PAGE_SHIFT;
+
+ if (changed)
+ printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
+ return 0;
+}
+#endif
+
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
- struct bootnode *nd;
+ struct bootnode *nd, oldnode;
unsigned long start, end;
int node, pxm;
int i;
@@ -172,6 +299,8 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
}
if (ma->flags.enabled == 0)
return;
+ if (ma->flags.hot_pluggable && hotadd_percent == 0)
+ return;
start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
pxm = ma->proximity_domain;
@@ -181,10 +310,6 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
bad_srat();
return;
}
- /* It is fine to add this area to the nodes data it will be used later*/
- if (ma->flags.hot_pluggable == 1)
- printk(KERN_INFO "SRAT: hot plug zone found %lx - %lx \n",
- start, end);
i = conflicting_nodes(start, end);
if (i == node) {
printk(KERN_WARNING
@@ -199,6 +324,7 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
return;
}
nd = &nodes[node];
+ oldnode = *nd;
if (!node_test_and_set(node, nodes_parsed)) {
nd->start = start;
nd->end = end;
@@ -208,8 +334,19 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
if (nd->end < end)
nd->end = end;
}
+
printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
nd->start, nd->end);
+
+#ifdef RESERVE_HOTADD
+ if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
+ /* Ignore hotadd region. Undo damage */
+ printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+ *nd = oldnode;
+ if ((nd->start | nd->end) == 0)
+ node_clear(node, nodes_parsed);
+ }
+#endif
}
/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -225,6 +362,9 @@ static int nodes_cover_memory(void)
unsigned long e = nodes[i].end >> PAGE_SHIFT;
pxmram += e - s;
pxmram -= e820_hole_size(s, e);
+ pxmram -= nodes_add[i].end - nodes_add[i].start;
+ if ((long)pxmram < 0)
+ pxmram = 0;
}
e820ram = end_pfn - e820_hole_size(0, end_pfn);
@@ -258,9 +398,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
/* First clean up the node list */
for (i = 0; i < MAX_NUMNODES; i++) {
- cutoff_node(i, start, end);
- if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
+ cutoff_node(i, start, end);
+ if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
unparse_node(i);
+ node_set_offline(i);
+ }
}
if (acpi_numa <= 0)
@@ -282,6 +424,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
/* Finally register nodes */
for_each_node_mask(i, nodes_parsed)
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ /* Try again in case setup_node_bootmem missed one due
+ to missing bootmem */
+ for_each_node_mask(i, nodes_parsed)
+ if (!node_online(i))
+ setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+
for (i = 0; i < NR_CPUS; i++) {
if (cpu_to_node[i] == NUMA_NO_NODE)
continue;
@@ -303,6 +451,25 @@ static int node_to_pxm(int n)
return 0;
}
+void __init srat_reserve_add_area(int nodeid)
+{
+ if (found_add_area && nodes_add[nodeid].end) {
+ u64 total_mb;
+
+ printk(KERN_INFO "SRAT: Reserving hot-add memory space "
+ "for node %d at %Lx-%Lx\n",
+ nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
+ total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
+ >> PAGE_SHIFT;
+ total_mb *= sizeof(struct page);
+ total_mb >>= 20;
+ printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
+ "pre-allocated memory.\n", (unsigned long long)total_mb);
+ reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
+ nodes_add[nodeid].end - nodes_add[nodeid].start);
+ }
+}
+
int __node_distance(int a, int b)
{
int index;
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index e616500207e4..a2060e4d5de6 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -9,11 +9,16 @@
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/bitmap.h>
+#include <asm/e820.h>
+
#include "pci.h"
#define MMCONFIG_APER_SIZE (256*1024*1024)
+/* Verify the first 16 busses. We assume that systems with more busses
+ get MCFG right. */
+#define MAX_CHECK_BUS 16
-static DECLARE_BITMAP(fallback_slots, 32);
+static DECLARE_BITMAP(fallback_slots, 32*MAX_CHECK_BUS);
/* Static virtual mapping of the MMCONFIG aperture */
struct mmcfg_virt {
@@ -55,7 +60,8 @@ static char __iomem *get_virt(unsigned int seg, unsigned bus)
static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
{
char __iomem *addr;
- if (seg == 0 && bus == 0 && test_bit(PCI_SLOT(devfn), fallback_slots))
+ if (seg == 0 && bus < MAX_CHECK_BUS &&
+ test_bit(32*bus + PCI_SLOT(devfn), fallback_slots))
return NULL;
addr = get_virt(seg, bus);
if (!addr)
@@ -69,8 +75,10 @@ static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
char __iomem *addr;
/* Why do we have this when nobody checks it. How about a BUG()!? -AK */
- if (unlikely(!value || (bus > 255) || (devfn > 255) || (reg > 4095)))
+ if (unlikely((bus > 255) || (devfn > 255) || (reg > 4095))) {
+ *value = -1;
return -EINVAL;
+ }
addr = pci_dev_base(seg, bus, devfn);
if (!addr)
@@ -129,21 +137,26 @@ static struct pci_raw_ops pci_mmcfg = {
Normally this can be expressed in the MCFG by not listing them
and assigning suitable _SEGs, but this isn't implemented in some BIOS.
Instead try to discover all devices on bus 0 that are unreachable using MM
- and fallback for them.
- We only do this for bus 0/seg 0 */
+ and fallback for them. */
static __init void unreachable_devices(void)
{
- int i;
- for (i = 0; i < 32; i++) {
- u32 val1;
- char __iomem *addr;
-
- pci_conf1_read(0, 0, PCI_DEVFN(i,0), 0, 4, &val1);
- if (val1 == 0xffffffff)
- continue;
- addr = pci_dev_base(0, 0, PCI_DEVFN(i, 0));
- if (addr == NULL|| readl(addr) != val1) {
- set_bit(i, fallback_slots);
+ int i, k;
+ /* Use the max bus number from ACPI here? */
+ for (k = 0; k < MAX_CHECK_BUS; k++) {
+ for (i = 0; i < 32; i++) {
+ u32 val1;
+ char __iomem *addr;
+
+ pci_conf1_read(0, k, PCI_DEVFN(i,0), 0, 4, &val1);
+ if (val1 == 0xffffffff)
+ continue;
+ addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
+ if (addr == NULL|| readl(addr) != val1) {
+ set_bit(i + 32*k, fallback_slots);
+ printk(KERN_NOTICE
+ "PCI: No mmconfig possible on device %x:%x\n",
+ k, i);
+ }
}
}
}
@@ -161,6 +174,14 @@ void __init pci_mmcfg_init(void)
(pci_mmcfg_config[0].base_address == 0))
return;
+ if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
+ pci_mmcfg_config[0].base_address + MMCONFIG_APER_SIZE,
+ E820_RESERVED)) {
+ printk(KERN_ERR "PCI: BIOS Bug: MCFG area is not E820-reserved\n");
+ printk(KERN_ERR "PCI: Not using MMCONFIG.\n");
+ return;
+ }
+
/* RED-PEN i386 doesn't do _nocache right now */
pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
if (pci_mmcfg_virt == NULL) {
OpenPOWER on IntegriCloud