summaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig120
-rw-r--r--arch/x86_64/Kconfig.debug22
-rw-r--r--arch/x86_64/Makefile14
-rw-r--r--arch/x86_64/boot/Makefile9
-rw-r--r--arch/x86_64/boot/compressed/Makefile3
-rw-r--r--arch/x86_64/boot/compressed/misc.c46
-rw-r--r--arch/x86_64/boot/setup.S7
-rw-r--r--arch/x86_64/boot/tools/build.c6
-rw-r--r--arch/x86_64/boot/video.S19
-rw-r--r--arch/x86_64/crypto/Makefile3
-rw-r--r--arch/x86_64/crypto/aes-x86_64-asm.S22
-rw-r--r--arch/x86_64/crypto/aes.c23
-rw-r--r--arch/x86_64/crypto/twofish-x86_64-asm.S324
-rw-r--r--arch/x86_64/crypto/twofish.c97
-rw-r--r--arch/x86_64/defconfig357
-rw-r--r--arch/x86_64/ia32/Makefile4
-rw-r--r--arch/x86_64/ia32/audit.c37
-rw-r--r--arch/x86_64/ia32/fpu32.c1
-rw-r--r--arch/x86_64/ia32/ia32_aout.c8
-rw-r--r--arch/x86_64/ia32/ia32_binfmt.c59
-rw-r--r--arch/x86_64/ia32/ia32_signal.c55
-rw-r--r--arch/x86_64/ia32/ia32entry.S41
-rw-r--r--arch/x86_64/ia32/ptrace32.c60
-rw-r--r--arch/x86_64/ia32/sys_ia32.c80
-rw-r--r--arch/x86_64/ia32/vsyscall.lds1
-rw-r--r--arch/x86_64/kernel/Makefile17
-rw-r--r--arch/x86_64/kernel/acpi/sleep.c1
-rw-r--r--arch/x86_64/kernel/aperture.c52
-rw-r--r--arch/x86_64/kernel/apic.c262
-rw-r--r--arch/x86_64/kernel/asm-offsets.c3
-rw-r--r--arch/x86_64/kernel/audit.c64
-rw-r--r--arch/x86_64/kernel/crash.c32
-rw-r--r--arch/x86_64/kernel/e820.c293
-rw-r--r--arch/x86_64/kernel/early-quirks.c122
-rw-r--r--arch/x86_64/kernel/early_printk.c22
-rw-r--r--arch/x86_64/kernel/entry.S376
-rw-r--r--arch/x86_64/kernel/functionlist1
-rw-r--r--arch/x86_64/kernel/genapic.c1
-rw-r--r--arch/x86_64/kernel/genapic_cluster.c2
-rw-r--r--arch/x86_64/kernel/genapic_flat.c36
-rw-r--r--arch/x86_64/kernel/head.S14
-rw-r--r--arch/x86_64/kernel/head64.c43
-rw-r--r--arch/x86_64/kernel/i387.c1
-rw-r--r--arch/x86_64/kernel/i8259.c38
-rw-r--r--arch/x86_64/kernel/init_task.c5
-rw-r--r--arch/x86_64/kernel/io_apic.c545
-rw-r--r--arch/x86_64/kernel/ioport.c1
-rw-r--r--arch/x86_64/kernel/irq.c62
-rw-r--r--arch/x86_64/kernel/k8.c118
-rw-r--r--arch/x86_64/kernel/kprobes.c1
-rw-r--r--arch/x86_64/kernel/machine_kexec.c114
-rw-r--r--arch/x86_64/kernel/mce.c39
-rw-r--r--arch/x86_64/kernel/mce_amd.c511
-rw-r--r--arch/x86_64/kernel/mce_intel.c30
-rw-r--r--arch/x86_64/kernel/module.c38
-rw-r--r--arch/x86_64/kernel/mpparse.c276
-rw-r--r--arch/x86_64/kernel/nmi.c840
-rw-r--r--arch/x86_64/kernel/pci-calgary.c1069
-rw-r--r--arch/x86_64/kernel/pci-dma.c153
-rw-r--r--arch/x86_64/kernel/pci-gart.c159
-rw-r--r--arch/x86_64/kernel/pci-nommu.c12
-rw-r--r--arch/x86_64/kernel/pci-swiotlb.c8
-rw-r--r--arch/x86_64/kernel/pmtimer.c2
-rw-r--r--arch/x86_64/kernel/process.c128
-rw-r--r--arch/x86_64/kernel/ptrace.c29
-rw-r--r--arch/x86_64/kernel/reboot.c1
-rw-r--r--arch/x86_64/kernel/relocate_kernel.S171
-rw-r--r--arch/x86_64/kernel/setup.c541
-rw-r--r--arch/x86_64/kernel/setup64.c52
-rw-r--r--arch/x86_64/kernel/signal.c90
-rw-r--r--arch/x86_64/kernel/smp.c37
-rw-r--r--arch/x86_64/kernel/smpboot.c51
-rw-r--r--arch/x86_64/kernel/stacktrace.c55
-rw-r--r--arch/x86_64/kernel/suspend.c1
-rw-r--r--arch/x86_64/kernel/suspend_asm.S2
-rw-r--r--arch/x86_64/kernel/syscall.c1
-rw-r--r--arch/x86_64/kernel/tce.c194
-rw-r--r--arch/x86_64/kernel/time.c236
-rw-r--r--arch/x86_64/kernel/trampoline.S2
-rw-r--r--arch/x86_64/kernel/traps.c357
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S69
-rw-r--r--arch/x86_64/kernel/vsmp.c3
-rw-r--r--arch/x86_64/kernel/vsyscall.c102
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c115
-rw-r--r--arch/x86_64/lib/Makefile2
-rw-r--r--arch/x86_64/lib/clear_page.S47
-rw-r--r--arch/x86_64/lib/copy_page.S53
-rw-r--r--arch/x86_64/lib/copy_user.S153
-rw-r--r--arch/x86_64/lib/csum-copy.S26
-rw-r--r--arch/x86_64/lib/csum-partial.c1
-rw-r--r--arch/x86_64/lib/csum-wrappers.c1
-rw-r--r--arch/x86_64/lib/delay.c5
-rw-r--r--arch/x86_64/lib/getuser.S32
-rw-r--r--arch/x86_64/lib/iomap_copy.S10
-rw-r--r--arch/x86_64/lib/memcpy.S69
-rw-r--r--arch/x86_64/lib/memmove.c4
-rw-r--r--arch/x86_64/lib/memset.S79
-rw-r--r--arch/x86_64/lib/putuser.S32
-rw-r--r--arch/x86_64/lib/rwlock.S38
-rw-r--r--arch/x86_64/lib/thunk.S48
-rw-r--r--arch/x86_64/lib/usercopy.c13
-rw-r--r--arch/x86_64/mm/extable.c1
-rw-r--r--arch/x86_64/mm/fault.c67
-rw-r--r--arch/x86_64/mm/init.c258
-rw-r--r--arch/x86_64/mm/ioremap.c5
-rw-r--r--arch/x86_64/mm/k8topology.c6
-rw-r--r--arch/x86_64/mm/mmap.c1
-rw-r--r--arch/x86_64/mm/numa.c32
-rw-r--r--arch/x86_64/mm/pageattr.c25
-rw-r--r--arch/x86_64/mm/srat.c19
-rw-r--r--arch/x86_64/pci/Makefile3
-rw-r--r--arch/x86_64/pci/mmconfig.c44
112 files changed, 6531 insertions, 3561 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index af44130f0d65..32ae1378f35c 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -24,6 +24,18 @@ config X86
bool
default y
+config ZONE_DMA32
+ bool
+ default y
+
+config LOCKDEP_SUPPORT
+ bool
+ default y
+
+config STACKTRACE_SUPPORT
+ bool
+ default y
+
config SEMAPHORE_SLEEPERS
bool
default y
@@ -73,10 +85,17 @@ config ARCH_MAY_HAVE_PC_FDC
bool
default y
+config ARCH_POPULATES_NODE_MAP
+ def_bool y
+
config DMI
bool
default y
+config AUDIT_ARCH
+ bool
+ default y
+
source "init/Kconfig"
@@ -93,6 +112,7 @@ config X86_PC
config X86_VSMP
bool "Support for ScaleMP vSMP"
+ depends on PCI
help
Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
supposed to run on these EM64T-based machines. Only choose this option
@@ -151,6 +171,7 @@ config X86_GOOD_APIC
config MICROCODE
tristate "/dev/cpu/microcode - Intel CPU microcode support"
+ select FW_LOADER
---help---
If you say Y here the 'File systems' section, you will be
able to update the microcode on Intel processors. You will
@@ -166,6 +187,11 @@ config MICROCODE
If you use modprobe or kmod you may also want to add the line
'alias char-major-10-184 microcode' to your /etc/modules.conf file.
+config MICROCODE_OLD_INTERFACE
+ bool
+ depends on MICROCODE
+ default y
+
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
help
@@ -279,7 +305,7 @@ config NUMA
config K8_NUMA
bool "Old style AMD Opteron NUMA detection"
- depends on NUMA
+ depends on NUMA && PCI
default y
help
Enable K8 NUMA node topology detection. You should say Y here if
@@ -370,6 +396,8 @@ config HOTPLUG_CPU
can be controlled through /sys/devices/system/cpu/cpu#.
Say N if you want to disable CPU hotplug.
+config ARCH_ENABLE_MEMORY_HOTPLUG
+ def_bool y
config HPET_TIMER
bool
@@ -386,24 +414,44 @@ config HPET_EMULATE_RTC
bool "Provide RTC interrupt"
depends on HPET_TIMER && RTC=y
-config GART_IOMMU
- bool "K8 GART IOMMU support"
+# Mark as embedded because too many people got it wrong.
+# The code disables itself when not needed.
+config IOMMU
+ bool "IOMMU support" if EMBEDDED
default y
select SWIOTLB
select AGP
depends on PCI
help
- Support for hardware IOMMU in AMD's Opteron/Athlon64 Processors
- and for the bounce buffering software IOMMU.
- Needed to run systems with more than 3GB of memory properly with
- 32-bit PCI devices that do not support DAC (Double Address Cycle).
- The IOMMU can be turned off at runtime with the iommu=off parameter.
- Normally the kernel will take the right choice by itself.
- This option includes a driver for the AMD Opteron/Athlon64 IOMMU
- northbridge and a software emulation used on other systems without
- hardware IOMMU. If unsure, say Y.
-
-# need this always selected by GART_IOMMU for the VIA workaround
+ Support for full DMA access of devices with 32bit memory access only
+ on systems with more than 3GB. This is usually needed for USB,
+ sound, many IDE/SATA chipsets and some other devices.
+ Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART
+ based IOMMU and a software bounce buffer based IOMMU used on Intel
+ systems and as fallback.
+ The code is only active when needed (enough memory and limited
+ device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified
+ too.
+
+config CALGARY_IOMMU
+ bool "IBM Calgary IOMMU support"
+ select SWIOTLB
+ depends on PCI && EXPERIMENTAL
+ help
+ Support for hardware IOMMUs in IBM's xSeries x366 and x460
+ systems. Needed to run systems with more than 3GB of memory
+ properly with 32-bit PCI devices that do not support DAC
+ (Double Address Cycle). Calgary also supports bus level
+ isolation, where all DMAs pass through the IOMMU. This
+ prevents them from going anywhere except their intended
+ destination. This catches hard-to-find kernel bugs and
+ mis-behaving drivers and devices that do not use the DMA-API
+ properly to set up their DMA buffers. The IOMMU can be
+ turned off at boot time with the iommu=off parameter.
+ Normally the kernel will make the right choice by itself.
+ If unsure, say Y.
+
+# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
bool
@@ -433,15 +481,14 @@ config X86_MCE_AMD
the DRAM Error Threshold.
config KEXEC
- bool "kexec system call (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ bool "kexec system call"
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
- but it is indepedent of the system firmware. And like a reboot
+ but it is independent of the system firmware. And like a reboot
you can start any kernel with it, not just Linux.
- The name comes from the similiarity to the exec system call.
+ The name comes from the similarity to the exec system call.
It is an ongoing process to be certain the hardware in a machine
is properly shutdown, so do not be surprised if this code does not
@@ -453,7 +500,14 @@ config CRASH_DUMP
bool "kernel crash dumps (EXPERIMENTAL)"
depends on EXPERIMENTAL
help
- Generate crash dump after being started by kexec.
+ Generate crash dump after being started by kexec.
+ This should be normally only set in special crash dump kernels
+ which are loaded in the main kernel with kexec-tools into
+ a specially reserved region and then later executed after
+ a crash by kdump/kexec. The crash dump kernel must be compiled
+ to a memory address not used by the main kernel or BIOS using
+ PHYSICAL_START.
+ For more details see Documentation/kdump/kdump.txt
config PHYSICAL_START
hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
@@ -491,6 +545,30 @@ config SECCOMP
If unsure, say Y. Only embedded should say N here.
+config CC_STACKPROTECTOR
+ bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ This option turns on the -fstack-protector GCC feature. This
+ feature puts, at the beginning of critical functions, a canary
+ value on the stack just before the return address, and validates
+ the value just before actually returning. Stack based buffer
+ overflows (that need to overwrite this return address) now also
+ overwrite the canary, which gets detected and the attack is then
+ neutralized via a kernel panic.
+
+ This feature requires gcc version 4.2 or above, or a distribution
+ gcc with the feature backported. Older versions are automatically
+ detected and for those versions, this configuration option is ignored.
+
+config CC_STACKPROTECTOR_ALL
+ bool "Use stack-protector for all functions"
+ depends on CC_STACKPROTECTOR
+ help
+ Normally, GCC only inserts the canary value protection for
+ functions that use large-ish on-stack buffers. By enabling
+ this option, GCC will be asked to do this for ALL functions.
+
source kernel/Kconfig.hz
config REORDER
@@ -501,6 +579,10 @@ config REORDER
optimal TLB usage. If you have pretty much any version of binutils,
this can increase your kernel build time by roughly one minute.
+config K8_NB
+ def_bool y
+ depends on AGP_AMD64 || IOMMU || (PCI && NUMA)
+
endmenu
#
diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug
index ea31b4c62105..775d211a5cf9 100644
--- a/arch/x86_64/Kconfig.debug
+++ b/arch/x86_64/Kconfig.debug
@@ -1,5 +1,9 @@
menu "Kernel hacking"
+config TRACE_IRQFLAGS_SUPPORT
+ bool
+ default y
+
source "lib/Kconfig.debug"
config DEBUG_RODATA
@@ -13,7 +17,7 @@ config DEBUG_RODATA
If in doubt, say "N".
config IOMMU_DEBUG
- depends on GART_IOMMU && DEBUG_KERNEL
+ depends on IOMMU && DEBUG_KERNEL
bool "Enable IOMMU debugging"
help
Force the IOMMU to on even when you have less than 4GB of
@@ -35,6 +39,22 @@ config IOMMU_LEAK
Add a simple leak tracer to the IOMMU code. This is useful when you
are debugging a buggy device driver that leaks IOMMU mappings.
+config DEBUG_STACKOVERFLOW
+ bool "Check for stack overflows"
+ depends on DEBUG_KERNEL
+ help
+ This option will cause messages to be printed if free stack space
+ drops below a certain limit.
+
+config DEBUG_STACK_USAGE
+ bool "Stack utilization instrumentation"
+ depends on DEBUG_KERNEL
+ help
+ Enables the display of the minimum amount of free stack which each
+ task has ever had available in the sysrq-T and sysrq-P debug output.
+
+ This option will slow down process creation somewhat.
+
#config X86_REMOTE_DEBUG
# bool "kgdb debugging stub"
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index e573e2ab5510..1c0f18d4f887 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -27,6 +27,7 @@ LDFLAGS_vmlinux :=
CHECKFLAGS += -D__x86_64__ -m64
cflags-y :=
+cflags-kernel-y :=
cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
@@ -35,7 +36,7 @@ cflags-y += -m64
cflags-y += -mno-red-zone
cflags-y += -mcmodel=kernel
cflags-y += -pipe
-cflags-$(CONFIG_REORDER) += -ffunction-sections
+cflags-kernel-$(CONFIG_REORDER) += -ffunction-sections
# this makes reading assembly source easier, but produces worse code
# actually it makes the kernel smaller too.
cflags-y += -fno-reorder-blocks
@@ -53,8 +54,19 @@ endif
cflags-y += $(call cc-option,-funit-at-a-time)
# prevent gcc from generating any FP code by mistake
cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
+# does binutils support CFI?
+cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
+AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,)
+
+# is .cfi_signal_frame supported too?
+cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
+AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,)
+
+cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector )
+cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector-all )
CFLAGS += $(cflags-y)
+CFLAGS_KERNEL += $(cflags-kernel-y)
AFLAGS += -m64
head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile
index 43ee6c50c277..deb063e7762d 100644
--- a/arch/x86_64/boot/Makefile
+++ b/arch/x86_64/boot/Makefile
@@ -107,8 +107,13 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf
isoimage: $(BOOTIMAGE)
-rm -rf $(obj)/isoimage
mkdir $(obj)/isoimage
- cp `echo /usr/lib*/syslinux/isolinux.bin | awk '{ print $1; }'` \
- $(obj)/isoimage
+ for i in lib lib64 share end ; do \
+ if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \
+ cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \
+ break ; \
+ fi ; \
+ if [ $$i = end ] ; then exit 1 ; fi ; \
+ done
cp $(BOOTIMAGE) $(obj)/isoimage/linux
echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
if [ -f '$(FDINITRD)' ] ; then \
diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile
index f89d96f11a9f..e70fa6e1da08 100644
--- a/arch/x86_64/boot/compressed/Makefile
+++ b/arch/x86_64/boot/compressed/Makefile
@@ -7,7 +7,8 @@
#
targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o
-EXTRA_AFLAGS := -traditional -m32
+EXTRA_AFLAGS := -traditional
+AFLAGS := $(subst -m64,-m32,$(AFLAGS))
# cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with
# -m32
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index cf4b88c416dc..3755b2e394d0 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -77,11 +77,11 @@ static void gzip_release(void **);
*/
static unsigned char *real_mode; /* Pointer to real-mode data */
-#define EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
+#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2))
#ifndef STANDARD_MEMORY_BIOS_CALL
-#define ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
+#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0))
#endif
-#define SCREEN_INFO (*(struct screen_info *)(real_mode+0))
+#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))
extern unsigned char input_data[];
extern int input_len;
@@ -92,9 +92,9 @@ static unsigned long output_ptr = 0;
static void *malloc(int size);
static void free(void *where);
-
-void* memset(void* s, int c, unsigned n);
-void* memcpy(void* dest, const void* src, unsigned n);
+
+static void *memset(void *s, int c, unsigned n);
+static void *memcpy(void *dest, const void *src, unsigned n);
static void putstr(const char *);
@@ -162,8 +162,8 @@ static void putstr(const char *s)
int x,y,pos;
char c;
- x = SCREEN_INFO.orig_x;
- y = SCREEN_INFO.orig_y;
+ x = RM_SCREEN_INFO.orig_x;
+ y = RM_SCREEN_INFO.orig_y;
while ( ( c = *s++ ) != '\0' ) {
if ( c == '\n' ) {
@@ -184,8 +184,8 @@ static void putstr(const char *s)
}
}
- SCREEN_INFO.orig_x = x;
- SCREEN_INFO.orig_y = y;
+ RM_SCREEN_INFO.orig_x = x;
+ RM_SCREEN_INFO.orig_y = y;
pos = (x + cols * y) * 2; /* Update cursor position */
outb_p(14, vidport);
@@ -194,7 +194,7 @@ static void putstr(const char *s)
outb_p(0xff & (pos >> 1), vidport+1);
}
-void* memset(void* s, int c, unsigned n)
+static void* memset(void* s, int c, unsigned n)
{
int i;
char *ss = (char*)s;
@@ -203,7 +203,7 @@ void* memset(void* s, int c, unsigned n)
return s;
}
-void* memcpy(void* dest, const void* src, unsigned n)
+static void* memcpy(void* dest, const void* src, unsigned n)
{
int i;
char *d = (char *)dest, *s = (char *)src;
@@ -278,15 +278,15 @@ static void error(char *x)
putstr(x);
putstr("\n\n -- System halted");
- while(1);
+ while(1); /* Halt */
}
-void setup_normal_output_buffer(void)
+static void setup_normal_output_buffer(void)
{
#ifdef STANDARD_MEMORY_BIOS_CALL
- if (EXT_MEM_K < 1024) error("Less than 2MB of memory");
+ if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");
#else
- if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory");
+ if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");
#endif
output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */
free_mem_end_ptr = (long)real_mode;
@@ -297,13 +297,13 @@ struct moveparams {
uch *high_buffer_start; int hcount;
};
-void setup_output_buffer_if_we_run_high(struct moveparams *mv)
+static void setup_output_buffer_if_we_run_high(struct moveparams *mv)
{
high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);
#ifdef STANDARD_MEMORY_BIOS_CALL
- if (EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
+ if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");
#else
- if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
+ if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");
#endif
mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;
low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX
@@ -319,7 +319,7 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv)
mv->high_buffer_start = high_buffer_start;
}
-void close_output_buffer_if_we_run_high(struct moveparams *mv)
+static void close_output_buffer_if_we_run_high(struct moveparams *mv)
{
if (bytes_out > low_buffer_size) {
mv->lcount = low_buffer_size;
@@ -335,7 +335,7 @@ int decompress_kernel(struct moveparams *mv, void *rmode)
{
real_mode = rmode;
- if (SCREEN_INFO.orig_video_mode == 7) {
+ if (RM_SCREEN_INFO.orig_video_mode == 7) {
vidmem = (char *) 0xb0000;
vidport = 0x3b4;
} else {
@@ -343,8 +343,8 @@ int decompress_kernel(struct moveparams *mv, void *rmode)
vidport = 0x3d4;
}
- lines = SCREEN_INFO.orig_video_lines;
- cols = SCREEN_INFO.orig_video_cols;
+ lines = RM_SCREEN_INFO.orig_video_lines;
+ cols = RM_SCREEN_INFO.orig_video_cols;
if (free_mem_ptr < 0x100000) setup_normal_output_buffer();
else setup_output_buffer_if_we_run_high(mv);
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
index 12ea0b6c52e2..c3bfd223ab49 100644
--- a/arch/x86_64/boot/setup.S
+++ b/arch/x86_64/boot/setup.S
@@ -45,9 +45,8 @@
* Added long mode checking and SSE force. March 2003, Andi Kleen.
*/
-#include <linux/config.h>
#include <asm/segment.h>
-#include <linux/version.h>
+#include <linux/utsrelease.h>
#include <linux/compile.h>
#include <asm/boot.h>
#include <asm/e820.h>
@@ -527,12 +526,12 @@ is_disk1:
movw %cs, %ax # aka SETUPSEG
subw $DELTA_INITSEG, %ax # aka INITSEG
movw %ax, %ds
- movw $0, (0x1ff) # default is no pointing device
+ movb $0, (0x1ff) # default is no pointing device
int $0x11 # int 0x11: equipment list
testb $0x04, %al # check if mouse installed
jz no_psmouse
- movw $0xAA, (0x1ff) # device present
+ movb $0xAA, (0x1ff) # device present
no_psmouse:
#include "../../i386/boot/edd.S"
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c
index c44f5e2ec100..eae86691709a 100644
--- a/arch/x86_64/boot/tools/build.c
+++ b/arch/x86_64/boot/tools/build.c
@@ -149,10 +149,8 @@ int main(int argc, char ** argv)
sz = sb.st_size;
fprintf (stderr, "System is %d kB\n", sz/1024);
sys_size = (sz + 15) / 16;
- /* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */
- if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE))
- die("System is too big. Try using %smodules.",
- is_big_kernel ? "" : "bzImage or ");
+ if (!is_big_kernel && sys_size > DEF_SYSSIZE)
+ die("System is too big. Try using bzImage or modules.");
while (sz > 0) {
int l, n;
diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S
index 32327bb37aff..2aa565c136e5 100644
--- a/arch/x86_64/boot/video.S
+++ b/arch/x86_64/boot/video.S
@@ -1929,6 +1929,7 @@ skip10: movb %ah, %al
ret
store_edid:
+#ifdef CONFIG_FIRMWARE_EDID
pushw %es # just save all registers
pushw %ax
pushw %bx
@@ -1946,6 +1947,22 @@ store_edid:
rep
stosl
+ pushw %es # save ES
+ xorw %di, %di # Report Capability
+ pushw %di
+ popw %es # ES:DI must be 0:0
+ movw $0x4f15, %ax
+ xorw %bx, %bx
+ xorw %cx, %cx
+ int $0x10
+ popw %es # restore ES
+
+ cmpb $0x00, %ah # call successful
+ jne no_edid
+
+ cmpb $0x4f, %al # function supported
+ jne no_edid
+
movw $0x4f15, %ax # do VBE/DDC
movw $0x01, %bx
movw $0x00, %cx
@@ -1953,12 +1970,14 @@ store_edid:
movw $0x140, %di
int $0x10
+no_edid:
popw %di # restore all registers
popw %dx
popw %cx
popw %bx
popw %ax
popw %es
+#endif
ret
# VIDEO_SELECT-only variables
diff --git a/arch/x86_64/crypto/Makefile b/arch/x86_64/crypto/Makefile
index 426d20f4b72e..15b538a8b7f7 100644
--- a/arch/x86_64/crypto/Makefile
+++ b/arch/x86_64/crypto/Makefile
@@ -5,5 +5,8 @@
#
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
aes-x86_64-y := aes-x86_64-asm.o aes.o
+twofish-x86_64-y := twofish-x86_64-asm.o twofish.o
+
diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S
index 483cbb23ab8d..26b40de4d0b0 100644
--- a/arch/x86_64/crypto/aes-x86_64-asm.S
+++ b/arch/x86_64/crypto/aes-x86_64-asm.S
@@ -15,6 +15,10 @@
.text
+#include <asm/asm-offsets.h>
+
+#define BASE crypto_tfm_ctx_offset
+
#define R1 %rax
#define R1E %eax
#define R1X %ax
@@ -46,19 +50,19 @@
#define R10 %r10
#define R11 %r11
-#define prologue(FUNC,BASE,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
+#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
.global FUNC; \
.type FUNC,@function; \
.align 8; \
FUNC: movq r1,r2; \
movq r3,r4; \
- leaq BASE+52(r8),r9; \
+ leaq BASE+KEY+52(r8),r9; \
movq r10,r11; \
movl (r7),r5 ## E; \
movl 4(r7),r1 ## E; \
movl 8(r7),r6 ## E; \
movl 12(r7),r7 ## E; \
- movl (r8),r10 ## E; \
+ movl BASE(r8),r10 ## E; \
xorl -48(r9),r5 ## E; \
xorl -44(r9),r1 ## E; \
xorl -40(r9),r6 ## E; \
@@ -128,8 +132,8 @@ FUNC: movq r1,r2; \
movl r3 ## E,r1 ## E; \
movl r4 ## E,r2 ## E;
-#define entry(FUNC,BASE,B128,B192) \
- prologue(FUNC,BASE,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
+#define entry(FUNC,KEY,B128,B192) \
+ prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
@@ -147,9 +151,9 @@ FUNC: movq r1,r2; \
#define decrypt_final(TAB,OFFSET) \
round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
-/* void aes_encrypt(void *ctx, u8 *out, const u8 *in) */
+/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
- entry(aes_encrypt,0,enc128,enc192)
+ entry(aes_enc_blk,0,enc128,enc192)
encrypt_round(aes_ft_tab,-96)
encrypt_round(aes_ft_tab,-80)
enc192: encrypt_round(aes_ft_tab,-64)
@@ -166,9 +170,9 @@ enc128: encrypt_round(aes_ft_tab,-32)
encrypt_final(aes_fl_tab,112)
return
-/* void aes_decrypt(void *ctx, u8 *out, const u8 *in) */
+/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
- entry(aes_decrypt,240,dec128,dec192)
+ entry(aes_dec_blk,240,dec128,dec192)
decrypt_round(aes_it_tab,-96)
decrypt_round(aes_it_tab,-80)
dec192: decrypt_round(aes_it_tab,-64)
diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c
index 6f77e7700d32..5cdb13ea5cc2 100644
--- a/arch/x86_64/crypto/aes.c
+++ b/arch/x86_64/crypto/aes.c
@@ -227,14 +227,15 @@ static void __init gen_tabs(void)
t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \
}
-static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len,
- u32 *flags)
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+ unsigned int key_len)
{
- struct aes_ctx *ctx = ctx_arg;
+ struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
const __le32 *key = (const __le32 *)in_key;
+ u32 *flags = &tfm->crt_flags;
u32 i, j, t, u, v, w;
- if (key_len != 16 && key_len != 24 && key_len != 32) {
+ if (key_len % 8) {
*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
@@ -283,8 +284,18 @@ static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len,
return 0;
}
-extern void aes_encrypt(void *ctx_arg, u8 *out, const u8 *in);
-extern void aes_decrypt(void *ctx_arg, u8 *out, const u8 *in);
+asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
+asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ aes_enc_blk(tfm, dst, src);
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ aes_dec_blk(tfm, dst, src);
+}
static struct crypto_alg aes_alg = {
.cra_name = "aes",
diff --git a/arch/x86_64/crypto/twofish-x86_64-asm.S b/arch/x86_64/crypto/twofish-x86_64-asm.S
new file mode 100644
index 000000000000..35974a586615
--- /dev/null
+++ b/arch/x86_64/crypto/twofish-x86_64-asm.S
@@ -0,0 +1,324 @@
+/***************************************************************************
+* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
+* *
+* This program is free software; you can redistribute it and/or modify *
+* it under the terms of the GNU General Public License as published by *
+* the Free Software Foundation; either version 2 of the License, or *
+* (at your option) any later version. *
+* *
+* This program is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+* GNU General Public License for more details. *
+* *
+* You should have received a copy of the GNU General Public License *
+* along with this program; if not, write to the *
+* Free Software Foundation, Inc., *
+* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+***************************************************************************/
+
+.file "twofish-x86_64-asm.S"
+.text
+
+#include <asm/asm-offsets.h>
+
+#define a_offset 0
+#define b_offset 4
+#define c_offset 8
+#define d_offset 12
+
+/* Structure of the crypto context struct*/
+
+#define s0 0 /* S0 Array 256 Words each */
+#define s1 1024 /* S1 Array */
+#define s2 2048 /* S2 Array */
+#define s3 3072 /* S3 Array */
+#define w 4096 /* 8 whitening keys (word) */
+#define k 4128 /* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0 %rax
+#define R0D %eax
+#define R0B %al
+#define R0H %ah
+
+#define R1 %rbx
+#define R1D %ebx
+#define R1B %bl
+#define R1H %bh
+
+#define R2 %rcx
+#define R2D %ecx
+#define R2B %cl
+#define R2H %ch
+
+#define R3 %rdx
+#define R3D %edx
+#define R3B %dl
+#define R3H %dh
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+ xor w+offset(context), src;
+
+/* performs output whitening */
+#define output_whitening(src,context,offset)\
+ xor w+16+offset(context), src;
+
+
+/*
+ * a input register containing a (rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ */
+#define encrypt_round(a,b,c,d,round)\
+ movzx b ## B, %edi;\
+ mov s1(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ mov s2(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s3(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ xor (%r11,%rdi,4), %r9d;\
+ movzx b ## H, %edi;\
+ ror $15, b ## D;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## H, %edi;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ rol $15, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D;
+
+/*
+ * a input register containing a(rotated 16)
+ * b input register containing b
+ * c input register containing c
+ * d input register containing d (already rol $1)
+ * operations on a and b are interleaved to increase performance
+ * during the round a and b are prepared for the output whitening
+ */
+#define encrypt_last_round(a,b,c,d,round)\
+ mov b ## D, %r10d;\
+ shl $32, %r10;\
+ movzx b ## B, %edi;\
+ mov s1(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ mov s2(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s3(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ xor (%r11,%rdi,4), %r9d;\
+ xor a, %r10;\
+ movzx b ## H, %edi;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## H, %edi;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ ror $1, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D
+
+/*
+ * a input register containing a
+ * b input register containing b (rotated 16)
+ * c input register containing c (already rol $1)
+ * d input register containing d
+ * operations on a and b are interleaved to increase performance
+ */
+#define decrypt_round(a,b,c,d,round)\
+ movzx a ## B, %edi;\
+ mov (%r11,%rdi,4), %r9d;\
+ movzx b ## B, %edi;\
+ mov s3(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $16, a ## D;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## B, %edi;\
+ xor s2(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s1(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ ror $15, a ## D;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D;\
+ rol $15, d ## D;
+
+/*
+ * a input register containing a
+ * b input register containing b
+ * c input register containing c (already rol $1)
+ * d input register containing d
+ * operations on a and b are interleaved to increase performance
+ * during the round a and b are prepared for the output whitening
+ */
+#define decrypt_last_round(a,b,c,d,round)\
+ movzx a ## B, %edi;\
+ mov (%r11,%rdi,4), %r9d;\
+ movzx b ## B, %edi;\
+ mov s3(%r11,%rdi,4),%r8d;\
+ movzx b ## H, %edi;\
+ ror $16, b ## D;\
+ xor (%r11,%rdi,4), %r8d;\
+ movzx a ## H, %edi;\
+ mov b ## D, %r10d;\
+ shl $32, %r10;\
+ xor a, %r10;\
+ ror $16, a ## D;\
+ xor s1(%r11,%rdi,4),%r9d;\
+ movzx b ## B, %edi;\
+ xor s1(%r11,%rdi,4),%r8d;\
+ movzx a ## B, %edi;\
+ xor s2(%r11,%rdi,4),%r9d;\
+ movzx b ## H, %edi;\
+ xor s2(%r11,%rdi,4),%r8d;\
+ movzx a ## H, %edi;\
+ xor s3(%r11,%rdi,4),%r9d;\
+ add %r8d, %r9d;\
+ add %r9d, %r8d;\
+ add k+round(%r11), %r9d;\
+ xor %r9d, c ## D;\
+ add k+4+round(%r11),%r8d;\
+ xor %r8d, d ## D;\
+ ror $1, d ## D;
+
+.align 8
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+twofish_enc_blk:
+ pushq R1
+
+ /* %rdi contains the crypto tfm address */
+ /* %rsi contains the output address */
+ /* %rdx contains the input address */
+ add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
+ /* ctx address is moved to free one non-rex register
+ as target for the 8bit high operations */
+ mov %rdi, %r11
+
+ movq (R3), R1
+ movq 8(R3), R3
+ input_whitening(R1,%r11,a_offset)
+ input_whitening(R3,%r11,c_offset)
+ mov R1D, R0D
+ rol $16, R0D
+ shr $32, R1
+ mov R3D, R2D
+ shr $32, R3
+ rol $1, R3D
+
+ encrypt_round(R0,R1,R2,R3,0);
+ encrypt_round(R2,R3,R0,R1,8);
+ encrypt_round(R0,R1,R2,R3,2*8);
+ encrypt_round(R2,R3,R0,R1,3*8);
+ encrypt_round(R0,R1,R2,R3,4*8);
+ encrypt_round(R2,R3,R0,R1,5*8);
+ encrypt_round(R0,R1,R2,R3,6*8);
+ encrypt_round(R2,R3,R0,R1,7*8);
+ encrypt_round(R0,R1,R2,R3,8*8);
+ encrypt_round(R2,R3,R0,R1,9*8);
+ encrypt_round(R0,R1,R2,R3,10*8);
+ encrypt_round(R2,R3,R0,R1,11*8);
+ encrypt_round(R0,R1,R2,R3,12*8);
+ encrypt_round(R2,R3,R0,R1,13*8);
+ encrypt_round(R0,R1,R2,R3,14*8);
+ encrypt_last_round(R2,R3,R0,R1,15*8);
+
+
+ output_whitening(%r10,%r11,a_offset)
+ movq %r10, (%rsi)
+
+ shl $32, R1
+ xor R0, R1
+
+ output_whitening(R1,%r11,c_offset)
+ movq R1, 8(%rsi)
+
+ popq R1
+ movq $1,%rax
+ ret
+
+twofish_dec_blk:
+ pushq R1
+
+ /* %rdi contains the crypto tfm address */
+ /* %rsi contains the output address */
+ /* %rdx contains the input address */
+ add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
+ /* ctx address is moved to free one non-rex register
+ as target for the 8bit high operations */
+ mov %rdi, %r11
+
+ movq (R3), R1
+ movq 8(R3), R3
+ output_whitening(R1,%r11,a_offset)
+ output_whitening(R3,%r11,c_offset)
+ mov R1D, R0D
+ shr $32, R1
+ rol $16, R1D
+ mov R3D, R2D
+ shr $32, R3
+ rol $1, R2D
+
+ decrypt_round(R0,R1,R2,R3,15*8);
+ decrypt_round(R2,R3,R0,R1,14*8);
+ decrypt_round(R0,R1,R2,R3,13*8);
+ decrypt_round(R2,R3,R0,R1,12*8);
+ decrypt_round(R0,R1,R2,R3,11*8);
+ decrypt_round(R2,R3,R0,R1,10*8);
+ decrypt_round(R0,R1,R2,R3,9*8);
+ decrypt_round(R2,R3,R0,R1,8*8);
+ decrypt_round(R0,R1,R2,R3,7*8);
+ decrypt_round(R2,R3,R0,R1,6*8);
+ decrypt_round(R0,R1,R2,R3,5*8);
+ decrypt_round(R2,R3,R0,R1,4*8);
+ decrypt_round(R0,R1,R2,R3,3*8);
+ decrypt_round(R2,R3,R0,R1,2*8);
+ decrypt_round(R0,R1,R2,R3,1*8);
+ decrypt_last_round(R2,R3,R0,R1,0);
+
+ input_whitening(%r10,%r11,a_offset)
+ movq %r10, (%rsi)
+
+ shl $32, R1
+ xor R0, R1
+
+ input_whitening(R1,%r11,c_offset)
+ movq R1, 8(%rsi)
+
+ popq R1
+ movq $1,%rax
+ ret
diff --git a/arch/x86_64/crypto/twofish.c b/arch/x86_64/crypto/twofish.c
new file mode 100644
index 000000000000..182d91d5cfb9
--- /dev/null
+++ b/arch/x86_64/crypto/twofish.c
@@ -0,0 +1,97 @@
+/*
+ * Glue Code for optimized x86_64 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <crypto/twofish.h>
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+
+static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ twofish_enc_blk(tfm, dst, src);
+}
+
+static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+ twofish_dec_blk(tfm, dst, src);
+}
+
+static struct crypto_alg alg = {
+ .cra_name = "twofish",
+ .cra_driver_name = "twofish-x86_64",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_blocksize = TF_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct twofish_ctx),
+ .cra_alignmask = 3,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(alg.cra_list),
+ .cra_u = {
+ .cipher = {
+ .cia_min_keysize = TF_MIN_KEY_SIZE,
+ .cia_max_keysize = TF_MAX_KEY_SIZE,
+ .cia_setkey = twofish_setkey,
+ .cia_encrypt = twofish_encrypt,
+ .cia_decrypt = twofish_decrypt
+ }
+ }
+};
+
+static int __init init(void)
+{
+ return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+ crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
+MODULE_ALIAS("twofish");
diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig
index 69db0c0721d1..4844b543bed0 100644
--- a/arch/x86_64/defconfig
+++ b/arch/x86_64/defconfig
@@ -1,11 +1,14 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.17-rc1-git11
-# Sun Apr 16 07:22:36 2006
+# Linux kernel version: 2.6.18-git7
+# Wed Sep 27 21:53:10 2006
#
CONFIG_X86_64=y
CONFIG_64BIT=y
CONFIG_X86=y
+CONFIG_ZONE_DMA32=y
+CONFIG_LOCKDEP_SUPPORT=y
+CONFIG_STACKTRACE_SUPPORT=y
CONFIG_SEMAPHORE_SLEEPERS=y
CONFIG_MMU=y
CONFIG_RWSEM_GENERIC_SPINLOCK=y
@@ -17,6 +20,8 @@ CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_IOMAP=y
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
CONFIG_DMI=y
+CONFIG_AUDIT_ARCH=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
#
# Code maturity level options
@@ -34,17 +39,17 @@ CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_BSD_PROCESS_ACCT is not set
-CONFIG_SYSCTL=y
+# CONFIG_TASKSTATS is not set
# CONFIG_AUDIT is not set
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
# CONFIG_CPUSETS is not set
# CONFIG_RELAY is not set
CONFIG_INITRAMFS_SOURCE=""
-CONFIG_UID16=y
-CONFIG_VM86=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
# CONFIG_EMBEDDED is not set
+CONFIG_UID16=y
+CONFIG_SYSCTL=y
CONFIG_KALLSYMS=y
CONFIG_KALLSYMS_ALL=y
# CONFIG_KALLSYMS_EXTRA_PASS is not set
@@ -57,7 +62,8 @@ CONFIG_FUTEX=y
CONFIG_EPOLL=y
CONFIG_SHMEM=y
CONFIG_SLAB=y
-CONFIG_DOUBLEFAULT=y
+CONFIG_VM_EVENT_COUNTERS=y
+CONFIG_RT_MUTEXES=y
# CONFIG_TINY_SHMEM is not set
CONFIG_BASE_SMALL=0
# CONFIG_SLOB is not set
@@ -138,13 +144,16 @@ CONFIG_NEED_MULTIPLE_NODES=y
# CONFIG_SPARSEMEM_STATIC is not set
CONFIG_SPLIT_PTLOCK_CPUS=4
CONFIG_MIGRATION=y
+CONFIG_RESOURCES_64BIT=y
CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y
CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y
CONFIG_NR_CPUS=32
CONFIG_HOTPLUG_CPU=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_HPET_TIMER=y
CONFIG_HPET_EMULATE_RTC=y
-CONFIG_GART_IOMMU=y
+CONFIG_IOMMU=y
+# CONFIG_CALGARY_IOMMU is not set
CONFIG_SWIOTLB=y
CONFIG_X86_MCE=y
CONFIG_X86_MCE_INTEL=y
@@ -153,11 +162,13 @@ CONFIG_X86_MCE_AMD=y
# CONFIG_CRASH_DUMP is not set
CONFIG_PHYSICAL_START=0x200000
CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
# CONFIG_HZ_100 is not set
CONFIG_HZ_250=y
# CONFIG_HZ_1000 is not set
CONFIG_HZ=250
# CONFIG_REORDER is not set
+CONFIG_K8_NB=y
CONFIG_GENERIC_HARDIRQS=y
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_ISA_DMA_API=y
@@ -169,6 +180,7 @@ CONFIG_GENERIC_PENDING_IRQ=y
CONFIG_PM=y
# CONFIG_PM_LEGACY is not set
# CONFIG_PM_DEBUG is not set
+# CONFIG_PM_SYSFS_DEPRECATED is not set
CONFIG_SOFTWARE_SUSPEND=y
CONFIG_PM_STD_PARTITION=""
CONFIG_SUSPEND_SMP=y
@@ -186,13 +198,14 @@ CONFIG_ACPI_BUTTON=y
# CONFIG_ACPI_VIDEO is not set
# CONFIG_ACPI_HOTKEY is not set
CONFIG_ACPI_FAN=y
+# CONFIG_ACPI_DOCK is not set
CONFIG_ACPI_PROCESSOR=y
CONFIG_ACPI_HOTPLUG_CPU=y
CONFIG_ACPI_THERMAL=y
CONFIG_ACPI_NUMA=y
# CONFIG_ACPI_ASUS is not set
# CONFIG_ACPI_IBM is not set
-CONFIG_ACPI_TOSHIBA=y
+# CONFIG_ACPI_TOSHIBA is not set
CONFIG_ACPI_BLACKLIST_YEAR=0
# CONFIG_ACPI_DEBUG is not set
CONFIG_ACPI_EC=y
@@ -200,14 +213,14 @@ CONFIG_ACPI_POWER=y
CONFIG_ACPI_SYSTEM=y
CONFIG_X86_PM_TIMER=y
CONFIG_ACPI_CONTAINER=y
-CONFIG_ACPI_HOTPLUG_MEMORY=y
+# CONFIG_ACPI_SBS is not set
#
# CPU Frequency scaling
#
CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_TABLE=y
-# CONFIG_CPU_FREQ_DEBUG is not set
+CONFIG_CPU_FREQ_DEBUG=y
CONFIG_CPU_FREQ_STAT=y
# CONFIG_CPU_FREQ_STAT_DETAILS is not set
CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
@@ -240,6 +253,7 @@ CONFIG_PCI_DIRECT=y
CONFIG_PCI_MMCONFIG=y
CONFIG_PCIEPORTBUS=y
CONFIG_PCI_MSI=y
+# CONFIG_PCI_MULTITHREAD_PROBE is not set
# CONFIG_PCI_DEBUG is not set
#
@@ -293,19 +307,29 @@ CONFIG_IP_PNP_DHCP=y
# CONFIG_INET_IPCOMP is not set
# CONFIG_INET_XFRM_TUNNEL is not set
# CONFIG_INET_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_TUNNEL is not set
CONFIG_INET_DIAG=y
CONFIG_INET_TCP_DIAG=y
# CONFIG_TCP_CONG_ADVANCED is not set
-CONFIG_TCP_CONG_BIC=y
+CONFIG_TCP_CONG_CUBIC=y
+CONFIG_DEFAULT_TCP_CONG="cubic"
CONFIG_IPV6=y
# CONFIG_IPV6_PRIVACY is not set
# CONFIG_IPV6_ROUTER_PREF is not set
# CONFIG_INET6_AH is not set
# CONFIG_INET6_ESP is not set
# CONFIG_INET6_IPCOMP is not set
+# CONFIG_IPV6_MIP6 is not set
# CONFIG_INET6_XFRM_TUNNEL is not set
# CONFIG_INET6_TUNNEL is not set
+# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET6_XFRM_MODE_TUNNEL is not set
+# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
# CONFIG_IPV6_TUNNEL is not set
+# CONFIG_IPV6_SUBTREES is not set
+# CONFIG_IPV6_MULTIPLE_TABLES is not set
+# CONFIG_NETWORK_SECMARK is not set
# CONFIG_NETFILTER is not set
#
@@ -331,7 +355,6 @@ CONFIG_IPV6=y
# CONFIG_ATALK is not set
# CONFIG_X25 is not set
# CONFIG_LAPB is not set
-# CONFIG_NET_DIVERT is not set
# CONFIG_ECONET is not set
# CONFIG_WAN_ROUTER is not set
@@ -344,6 +367,7 @@ CONFIG_IPV6=y
# Network testing
#
# CONFIG_NET_PKTGEN is not set
+# CONFIG_NET_TCPPROBE is not set
# CONFIG_HAMRADIO is not set
# CONFIG_IRDA is not set
# CONFIG_BT is not set
@@ -360,6 +384,7 @@ CONFIG_STANDALONE=y
CONFIG_PREVENT_FIRMWARE_BUILD=y
CONFIG_FW_LOADER=y
# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_SYS_HYPERVISOR is not set
#
# Connector - unified userspace <-> kernelspace linker
@@ -398,6 +423,7 @@ CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_COUNT=16
CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024
CONFIG_BLK_DEV_INITRD=y
# CONFIG_CDROM_PKTCDVD is not set
# CONFIG_ATA_OVER_ETH is not set
@@ -470,6 +496,7 @@ CONFIG_IDEDMA_AUTO=y
#
# CONFIG_RAID_ATTRS is not set
CONFIG_SCSI=y
+CONFIG_SCSI_NETLINK=y
# CONFIG_SCSI_PROC_FS is not set
#
@@ -478,8 +505,9 @@ CONFIG_SCSI=y
CONFIG_BLK_DEV_SD=y
# CONFIG_CHR_DEV_ST is not set
# CONFIG_CHR_DEV_OSST is not set
-# CONFIG_BLK_DEV_SR is not set
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_BLK_DEV_SR=y
+# CONFIG_BLK_DEV_SR_VENDOR is not set
+CONFIG_CHR_DEV_SG=y
# CONFIG_CHR_DEV_SCH is not set
#
@@ -490,12 +518,13 @@ CONFIG_SCSI_CONSTANTS=y
# CONFIG_SCSI_LOGGING is not set
#
-# SCSI Transport Attributes
+# SCSI Transports
#
CONFIG_SCSI_SPI_ATTRS=y
CONFIG_SCSI_FC_ATTRS=y
# CONFIG_SCSI_ISCSI_ATTRS is not set
-# CONFIG_SCSI_SAS_ATTRS is not set
+CONFIG_SCSI_SAS_ATTRS=y
+# CONFIG_SCSI_SAS_LIBSAS is not set
#
# SCSI low-level drivers
@@ -514,28 +543,14 @@ CONFIG_AIC79XX_RESET_DELAY_MS=4000
# CONFIG_AIC79XX_DEBUG_ENABLE is not set
CONFIG_AIC79XX_DEBUG_MASK=0
# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set
+# CONFIG_SCSI_AIC94XX is not set
+# CONFIG_SCSI_ARCMSR is not set
CONFIG_MEGARAID_NEWGEN=y
CONFIG_MEGARAID_MM=y
CONFIG_MEGARAID_MAILBOX=y
# CONFIG_MEGARAID_LEGACY is not set
CONFIG_MEGARAID_SAS=y
-CONFIG_SCSI_SATA=y
-CONFIG_SCSI_SATA_AHCI=y
-# CONFIG_SCSI_SATA_SVW is not set
-CONFIG_SCSI_ATA_PIIX=y
-# CONFIG_SCSI_SATA_MV is not set
-CONFIG_SCSI_SATA_NV=y
-# CONFIG_SCSI_PDC_ADMA is not set
-# CONFIG_SCSI_SATA_QSTOR is not set
-# CONFIG_SCSI_SATA_PROMISE is not set
-# CONFIG_SCSI_SATA_SX4 is not set
-CONFIG_SCSI_SATA_SIL=y
-# CONFIG_SCSI_SATA_SIL24 is not set
-# CONFIG_SCSI_SATA_SIS is not set
-# CONFIG_SCSI_SATA_ULI is not set
-CONFIG_SCSI_SATA_VIA=y
-# CONFIG_SCSI_SATA_VITESSE is not set
-CONFIG_SCSI_SATA_INTEL_COMBINED=y
+# CONFIG_SCSI_HPTIOP is not set
# CONFIG_SCSI_BUSLOGIC is not set
# CONFIG_SCSI_DMX3191D is not set
# CONFIG_SCSI_EATA is not set
@@ -544,6 +559,7 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y
# CONFIG_SCSI_IPS is not set
# CONFIG_SCSI_INITIO is not set
# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_STEX is not set
# CONFIG_SCSI_SYM53C8XX_2 is not set
# CONFIG_SCSI_IPR is not set
# CONFIG_SCSI_QLOGIC_1280 is not set
@@ -554,6 +570,62 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y
# CONFIG_SCSI_DEBUG is not set
#
+# Serial ATA (prod) and Parallel ATA (experimental) drivers
+#
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_SATA_SVW=y
+CONFIG_ATA_PIIX=y
+# CONFIG_SATA_MV is not set
+CONFIG_SATA_NV=y
+# CONFIG_PDC_ADMA is not set
+# CONFIG_SATA_QSTOR is not set
+# CONFIG_SATA_PROMISE is not set
+# CONFIG_SATA_SX4 is not set
+CONFIG_SATA_SIL=y
+# CONFIG_SATA_SIL24 is not set
+# CONFIG_SATA_SIS is not set
+# CONFIG_SATA_ULI is not set
+CONFIG_SATA_VIA=y
+# CONFIG_SATA_VITESSE is not set
+CONFIG_SATA_INTEL_COMBINED=y
+# CONFIG_PATA_ALI is not set
+# CONFIG_PATA_AMD is not set
+# CONFIG_PATA_ARTOP is not set
+# CONFIG_PATA_ATIIXP is not set
+# CONFIG_PATA_CMD64X is not set
+# CONFIG_PATA_CS5520 is not set
+# CONFIG_PATA_CS5530 is not set
+# CONFIG_PATA_CYPRESS is not set
+# CONFIG_PATA_EFAR is not set
+# CONFIG_ATA_GENERIC is not set
+# CONFIG_PATA_HPT366 is not set
+# CONFIG_PATA_HPT37X is not set
+# CONFIG_PATA_HPT3X2N is not set
+# CONFIG_PATA_HPT3X3 is not set
+# CONFIG_PATA_IT821X is not set
+# CONFIG_PATA_JMICRON is not set
+# CONFIG_PATA_LEGACY is not set
+# CONFIG_PATA_TRIFLEX is not set
+# CONFIG_PATA_MPIIX is not set
+# CONFIG_PATA_OLDPIIX is not set
+# CONFIG_PATA_NETCELL is not set
+# CONFIG_PATA_NS87410 is not set
+# CONFIG_PATA_OPTI is not set
+# CONFIG_PATA_OPTIDMA is not set
+# CONFIG_PATA_PDC_OLD is not set
+# CONFIG_PATA_QDI is not set
+# CONFIG_PATA_RADISYS is not set
+# CONFIG_PATA_RZ1000 is not set
+# CONFIG_PATA_SC1200 is not set
+# CONFIG_PATA_SERVERWORKS is not set
+# CONFIG_PATA_PDC2027X is not set
+# CONFIG_PATA_SIL680 is not set
+# CONFIG_PATA_SIS is not set
+# CONFIG_PATA_VIA is not set
+# CONFIG_PATA_WINBOND is not set
+
+#
# Multi-device support (RAID and LVM)
#
CONFIG_MD=y
@@ -571,7 +643,7 @@ CONFIG_BLK_DEV_DM=y
CONFIG_FUSION=y
CONFIG_FUSION_SPI=y
# CONFIG_FUSION_FC is not set
-# CONFIG_FUSION_SAS is not set
+CONFIG_FUSION_SAS=y
CONFIG_FUSION_MAX_SGE=128
# CONFIG_FUSION_CTL is not set
@@ -591,10 +663,7 @@ CONFIG_IEEE1394=y
#
# Device Drivers
#
-
-#
-# Texas Instruments PCILynx requires I2C
-#
+# CONFIG_IEEE1394_PCILYNX is not set
CONFIG_IEEE1394_OHCI1394=y
#
@@ -645,14 +714,24 @@ CONFIG_VORTEX=y
#
# Tulip family network device support
#
-# CONFIG_NET_TULIP is not set
+CONFIG_NET_TULIP=y
+# CONFIG_DE2104X is not set
+CONFIG_TULIP=y
+# CONFIG_TULIP_MWI is not set
+# CONFIG_TULIP_MMIO is not set
+# CONFIG_TULIP_NAPI is not set
+# CONFIG_DE4X5 is not set
+# CONFIG_WINBOND_840 is not set
+# CONFIG_DM9102 is not set
+# CONFIG_ULI526X is not set
# CONFIG_HP100 is not set
CONFIG_NET_PCI=y
# CONFIG_PCNET32 is not set
# CONFIG_AMD8111_ETH is not set
# CONFIG_ADAPTEC_STARFIRE is not set
-# CONFIG_B44 is not set
+CONFIG_B44=y
CONFIG_FORCEDETH=y
+# CONFIG_FORCEDETH_NAPI is not set
# CONFIG_DGRS is not set
# CONFIG_EEPRO100 is not set
CONFIG_E100=y
@@ -688,7 +767,8 @@ CONFIG_E1000=y
# CONFIG_SK98LIN is not set
# CONFIG_VIA_VELOCITY is not set
CONFIG_TIGON3=y
-# CONFIG_BNX2 is not set
+CONFIG_BNX2=y
+# CONFIG_QLA3XXX is not set
#
# Ethernet (10000 Mbit)
@@ -697,6 +777,7 @@ CONFIG_TIGON3=y
# CONFIG_IXGB is not set
CONFIG_S2IO=m
# CONFIG_S2IO_NAPI is not set
+# CONFIG_MYRI10GE is not set
#
# Token Ring devices
@@ -786,6 +867,7 @@ CONFIG_SERIO_LIBPS2=y
CONFIG_VT=y
CONFIG_VT_CONSOLE=y
CONFIG_HW_CONSOLE=y
+# CONFIG_VT_HW_CONSOLE_BINDING is not set
# CONFIG_SERIAL_NONSTANDARD is not set
#
@@ -816,45 +898,11 @@ CONFIG_LEGACY_PTY_COUNT=256
#
# Watchdog Cards
#
-CONFIG_WATCHDOG=y
-# CONFIG_WATCHDOG_NOWAYOUT is not set
-
-#
-# Watchdog Device Drivers
-#
-CONFIG_SOFT_WATCHDOG=y
-# CONFIG_ACQUIRE_WDT is not set
-# CONFIG_ADVANTECH_WDT is not set
-# CONFIG_ALIM1535_WDT is not set
-# CONFIG_ALIM7101_WDT is not set
-# CONFIG_SC520_WDT is not set
-# CONFIG_EUROTECH_WDT is not set
-# CONFIG_IB700_WDT is not set
-# CONFIG_IBMASR is not set
-# CONFIG_WAFER_WDT is not set
-# CONFIG_I6300ESB_WDT is not set
-# CONFIG_I8XX_TCO is not set
-# CONFIG_SC1200_WDT is not set
-# CONFIG_60XX_WDT is not set
-# CONFIG_SBC8360_WDT is not set
-# CONFIG_CPU5_WDT is not set
-# CONFIG_W83627HF_WDT is not set
-# CONFIG_W83877F_WDT is not set
-# CONFIG_W83977F_WDT is not set
-# CONFIG_MACHZ_WDT is not set
-# CONFIG_SBC_EPX_C3_WATCHDOG is not set
-
-#
-# PCI-based Watchdog Cards
-#
-# CONFIG_PCIPCWATCHDOG is not set
-# CONFIG_WDTPCI is not set
-
-#
-# USB-based Watchdog Cards
-#
-# CONFIG_USBPCWATCHDOG is not set
+# CONFIG_WATCHDOG is not set
CONFIG_HW_RANDOM=y
+CONFIG_HW_RANDOM_INTEL=y
+CONFIG_HW_RANDOM_AMD=y
+# CONFIG_HW_RANDOM_GEODE is not set
# CONFIG_NVRAM is not set
CONFIG_RTC=y
# CONFIG_DTLK is not set
@@ -871,6 +919,7 @@ CONFIG_AGP_INTEL=y
# CONFIG_AGP_VIA is not set
# CONFIG_DRM is not set
# CONFIG_MWAVE is not set
+# CONFIG_PC8736x_GPIO is not set
CONFIG_RAW_DRIVER=y
CONFIG_MAX_RAW_DEVS=256
CONFIG_HPET=y
@@ -887,7 +936,56 @@ CONFIG_HPET_MMAP=y
#
# I2C support
#
-# CONFIG_I2C is not set
+CONFIG_I2C=m
+CONFIG_I2C_CHARDEV=m
+
+#
+# I2C Algorithms
+#
+# CONFIG_I2C_ALGOBIT is not set
+# CONFIG_I2C_ALGOPCF is not set
+# CONFIG_I2C_ALGOPCA is not set
+
+#
+# I2C Hardware Bus support
+#
+# CONFIG_I2C_ALI1535 is not set
+# CONFIG_I2C_ALI1563 is not set
+# CONFIG_I2C_ALI15X3 is not set
+# CONFIG_I2C_AMD756 is not set
+# CONFIG_I2C_AMD8111 is not set
+# CONFIG_I2C_I801 is not set
+# CONFIG_I2C_I810 is not set
+# CONFIG_I2C_PIIX4 is not set
+CONFIG_I2C_ISA=m
+# CONFIG_I2C_NFORCE2 is not set
+# CONFIG_I2C_OCORES is not set
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_PROSAVAGE is not set
+# CONFIG_I2C_SAVAGE4 is not set
+# CONFIG_I2C_SIS5595 is not set
+# CONFIG_I2C_SIS630 is not set
+# CONFIG_I2C_SIS96X is not set
+# CONFIG_I2C_STUB is not set
+# CONFIG_I2C_VIA is not set
+# CONFIG_I2C_VIAPRO is not set
+# CONFIG_I2C_VOODOO3 is not set
+# CONFIG_I2C_PCA_ISA is not set
+
+#
+# Miscellaneous I2C Chip support
+#
+# CONFIG_SENSORS_DS1337 is not set
+# CONFIG_SENSORS_DS1374 is not set
+# CONFIG_SENSORS_EEPROM is not set
+# CONFIG_SENSORS_PCF8574 is not set
+# CONFIG_SENSORS_PCA9539 is not set
+# CONFIG_SENSORS_PCF8591 is not set
+# CONFIG_SENSORS_MAX6875 is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
#
# SPI support
@@ -898,14 +996,51 @@ CONFIG_HPET_MMAP=y
#
# Dallas's 1-wire bus
#
-# CONFIG_W1 is not set
#
# Hardware Monitoring support
#
CONFIG_HWMON=y
# CONFIG_HWMON_VID is not set
+# CONFIG_SENSORS_ABITUGURU is not set
+# CONFIG_SENSORS_ADM1021 is not set
+# CONFIG_SENSORS_ADM1025 is not set
+# CONFIG_SENSORS_ADM1026 is not set
+# CONFIG_SENSORS_ADM1031 is not set
+# CONFIG_SENSORS_ADM9240 is not set
+# CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATXP1 is not set
+# CONFIG_SENSORS_DS1621 is not set
# CONFIG_SENSORS_F71805F is not set
+# CONFIG_SENSORS_FSCHER is not set
+# CONFIG_SENSORS_FSCPOS is not set
+# CONFIG_SENSORS_GL518SM is not set
+# CONFIG_SENSORS_GL520SM is not set
+# CONFIG_SENSORS_IT87 is not set
+# CONFIG_SENSORS_LM63 is not set
+# CONFIG_SENSORS_LM75 is not set
+# CONFIG_SENSORS_LM77 is not set
+# CONFIG_SENSORS_LM78 is not set
+# CONFIG_SENSORS_LM80 is not set
+# CONFIG_SENSORS_LM83 is not set
+# CONFIG_SENSORS_LM85 is not set
+# CONFIG_SENSORS_LM87 is not set
+# CONFIG_SENSORS_LM90 is not set
+# CONFIG_SENSORS_LM92 is not set
+# CONFIG_SENSORS_MAX1619 is not set
+# CONFIG_SENSORS_PC87360 is not set
+# CONFIG_SENSORS_SIS5595 is not set
+# CONFIG_SENSORS_SMSC47M1 is not set
+# CONFIG_SENSORS_SMSC47M192 is not set
+CONFIG_SENSORS_SMSC47B397=m
+# CONFIG_SENSORS_VIA686A is not set
+# CONFIG_SENSORS_VT8231 is not set
+# CONFIG_SENSORS_W83781D is not set
+# CONFIG_SENSORS_W83791D is not set
+# CONFIG_SENSORS_W83792D is not set
+# CONFIG_SENSORS_W83L785TS is not set
+# CONFIG_SENSORS_W83627HF is not set
+# CONFIG_SENSORS_W83627EHF is not set
# CONFIG_SENSORS_HDAPS is not set
# CONFIG_HWMON_DEBUG_CHIP is not set
@@ -918,6 +1053,7 @@ CONFIG_HWMON=y
# Multimedia devices
#
# CONFIG_VIDEO_DEV is not set
+CONFIG_VIDEO_V4L2=y
#
# Digital Video Broadcasting Devices
@@ -928,8 +1064,8 @@ CONFIG_HWMON=y
#
# Graphics support
#
+# CONFIG_FIRMWARE_EDID is not set
# CONFIG_FB is not set
-CONFIG_VIDEO_SELECT=y
#
# Console display driver support
@@ -937,7 +1073,9 @@ CONFIG_VIDEO_SELECT=y
CONFIG_VGA_CONSOLE=y
CONFIG_VGACON_SOFT_SCROLLBACK=y
CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256
+CONFIG_VIDEO_SELECT=y
CONFIG_DUMMY_CONSOLE=y
+# CONFIG_BACKLIGHT_LCD_SUPPORT is not set
#
# Sound
@@ -953,28 +1091,17 @@ CONFIG_SOUND=y
# Open Sound System
#
CONFIG_SOUND_PRIME=y
-CONFIG_OBSOLETE_OSS_DRIVER=y
+CONFIG_OSS_OBSOLETE_DRIVER=y
# CONFIG_SOUND_BT878 is not set
-# CONFIG_SOUND_CMPCI is not set
# CONFIG_SOUND_EMU10K1 is not set
# CONFIG_SOUND_FUSION is not set
-# CONFIG_SOUND_CS4281 is not set
-# CONFIG_SOUND_ES1370 is not set
# CONFIG_SOUND_ES1371 is not set
-# CONFIG_SOUND_ESSSOLO1 is not set
-# CONFIG_SOUND_MAESTRO is not set
-# CONFIG_SOUND_MAESTRO3 is not set
CONFIG_SOUND_ICH=y
-# CONFIG_SOUND_SONICVIBES is not set
# CONFIG_SOUND_TRIDENT is not set
# CONFIG_SOUND_MSNDCLAS is not set
# CONFIG_SOUND_MSNDPIN is not set
# CONFIG_SOUND_VIA82CXXX is not set
# CONFIG_SOUND_OSS is not set
-# CONFIG_SOUND_ALI5455 is not set
-# CONFIG_SOUND_FORTE is not set
-# CONFIG_SOUND_RME96XX is not set
-# CONFIG_SOUND_AD1980 is not set
#
# USB support
@@ -1000,6 +1127,7 @@ CONFIG_USB_DEVICEFS=y
CONFIG_USB_EHCI_HCD=y
# CONFIG_USB_EHCI_SPLIT_ISO is not set
# CONFIG_USB_EHCI_ROOT_HUB_TT is not set
+# CONFIG_USB_EHCI_TT_NEWSCHED is not set
# CONFIG_USB_ISP116X_HCD is not set
CONFIG_USB_OHCI_HCD=y
# CONFIG_USB_OHCI_BIG_ENDIAN is not set
@@ -1089,10 +1217,12 @@ CONFIG_USB_MON=y
# CONFIG_USB_LEGOTOWER is not set
# CONFIG_USB_LCD is not set
# CONFIG_USB_LED is not set
+# CONFIG_USB_CYPRESS_CY7C63 is not set
# CONFIG_USB_CYTHERM is not set
# CONFIG_USB_PHIDGETKIT is not set
# CONFIG_USB_PHIDGETSERVO is not set
# CONFIG_USB_IDMOUSE is not set
+# CONFIG_USB_APPLEDISPLAY is not set
# CONFIG_USB_SISUSBVGA is not set
# CONFIG_USB_LD is not set
# CONFIG_USB_TEST is not set
@@ -1128,7 +1258,6 @@ CONFIG_USB_MON=y
# InfiniBand support
#
# CONFIG_INFINIBAND is not set
-# CONFIG_IPATH_CORE is not set
#
# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
@@ -1141,6 +1270,19 @@ CONFIG_USB_MON=y
# CONFIG_RTC_CLASS is not set
#
+# DMA Engine support
+#
+# CONFIG_DMA_ENGINE is not set
+
+#
+# DMA Clients
+#
+
+#
+# DMA Devices
+#
+
+#
# Firmware Drivers
#
# CONFIG_EDD is not set
@@ -1175,9 +1317,10 @@ CONFIG_FS_POSIX_ACL=y
# CONFIG_MINIX_FS is not set
# CONFIG_ROMFS_FS is not set
CONFIG_INOTIFY=y
+CONFIG_INOTIFY_USER=y
# CONFIG_QUOTA is not set
CONFIG_DNOTIFY=y
-CONFIG_AUTOFS_FS=y
+# CONFIG_AUTOFS_FS is not set
CONFIG_AUTOFS4_FS=y
# CONFIG_FUSE_FS is not set
@@ -1316,26 +1459,38 @@ CONFIG_KPROBES=y
#
# Kernel hacking
#
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
# CONFIG_PRINTK_TIME is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
CONFIG_MAGIC_SYSRQ=y
+CONFIG_UNUSED_SYMBOLS=y
CONFIG_DEBUG_KERNEL=y
CONFIG_LOG_BUF_SHIFT=18
CONFIG_DETECT_SOFTLOCKUP=y
# CONFIG_SCHEDSTATS is not set
# CONFIG_DEBUG_SLAB is not set
-# CONFIG_DEBUG_MUTEXES is not set
+# CONFIG_DEBUG_RT_MUTEXES is not set
+# CONFIG_RT_MUTEX_TESTER is not set
# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_MUTEXES is not set
+# CONFIG_DEBUG_RWSEMS is not set
+# CONFIG_DEBUG_LOCK_ALLOC is not set
+# CONFIG_PROVE_LOCKING is not set
# CONFIG_DEBUG_SPINLOCK_SLEEP is not set
+# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
# CONFIG_DEBUG_KOBJECT is not set
# CONFIG_DEBUG_INFO is not set
CONFIG_DEBUG_FS=y
# CONFIG_DEBUG_VM is not set
# CONFIG_FRAME_POINTER is not set
-# CONFIG_UNWIND_INFO is not set
+CONFIG_UNWIND_INFO=y
+CONFIG_STACK_UNWIND=y
# CONFIG_FORCED_INLINING is not set
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_DEBUG_RODATA is not set
# CONFIG_IOMMU_DEBUG is not set
+CONFIG_DEBUG_STACKOVERFLOW=y
+# CONFIG_DEBUG_STACK_USAGE is not set
#
# Security options
@@ -1349,13 +1504,11 @@ CONFIG_DEBUG_FS=y
# CONFIG_CRYPTO is not set
#
-# Hardware crypto devices
-#
-
-#
# Library routines
#
# CONFIG_CRC_CCITT is not set
# CONFIG_CRC16 is not set
CONFIG_CRC32=y
# CONFIG_LIBCRC32C is not set
+CONFIG_ZLIB_INFLATE=y
+CONFIG_PLIST=y
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile
index e9263b4975e0..cdae36435e21 100644
--- a/arch/x86_64/ia32/Makefile
+++ b/arch/x86_64/ia32/Makefile
@@ -11,6 +11,9 @@ obj-$(CONFIG_IA32_EMULATION) += $(sysv-y)
obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
+audit-class-$(CONFIG_AUDIT) := audit.o
+obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y)
+
$(obj)/syscall32_syscall.o: \
$(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so)
@@ -20,6 +23,7 @@ targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so)
# The DSO images are built using a special linker script
quiet_cmd_syscall = SYSCALL $@
cmd_syscall = $(CC) -m32 -nostdlib -shared -s \
+ $(call ld-option, -Wl$(comma)--hash-style=sysv) \
-Wl,-soname=linux-gate.so.1 -o $@ \
-Wl,-T,$(filter-out FORCE,$^)
diff --git a/arch/x86_64/ia32/audit.c b/arch/x86_64/ia32/audit.c
new file mode 100644
index 000000000000..92d7d0c8d93f
--- /dev/null
+++ b/arch/x86_64/ia32/audit.c
@@ -0,0 +1,37 @@
+#include <asm-i386/unistd.h>
+
+unsigned ia32_dir_class[] = {
+#include <asm-generic/audit_dir_write.h>
+~0U
+};
+
+unsigned ia32_chattr_class[] = {
+#include <asm-generic/audit_change_attr.h>
+~0U
+};
+
+unsigned ia32_write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
+unsigned ia32_read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+int ia32_classify_syscall(unsigned syscall)
+{
+ switch(syscall) {
+ case __NR_open:
+ return 2;
+ case __NR_openat:
+ return 3;
+ case __NR_socketcall:
+ return 4;
+ case __NR_execve:
+ return 5;
+ default:
+ return 1;
+ }
+}
diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c
index 1c23095f1813..2c8209a3605a 100644
--- a/arch/x86_64/ia32/fpu32.c
+++ b/arch/x86_64/ia32/fpu32.c
@@ -2,7 +2,6 @@
* Copyright 2002 Andi Kleen, SuSE Labs.
* FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.
* This is used for ptrace, signals and coredumps in 32bit emulation.
- * $Id: fpu32.c,v 1.1 2002/03/21 14:16:32 ak Exp $
*/
#include <linux/sched.h>
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 3bf58af98936..396d3c100011 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -333,7 +333,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
return error;
}
- error = bprm->file->f_op->read(bprm->file, (char *)text_addr,
+ error = bprm->file->f_op->read(bprm->file,
+ (char __user *)text_addr,
ex.a_text+ex.a_data, &pos);
if ((signed long)error < 0) {
send_sig(SIGKILL, current, 0);
@@ -366,7 +367,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
down_write(&current->mm->mmap_sem);
do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
up_write(&current->mm->mmap_sem);
- bprm->file->f_op->read(bprm->file,(char *)N_TXTADDR(ex),
+ bprm->file->f_op->read(bprm->file,
+ (char __user *)N_TXTADDR(ex),
ex.a_text+ex.a_data, &pos);
flush_icache_range((unsigned long) N_TXTADDR(ex),
(unsigned long) N_TXTADDR(ex) +
@@ -477,7 +479,7 @@ static int load_aout_library(struct file *file)
do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
up_write(&current->mm->mmap_sem);
- file->f_op->read(file, (char *)start_addr,
+ file->f_op->read(file, (char __user *)start_addr,
ex.a_text + ex.a_data, &pos);
flush_icache_range((unsigned long) start_addr,
(unsigned long) start_addr + ex.a_text + ex.a_data);
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index 926c4743d13b..2fd5a67fd435 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -73,39 +73,44 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG];
* Dumping its extra ELF program headers includes all the other information
* a debugger needs to easily find how the vsyscall DSO was being used.
*/
-#define ELF_CORE_EXTRA_PHDRS (VSYSCALL32_EHDR->e_phnum)
+#define ELF_CORE_EXTRA_PHDRS (find_vma(current->mm, VSYSCALL32_BASE) ? \
+ (VSYSCALL32_EHDR->e_phnum) : 0)
#define ELF_CORE_WRITE_EXTRA_PHDRS \
do { \
- const struct elf32_phdr *const vsyscall_phdrs = \
- (const struct elf32_phdr *) (VSYSCALL32_BASE \
- + VSYSCALL32_EHDR->e_phoff); \
- int i; \
- Elf32_Off ofs = 0; \
- for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \
- struct elf32_phdr phdr = vsyscall_phdrs[i]; \
- if (phdr.p_type == PT_LOAD) { \
- BUG_ON(ofs != 0); \
- ofs = phdr.p_offset = offset; \
- phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \
- phdr.p_filesz = phdr.p_memsz; \
- offset += phdr.p_filesz; \
+ if (find_vma(current->mm, VSYSCALL32_BASE)) { \
+ const struct elf32_phdr *const vsyscall_phdrs = \
+ (const struct elf32_phdr *) (VSYSCALL32_BASE \
+ + VSYSCALL32_EHDR->e_phoff);\
+ int i; \
+ Elf32_Off ofs = 0; \
+ for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \
+ struct elf32_phdr phdr = vsyscall_phdrs[i]; \
+ if (phdr.p_type == PT_LOAD) { \
+ BUG_ON(ofs != 0); \
+ ofs = phdr.p_offset = offset; \
+ phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \
+ phdr.p_filesz = phdr.p_memsz; \
+ offset += phdr.p_filesz; \
+ } \
+ else \
+ phdr.p_offset += ofs; \
+ phdr.p_paddr = 0; /* match other core phdrs */ \
+ DUMP_WRITE(&phdr, sizeof(phdr)); \
} \
- else \
- phdr.p_offset += ofs; \
- phdr.p_paddr = 0; /* match other core phdrs */ \
- DUMP_WRITE(&phdr, sizeof(phdr)); \
} \
} while (0)
#define ELF_CORE_WRITE_EXTRA_DATA \
do { \
- const struct elf32_phdr *const vsyscall_phdrs = \
- (const struct elf32_phdr *) (VSYSCALL32_BASE \
- + VSYSCALL32_EHDR->e_phoff); \
- int i; \
- for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \
- if (vsyscall_phdrs[i].p_type == PT_LOAD) \
- DUMP_WRITE((void *) (u64) vsyscall_phdrs[i].p_vaddr, \
- PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \
+ if (find_vma(current->mm, VSYSCALL32_BASE)) { \
+ const struct elf32_phdr *const vsyscall_phdrs = \
+ (const struct elf32_phdr *) (VSYSCALL32_BASE \
+ + VSYSCALL32_EHDR->e_phoff); \
+ int i; \
+ for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \
+ if (vsyscall_phdrs[i].p_type == PT_LOAD) \
+ DUMP_WRITE((void *) (u64) vsyscall_phdrs[i].p_vaddr,\
+ PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \
+ } \
} \
} while (0)
@@ -182,7 +187,7 @@ struct elf_prpsinfo
#define user user32
#define __ASM_X86_64_ELF_H 1
-#define elf_read_implies_exec(ex, have_pt_gnu_stack) (!(have_pt_gnu_stack))
+#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X)
//#include <asm/ia32.h>
#include <linux/elf.h>
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index e0a92439f634..a6ba9951e86c 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -6,8 +6,6 @@
* 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen
- *
- * $Id: ia32_signal.c,v 1.22 2002/07/29 10:34:03 ak Exp $
*/
#include <linux/sched.h>
@@ -115,25 +113,19 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
}
asmlinkage long
-sys32_sigsuspend(int history0, int history1, old_sigset_t mask,
- struct pt_regs *regs)
+sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
{
- sigset_t saveset;
-
mask &= _BLOCKABLE;
spin_lock_irq(&current->sighand->siglock);
- saveset = current->blocked;
+ current->saved_sigmask = current->blocked;
siginitset(&current->blocked, mask);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
- regs->rax = -EINTR;
- while (1) {
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- if (do_signal(regs, &saveset))
- return -EINTR;
- }
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ set_thread_flag(TIF_RESTORE_SIGMASK);
+ return -ERESTARTNOHAND;
}
asmlinkage long
@@ -439,15 +431,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto give_sigsegv;
- {
- struct exec_domain *ed = current_thread_info()->exec_domain;
- err |= __put_user((ed
- && ed->signal_invmap
- && sig < 32
- ? ed->signal_invmap[sig]
- : sig),
- &frame->sig);
- }
+ err |= __put_user(sig, &frame->sig);
if (err)
goto give_sigsegv;
@@ -494,6 +478,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
regs->rsp = (unsigned long) frame;
regs->rip = (unsigned long) ka->sa.sa_handler;
+ /* Make -mregparm=3 work */
+ regs->rax = sig;
+ regs->rdx = 0;
+ regs->rcx = 0;
+
asm volatile("movl %0,%%ds" :: "r" (__USER32_DS));
asm volatile("movl %0,%%es" :: "r" (__USER32_DS));
@@ -501,20 +490,20 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
regs->ss = __USER32_DS;
set_fs(USER_DS);
- regs->eflags &= ~TF_MASK;
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
+ regs->eflags &= ~TF_MASK;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
#if DEBUG_SIG
printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
- return 1;
+ return 0;
give_sigsegv:
force_sigsegv(sig, current);
- return 0;
+ return -EFAULT;
}
int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
@@ -597,18 +586,18 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->ss = __USER32_DS;
set_fs(USER_DS);
- regs->eflags &= ~TF_MASK;
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
+ regs->eflags &= ~TF_MASK;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
#if DEBUG_SIG
printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
- return 1;
+ return 0;
give_sigsegv:
force_sigsegv(sig, current);
- return 0;
+ return -EFAULT;
}
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index 4ec594ab1a98..b4aa875e175b 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -13,6 +13,7 @@
#include <asm/thread_info.h>
#include <asm/segment.h>
#include <asm/vsyscall32.h>
+#include <asm/irqflags.h>
#include <linux/linkage.h>
#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
@@ -70,11 +71,16 @@
*/
ENTRY(ia32_sysenter_target)
CFI_STARTPROC32 simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,0
CFI_REGISTER rsp,rbp
swapgs
movq %gs:pda_kernelstack, %rsp
addq $(PDA_STACKOFFSET),%rsp
+ /*
+ * No need to follow this irqs on/off section: the syscall
+ * disabled irqs and here we enable it straight after entry:
+ */
sti
movl %ebp,%ebp /* zero extension */
pushq $__USER32_DS
@@ -98,7 +104,7 @@ ENTRY(ia32_sysenter_target)
pushq %rax
CFI_ADJUST_CFA_OFFSET 8
cld
- SAVE_ARGS 0,0,1
+ SAVE_ARGS 0,0,0
/* no need to do an access_ok check here because rbp has been
32bit zero extended */
1: movl (%rbp),%r9d
@@ -118,6 +124,7 @@ sysenter_do_call:
movq %rax,RAX-ARGOFFSET(%rsp)
GET_THREAD_INFO(%r10)
cli
+ TRACE_IRQS_OFF
testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
jnz int_ret_from_sys_call
andl $~TS_COMPAT,threadinfo_status(%r10)
@@ -132,6 +139,7 @@ sysenter_do_call:
CFI_REGISTER rsp,rcx
movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */
CFI_REGISTER rip,rdx
+ TRACE_IRQS_ON
swapgs
sti /* sti only takes effect after the next instruction */
/* sysexit */
@@ -155,6 +163,7 @@ sysenter_tracesys:
.previous
jmp sysenter_do_call
CFI_ENDPROC
+ENDPROC(ia32_sysenter_target)
/*
* 32bit SYSCALL instruction entry.
@@ -178,13 +187,18 @@ sysenter_tracesys:
*/
ENTRY(ia32_cstar_target)
CFI_STARTPROC32 simple
- CFI_DEF_CFA rsp,0
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,PDA_STACKOFFSET
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
swapgs
movl %esp,%r8d
CFI_REGISTER rsp,r8
movq %gs:pda_kernelstack,%rsp
+ /*
+ * No need to follow this irqs on/off section: the syscall
+ * disabled irqs and here we enable it straight after entry:
+ */
sti
SAVE_ARGS 8,1,1
movl %eax,%eax /* zero extension */
@@ -219,6 +233,7 @@ cstar_do_call:
movq %rax,RAX-ARGOFFSET(%rsp)
GET_THREAD_INFO(%r10)
cli
+ TRACE_IRQS_OFF
testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10)
jnz int_ret_from_sys_call
andl $~TS_COMPAT,threadinfo_status(%r10)
@@ -227,6 +242,7 @@ cstar_do_call:
CFI_REGISTER rip,rcx
movl EFLAGS-ARGOFFSET(%rsp),%r11d
/*CFI_REGISTER rflags,r11*/
+ TRACE_IRQS_ON
movl RSP-ARGOFFSET(%rsp),%esp
CFI_RESTORE rsp
swapgs
@@ -249,6 +265,7 @@ cstar_tracesys:
.quad 1b,ia32_badarg
.previous
jmp cstar_do_call
+END(ia32_cstar_target)
ia32_badarg:
movq $-EFAULT,%rax
@@ -278,13 +295,18 @@ ia32_badarg:
ENTRY(ia32_syscall)
CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8-RIP
/*CFI_REL_OFFSET ss,SS-RIP*/
CFI_REL_OFFSET rsp,RSP-RIP
/*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
/*CFI_REL_OFFSET cs,CS-RIP*/
CFI_REL_OFFSET rip,RIP-RIP
- swapgs
+ swapgs
+ /*
+ * No need to follow this irqs on/off section: the syscall
+ * disabled irqs and here we enable it straight after entry:
+ */
sti
movl %eax,%eax
pushq %rax
@@ -314,16 +336,13 @@ ia32_tracesys:
LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
jmp ia32_do_syscall
+END(ia32_syscall)
ia32_badsys:
movq $0,ORIG_RAX-ARGOFFSET(%rsp)
movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
jmp int_ret_from_sys_call
-ni_syscall:
- movq %rax,%rdi
- jmp sys32_ni_syscall
-
quiet_ni_syscall:
movq $-ENOSYS,%rax
ret
@@ -354,6 +373,7 @@ ENTRY(ia32_ptregs_common)
popq %r11
CFI_ENDPROC
CFI_STARTPROC32 simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8-ARGOFFSET
CFI_REL_OFFSET rax,RAX-ARGOFFSET
CFI_REL_OFFSET rcx,RCX-ARGOFFSET
@@ -370,10 +390,10 @@ ENTRY(ia32_ptregs_common)
RESTORE_REST
jmp ia32_sysret /* misbalances the return cache */
CFI_ENDPROC
+END(ia32_ptregs_common)
.section .rodata,"a"
.align 8
- .globl ia32_sys_call_table
ia32_sys_call_table:
.quad sys_restart_syscall
.quad sys_exit
@@ -687,8 +707,8 @@ ia32_sys_call_table:
.quad sys_readlinkat /* 305 */
.quad sys_fchmodat
.quad sys_faccessat
- .quad quiet_ni_syscall /* pselect6 for now */
- .quad quiet_ni_syscall /* ppoll for now */
+ .quad compat_sys_pselect6
+ .quad compat_sys_ppoll
.quad sys_unshare /* 310 */
.quad compat_sys_set_robust_list
.quad compat_sys_get_robust_list
@@ -697,4 +717,5 @@ ia32_sys_call_table:
.quad sys_tee
.quad compat_sys_vmsplice
.quad compat_sys_move_pages
+ .quad sys_getcpu
ia32_syscall_end:
diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c
index 23a4515a73b4..d18198ed636b 100644
--- a/arch/x86_64/ia32/ptrace32.c
+++ b/arch/x86_64/ia32/ptrace32.c
@@ -7,8 +7,6 @@
*
* This allows to access 64bit processes too; but there is no way to see the extended
* register contents.
- *
- * $Id: ptrace32.c,v 1.16 2003/03/14 16:06:35 ak Exp $
*/
#include <linux/kernel.h>
@@ -27,6 +25,7 @@
#include <asm/debugreg.h>
#include <asm/i387.h>
#include <asm/fpu32.h>
+#include <asm/ia32.h>
/*
* Determines which flags the user has access to [1 = access, 0 = no access].
@@ -118,6 +117,10 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val)
if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1)
return -EIO;
child->thread.debugreg7 = val;
+ if (val)
+ set_tsk_thread_flag(child, TIF_DEBUG);
+ else
+ clear_tsk_thread_flag(child, TIF_DEBUG);
break;
default:
@@ -199,6 +202,31 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
#undef R32
+static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
+{
+ int ret;
+ compat_siginfo_t *si32 = (compat_siginfo_t *)compat_ptr(data);
+ siginfo_t ssi;
+ siginfo_t *si = compat_alloc_user_space(sizeof(siginfo_t));
+ if (request == PTRACE_SETSIGINFO) {
+ memset(&ssi, 0, sizeof(siginfo_t));
+ ret = copy_siginfo_from_user32(&ssi, si32);
+ if (ret)
+ return ret;
+ if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
+ return -EFAULT;
+ }
+ ret = sys_ptrace(request, pid, addr, (unsigned long)si);
+ if (ret)
+ return ret;
+ if (request == PTRACE_GETSIGINFO) {
+ if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
+ return -EFAULT;
+ ret = copy_siginfo_to_user32(si32, &ssi);
+ }
+ return ret;
+}
+
asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
{
struct task_struct *child;
@@ -208,9 +236,19 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
__u32 val;
switch (request) {
- default:
+ case PTRACE_TRACEME:
+ case PTRACE_ATTACH:
+ case PTRACE_KILL:
+ case PTRACE_CONT:
+ case PTRACE_SINGLESTEP:
+ case PTRACE_DETACH:
+ case PTRACE_SYSCALL:
+ case PTRACE_SETOPTIONS:
return sys_ptrace(request, pid, addr, data);
+ default:
+ return -EINVAL;
+
case PTRACE_PEEKTEXT:
case PTRACE_PEEKDATA:
case PTRACE_POKEDATA:
@@ -225,10 +263,11 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
case PTRACE_GETFPXREGS:
case PTRACE_GETEVENTMSG:
break;
- }
- if (request == PTRACE_TRACEME)
- return ptrace_traceme();
+ case PTRACE_SETSIGINFO:
+ case PTRACE_GETSIGINFO:
+ return ptrace32_siginfo(request, pid, addr, data);
+ }
child = ptrace_get_task_struct(pid);
if (IS_ERR(child))
@@ -336,8 +375,10 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
ret = -EIO;
if (!access_ok(VERIFY_READ, u, sizeof(*u)))
break;
- /* no checking to be bug-to-bug compatible with i386 */
- __copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u));
+ /* no checking to be bug-to-bug compatible with i386. */
+ /* but silence warning */
+ if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)))
+ ;
set_stopped_child_used_math(child);
child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
ret = 0;
@@ -349,8 +390,7 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
break;
default:
- ret = -EINVAL;
- break;
+ BUG();
}
out:
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index f182b20858e2..f280d3665f4b 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -20,7 +20,6 @@
* This should be fixed.
*/
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
@@ -61,6 +60,7 @@
#include <linux/highuid.h>
#include <linux/vmalloc.h>
#include <linux/fsnotify.h>
+#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/types.h>
#include <asm/uaccess.h>
@@ -390,7 +390,9 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
}
}
set_fs (KERNEL_DS);
- ret = sys_rt_sigprocmask(how, set ? &s : NULL, oset ? &s : NULL,
+ ret = sys_rt_sigprocmask(how,
+ set ? (sigset_t __user *)&s : NULL,
+ oset ? (sigset_t __user *)&s : NULL,
sigsetsize);
set_fs (old_fs);
if (ret) return ret;
@@ -508,19 +510,6 @@ sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)
return compat_sys_wait4(pid, stat_addr, options, NULL);
}
-int sys32_ni_syscall(int call)
-{
- struct task_struct *me = current;
- static char lastcomm[sizeof(me->comm)];
-
- if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
- printk(KERN_INFO "IA32 syscall %d from %s not implemented\n",
- call, me->comm);
- strncpy(lastcomm, me->comm, sizeof(lastcomm));
- }
- return -ENOSYS;
-}
-
/* 32-bit timeval and related flotsam. */
asmlinkage long
@@ -555,7 +544,7 @@ sys32_sysinfo(struct sysinfo32 __user *info)
int bitcount = 0;
set_fs (KERNEL_DS);
- ret = sys_sysinfo(&s);
+ ret = sys_sysinfo((struct sysinfo __user *)&s);
set_fs (old_fs);
/* Check to see if any memory value is too large for 32-bit and scale
@@ -603,7 +592,7 @@ sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *int
mm_segment_t old_fs = get_fs ();
set_fs (KERNEL_DS);
- ret = sys_sched_rr_get_interval(pid, &t);
+ ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
set_fs (old_fs);
if (put_compat_timespec(&t, interval))
return -EFAULT;
@@ -619,7 +608,7 @@ sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize)
mm_segment_t old_fs = get_fs();
set_fs (KERNEL_DS);
- ret = sys_rt_sigpending(&s, sigsetsize);
+ ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize);
set_fs (old_fs);
if (!ret) {
switch (_NSIG_WORDS) {
@@ -644,7 +633,7 @@ sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo)
if (copy_siginfo_from_user32(&info, uinfo))
return -EFAULT;
set_fs (KERNEL_DS);
- ret = sys_rt_sigqueueinfo(pid, sig, &info);
+ ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info);
set_fs (old_fs);
return ret;
}
@@ -659,7 +648,7 @@ sys32_pause(void)
}
-#ifdef CONFIG_SYSCTL
+#ifdef CONFIG_SYSCTL_SYSCALL
struct sysctl_ia32 {
unsigned int name;
int nlen;
@@ -680,9 +669,6 @@ sys32_sysctl(struct sysctl_ia32 __user *args32)
size_t oldlen;
int __user *namep;
long ret;
- extern int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp,
- void *newval, size_t newlen);
-
if (copy_from_user(&a32, args32, sizeof (a32)))
return -EFAULT;
@@ -706,7 +692,8 @@ sys32_sysctl(struct sysctl_ia32 __user *args32)
set_fs(KERNEL_DS);
lock_kernel();
- ret = do_sysctl(namep, a32.nlen, oldvalp, &oldlen, newvalp, (size_t) a32.newlen);
+ ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
+ newvalp, (size_t) a32.newlen);
unlock_kernel();
set_fs(old_fs);
@@ -757,7 +744,8 @@ sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count)
return -EFAULT;
set_fs(KERNEL_DS);
- ret = sys_sendfile(out_fd, in_fd, offset ? &of : NULL, count);
+ ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL,
+ count);
set_fs(old_fs);
if (offset && put_user(of, offset))
@@ -792,7 +780,7 @@ asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
{
- int error;
+ int err;
if (!name)
return -EFAULT;
@@ -801,27 +789,31 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name)
down_read(&uts_sem);
- error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN);
- __put_user(0,name->sysname+__OLD_UTS_LEN);
- __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN);
- __put_user(0,name->nodename+__OLD_UTS_LEN);
- __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN);
- __put_user(0,name->release+__OLD_UTS_LEN);
- __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN);
- __put_user(0,name->version+__OLD_UTS_LEN);
+ err = __copy_to_user(&name->sysname,&system_utsname.sysname,
+ __OLD_UTS_LEN);
+ err |= __put_user(0,name->sysname+__OLD_UTS_LEN);
+ err |= __copy_to_user(&name->nodename,&system_utsname.nodename,
+ __OLD_UTS_LEN);
+ err |= __put_user(0,name->nodename+__OLD_UTS_LEN);
+ err |= __copy_to_user(&name->release,&system_utsname.release,
+ __OLD_UTS_LEN);
+ err |= __put_user(0,name->release+__OLD_UTS_LEN);
+ err |= __copy_to_user(&name->version,&system_utsname.version,
+ __OLD_UTS_LEN);
+ err |= __put_user(0,name->version+__OLD_UTS_LEN);
{
char *arch = "x86_64";
if (personality(current->personality) == PER_LINUX32)
arch = "i686";
- __copy_to_user(&name->machine,arch,strlen(arch)+1);
+ err |= __copy_to_user(&name->machine,arch,strlen(arch)+1);
}
up_read(&uts_sem);
- error = error ? -EFAULT : 0;
+ err = err ? -EFAULT : 0;
- return error;
+ return err;
}
long sys32_uname(struct old_utsname __user * name)
@@ -845,7 +837,7 @@ long sys32_ustat(unsigned dev, struct ustat32 __user *u32p)
seg = get_fs();
set_fs(KERNEL_DS);
- ret = sys_ustat(dev,&u);
+ ret = sys_ustat(dev, (struct ustat __user *)&u);
set_fs(seg);
if (ret >= 0) {
if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) ||
@@ -916,7 +908,7 @@ long sys32_vm86_warning(void)
struct task_struct *me = current;
static char lastcomm[sizeof(me->comm)];
if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
- printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
+ compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",
me->comm);
strncpy(lastcomm, me->comm, sizeof(lastcomm));
}
@@ -929,13 +921,3 @@ long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,
return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);
}
-static int __init ia32_init (void)
-{
- printk("IA32 emulation $Id: sys_ia32.c,v 1.32 2002/03/24 13:02:28 ak Exp $\n");
- return 0;
-}
-
-__initcall(ia32_init);
-
-extern unsigned long ia32_sys_call_table[];
-EXPORT_SYMBOL(ia32_sys_call_table);
diff --git a/arch/x86_64/ia32/vsyscall.lds b/arch/x86_64/ia32/vsyscall.lds
index f2e75ed4c6c7..1dc86ff5bcb9 100644
--- a/arch/x86_64/ia32/vsyscall.lds
+++ b/arch/x86_64/ia32/vsyscall.lds
@@ -11,6 +11,7 @@ SECTIONS
. = VSYSCALL_BASE + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 059c88313f4e..3c7cbff04d3d 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -8,9 +8,10 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
x8664_ksyms.o i387.o syscall.o vsyscall.o \
setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
- pci-dma.o pci-nommu.o
+ pci-dma.o pci-nommu.o alternative.o
-obj-$(CONFIG_X86_MCE) += mce.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
+obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o
obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
@@ -19,8 +20,8 @@ obj-$(CONFIG_X86_MSR) += msr.o
obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
-obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
+obj-y += apic.o nmi.o
+obj-y += io_apic.o mpparse.o \
genapic.o genapic_cluster.o genapic_flat.o
obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
@@ -28,19 +29,24 @@ obj-$(CONFIG_PM) += suspend.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
-obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
+obj-$(CONFIG_IOMMU) += pci-gart.o aperture.o
+obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary.o tce.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o
obj-$(CONFIG_X86_VSMP) += vsmp.o
+obj-$(CONFIG_K8_NB) += k8.o
+obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_PCI) += early-quirks.o
obj-y += topology.o
obj-y += intel_cacheinfo.o
CFLAGS_vsyscall.o := $(PROFILING) -g0
+therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o
bootflag-y += ../../i386/kernel/bootflag.o
cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o
topology-y += ../../i386/kernel/topology.o
@@ -49,3 +55,4 @@ intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
quirks-y += ../../i386/kernel/quirks.o
i8237-y += ../../i386/kernel/i8237.o
msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o
+alternative-y += ../../i386/kernel/alternative.o
diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
index 091bc79c888f..5ebf62c7a3d2 100644
--- a/arch/x86_64/kernel/acpi/sleep.c
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -26,7 +26,6 @@
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/types.h>
diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index 70b9d21ed675..b487396c4c5b 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -8,9 +8,7 @@
* because only the bootmem allocator can allocate 32+MB.
*
* Copyright 2002 Andi Kleen, SuSE Labs.
- * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $
*/
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
@@ -19,11 +17,13 @@
#include <linux/pci_ids.h>
#include <linux/pci.h>
#include <linux/bitops.h>
+#include <linux/ioport.h>
#include <asm/e820.h>
#include <asm/io.h>
#include <asm/proto.h>
#include <asm/pci-direct.h>
#include <asm/dma.h>
+#include <asm/k8.h>
int iommu_aperture;
int iommu_aperture_disabled __initdata = 0;
@@ -34,11 +34,21 @@ int fallback_aper_force __initdata = 0;
int fix_aperture __initdata = 1;
+static struct resource gart_resource = {
+ .name = "GART",
+ .flags = IORESOURCE_MEM,
+};
+
+static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
+{
+ gart_resource.start = aper_base;
+ gart_resource.end = aper_base + aper_size - 1;
+ insert_resource(&iomem_resource, &gart_resource);
+}
+
/* This code runs before the PCI subsystem is initialized, so just
access the northbridge directly. */
-#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16))
-
static u32 __init allocate_aperture(void)
{
pg_data_t *nd0 = NODE_DATA(0);
@@ -51,7 +61,7 @@ static u32 __init allocate_aperture(void)
/*
* Aperture has to be naturally aligned. This means an 2GB aperture won't
- * have much chances to find a place in the lower 4GB of memory.
+ * have much chance of finding a place in the lower 4GB of memory.
* Unfortunately we cannot move it up because that would make the
* IOMMU useless.
*/
@@ -65,23 +75,24 @@ static u32 __init allocate_aperture(void)
}
printk("Mapping aperture over %d KB of RAM @ %lx\n",
aper_size >> 10, __pa(p));
+ insert_aperture_resource((u32)__pa(p), aper_size);
return (u32)__pa(p);
}
-static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size)
+static int __init aperture_valid(u64 aper_base, u32 aper_size)
{
if (!aper_base)
return 0;
if (aper_size < 64*1024*1024) {
- printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20);
+ printk("Aperture too small (%d MB)\n", aper_size>>20);
return 0;
}
if (aper_base + aper_size >= 0xffffffff) {
- printk("Aperture from %s beyond 4GB. Ignoring.\n",name);
+ printk("Aperture beyond 4GB. Ignoring.\n");
return 0;
}
if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
- printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name);
+ printk("Aperture pointing to e820 RAM. Ignoring.\n");
return 0;
}
return 1;
@@ -140,7 +151,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)
printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",
aper, 32 << *order, apsizereg);
- if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order))
+ if (!aperture_valid(aper, (32*1024*1024) << *order))
return 0;
return (u32)aper;
}
@@ -201,17 +212,17 @@ void __init iommu_hole_init(void)
u64 aper_base, last_aper_base = 0;
int valid_agp = 0;
- if (iommu_aperture_disabled || !fix_aperture)
+ if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed())
return;
printk("Checking aperture...\n");
fix = 0;
for (num = 24; num < 32; num++) {
- char name[30];
- if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
- continue;
+ if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
+ continue;
+ iommu_detected = 1;
iommu_aperture = 1;
aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;
@@ -222,9 +233,7 @@ void __init iommu_hole_init(void)
printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,
aper_base, aper_size>>20);
- sprintf(name, "northbridge cpu %d", num-24);
-
- if (!aperture_valid(name, aper_base, aper_size)) {
+ if (!aperture_valid(aper_base, aper_size)) {
fix = 1;
break;
}
@@ -238,8 +247,13 @@ void __init iommu_hole_init(void)
last_aper_base = aper_base;
}
- if (!fix && !fallback_aper_force)
+ if (!fix && !fallback_aper_force) {
+ if (last_aper_base) {
+ unsigned long n = (32 * 1024 * 1024) << last_aper_order;
+ insert_aperture_resource((u32)last_aper_base, n);
+ }
return;
+ }
if (!fallback_aper_force)
aper_alloc = search_agp_bridge(&aper_order, &valid_agp);
@@ -273,7 +287,7 @@ void __init iommu_hole_init(void)
/* Fix up the north bridges */
for (num = 24; num < 32; num++) {
- if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)
+ if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))
continue;
/* Don't enable translation yet. That is done later.
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 29ef99001e05..135ff25e6b44 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -14,7 +14,6 @@
* Mikael Pettersson : PM converted to driver model.
*/
-#include <linux/config.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -26,6 +25,7 @@
#include <linux/kernel_stat.h>
#include <linux/sysdev.h>
#include <linux/module.h>
+#include <linux/ioport.h>
#include <asm/atomic.h>
#include <asm/smp.h>
@@ -37,13 +37,20 @@
#include <asm/idle.h>
#include <asm/proto.h>
#include <asm/timex.h>
+#include <asm/apic.h>
+int apic_mapped;
int apic_verbosity;
int apic_runs_main_timer;
int apic_calibrate_pmtmr __initdata;
int disable_apic_timer __initdata;
+static struct resource lapic_resource = {
+ .name = "Local APIC",
+ .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
+};
+
/*
* cpu_mask that denotes the CPUs that needs timer interrupt coming in as
* IPIs in place of local APIC timers
@@ -100,7 +107,7 @@ void clear_local_APIC(void)
maxlvt = get_maxlvt();
/*
- * Masking an LVT entry on a P6 can trigger a local APIC error
+ * Masking an LVT entry can trigger a local APIC error
* if the vector is zero. Mask LVTERR first to prevent this.
*/
if (maxlvt >= 3) {
@@ -137,72 +144,40 @@ void clear_local_APIC(void)
apic_read(APIC_ESR);
}
-void __init connect_bsp_APIC(void)
-{
- if (pic_mode) {
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
- /*
- * PIC mode, enable APIC mode in the IMCR, i.e.
- * connect BSP's local APIC to INT and NMI lines.
- */
- apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n");
- outb(0x70, 0x22);
- outb(0x01, 0x23);
- }
-}
-
void disconnect_bsp_APIC(int virt_wire_setup)
{
- if (pic_mode) {
- /*
- * Put the board back into PIC mode (has an effect
- * only on certain older boards). Note that APIC
- * interrupts, including IPIs, won't work beyond
- * this point! The only exception are INIT IPIs.
- */
- apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n");
- outb(0x70, 0x22);
- outb(0x00, 0x23);
- }
- else {
- /* Go back to Virtual Wire compatibility mode */
- unsigned long value;
-
- /* For the spurious interrupt use vector F, and enable it */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
- value |= 0xf;
- apic_write(APIC_SPIV, value);
-
- if (!virt_wire_setup) {
- /* For LVT0 make it edge triggered, active high, external and enabled */
- value = apic_read(APIC_LVT0);
- value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
- APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
- value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
- apic_write(APIC_LVT0, value);
- }
- else {
- /* Disable LVT0 */
- apic_write(APIC_LVT0, APIC_LVT_MASKED);
- }
+ /* Go back to Virtual Wire compatibility mode */
+ unsigned long value;
- /* For LVT1 make it edge triggered, active high, nmi and enabled */
- value = apic_read(APIC_LVT1);
- value &= ~(
- APIC_MODE_MASK | APIC_SEND_PENDING |
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write(APIC_SPIV, value);
+
+ if (!virt_wire_setup) {
+ /* For LVT0 make it edge triggered, active high, external and enabled */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
- APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
- value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
- apic_write(APIC_LVT1, value);
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write(APIC_LVT0, value);
+ } else {
+ /* Disable LVT0 */
+ apic_write(APIC_LVT0, APIC_LVT_MASKED);
}
+
+ /* For LVT1 make it edge triggered, active high, nmi and enabled */
+ value = apic_read(APIC_LVT1);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write(APIC_LVT1, value);
}
void disable_local_APIC(void)
@@ -298,8 +273,6 @@ void __init sync_Arb_IDs(void)
| APIC_DM_INIT);
}
-extern void __error_in_apic_c (void);
-
/*
* An initial setup of the virtual wire mode.
*/
@@ -346,8 +319,7 @@ void __cpuinit setup_local_APIC (void)
value = apic_read(APIC_LVR);
- if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
- __error_in_apic_c();
+ BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
/*
* Double-check whether this APIC is really registered.
@@ -400,32 +372,8 @@ void __cpuinit setup_local_APIC (void)
*/
value |= APIC_SPIV_APIC_ENABLED;
- /*
- * Some unknown Intel IO/APIC (or APIC) errata is biting us with
- * certain networking cards. If high frequency interrupts are
- * happening on a particular IOAPIC pin, plus the IOAPIC routing
- * entry is masked/unmasked at a high rate as well then sooner or
- * later IOAPIC line gets 'stuck', no more interrupts are received
- * from the device. If focus CPU is disabled then the hang goes
- * away, oh well :-(
- *
- * [ This bug can be reproduced easily with a level-triggered
- * PCI Ne2000 networking cards and PII/PIII processors, dual
- * BX chipset. ]
- */
- /*
- * Actually disabling the focus CPU check just makes the hang less
- * frequent as it makes the interrupt distributon model be more
- * like LRU than MRU (the short-term load is more even across CPUs).
- * See also the comment in end_level_ioapic_irq(). --macro
- */
-#if 1
- /* Enable focus processor (bit==0) */
- value &= ~APIC_SPIV_FOCUS_DISABLED;
-#else
- /* Disable focus processor (bit==1) */
- value |= APIC_SPIV_FOCUS_DISABLED;
-#endif
+ /* We always use processor focus */
+
/*
* Set spurious IRQ vector
*/
@@ -443,7 +391,7 @@ void __cpuinit setup_local_APIC (void)
* TODO: set up through-local-APIC from through-I/O-APIC? --macro
*/
value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && (pic_mode || !value)) {
+ if (!smp_processor_id() && !value) {
value = APIC_DM_EXTINT;
apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id());
} else {
@@ -480,8 +428,7 @@ void __cpuinit setup_local_APIC (void)
}
nmi_watchdog_default();
- if (nmi_watchdog == NMI_LOCAL_APIC)
- setup_apic_nmi_watchdog();
+ setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
@@ -528,8 +475,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
- local_save_flags(flags);
- local_irq_disable();
+ local_irq_save(flags);
disable_local_APIC();
local_irq_restore(flags);
return 0;
@@ -607,18 +553,24 @@ static void apic_pm_activate(void) { }
static int __init apic_set_verbosity(char *str)
{
+ if (str == NULL) {
+ skip_ioapic_setup = 0;
+ ioapic_force = 1;
+ return 0;
+ }
if (strcmp("debug", str) == 0)
apic_verbosity = APIC_DEBUG;
else if (strcmp("verbose", str) == 0)
apic_verbosity = APIC_VERBOSE;
- else
+ else {
printk(KERN_WARNING "APIC Verbosity level %s not recognised"
- " use apic=verbose or apic=debug", str);
+ " use apic=verbose or apic=debug\n", str);
+ return -EINVAL;
+ }
- return 1;
+ return 0;
}
-
-__setup("apic=", apic_set_verbosity);
+early_param("apic", apic_set_verbosity);
/*
* Detect and enable local APICs on non-SMP boards.
@@ -639,6 +591,40 @@ static int __init detect_init_APIC (void)
return 0;
}
+#ifdef CONFIG_X86_IO_APIC
+static struct resource * __init ioapic_setup_resources(void)
+{
+#define IOAPIC_RESOURCE_NAME_SIZE 11
+ unsigned long n;
+ struct resource *res;
+ char *mem;
+ int i;
+
+ if (nr_ioapics <= 0)
+ return NULL;
+
+ n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
+ n *= nr_ioapics;
+
+ res = alloc_bootmem(n);
+
+ if (!res)
+ return NULL;
+
+ memset(res, 0, n);
+ mem = (void *)&res[nr_ioapics];
+
+ for (i = 0; i < nr_ioapics; i++) {
+ res[i].name = mem;
+ res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
+ mem += IOAPIC_RESOURCE_NAME_SIZE;
+ }
+
+ return res;
+}
+#endif
+
void __init init_apic_mappings(void)
{
unsigned long apic_phys;
@@ -655,19 +641,26 @@ void __init init_apic_mappings(void)
apic_phys = mp_lapic_addr;
set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+ apic_mapped = 1;
apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys);
+ /* Put local APIC into the resource map. */
+ lapic_resource.start = apic_phys;
+ lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+ insert_resource(&iomem_resource, &lapic_resource);
+
/*
* Fetch the APIC ID of the BSP in case we have a
* default configuration (or the MP table is broken).
*/
boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
-#ifdef CONFIG_X86_IO_APIC
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
int i;
+ struct resource *ioapic_res;
+ ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
ioapic_phys = mp_ioapics[i].mpc_apicaddr;
@@ -679,9 +672,15 @@ void __init init_apic_mappings(void)
apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n",
__fix_to_virt(idx), ioapic_phys);
idx++;
+
+ if (ioapic_res) {
+ ioapic_res->start = ioapic_phys;
+ ioapic_res->end = ioapic_phys + (4 * 1024) - 1;
+ insert_resource(&iomem_resource, ioapic_res);
+ ioapic_res++;
+ }
}
}
-#endif
}
/*
@@ -851,7 +850,18 @@ void disable_APIC_timer(void)
unsigned long v;
v = apic_read(APIC_LVTT);
- apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
+ /*
+ * When an illegal vector value (0-15) is written to an LVT
+ * entry and delivery mode is Fixed, the APIC may signal an
+ * illegal vector error, without regard to whether the mask
+ * bit is set or whether an interrupt is actually seen on input.
+ *
+ * Boot sequence might call this function when the LVTT has
+ * '0' vector value. So make sure vector field is set to
+ * valid value.
+ */
+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, v);
}
}
@@ -909,15 +919,13 @@ int setup_profiling_timer(unsigned int multiplier)
return -EINVAL;
}
-#ifdef CONFIG_X86_MCE_AMD
-void setup_threshold_lvt(unsigned long lvt_off)
+void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector,
+ unsigned char msg_type, unsigned char mask)
{
- unsigned int v = 0;
- unsigned long reg = (lvt_off << 4) + 0x500;
- v |= THRESHOLD_APIC_VECTOR;
+ unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE;
+ unsigned int v = (mask << 16) | (msg_type << 8) | vector;
apic_write(reg, v);
}
-#endif /* CONFIG_X86_MCE_AMD */
#undef APIC_DIVISOR
@@ -943,7 +951,7 @@ void smp_local_timer_interrupt(struct pt_regs *regs)
* We take the 'long' return path, and there every subsystem
* grabs the appropriate locks (kernel lock/ irq lock).
*
- * we might want to decouple profiling from the 'long path',
+ * We might want to decouple profiling from the 'long path',
* and do the profiling totally in assembly.
*
* Currently this isn't too much of an issue (performance wise),
@@ -983,7 +991,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
}
/*
- * oem_force_hpet_timer -- force HPET mode for some boxes.
+ * apic_is_clustered_box() -- Check if we can expect good TSC
*
* Thus far, the major user of this is IBM's Summit2 series:
*
@@ -991,7 +999,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
* multi-chassis. Use available data to take a good guess.
* If in doubt, go HPET.
*/
-__cpuinit int oem_force_hpet_timer(void)
+__cpuinit int apic_is_clustered_box(void)
{
int i, clusters, zeros;
unsigned id;
@@ -1022,8 +1030,7 @@ __cpuinit int oem_force_hpet_timer(void)
}
/*
- * If clusters > 2, then should be multi-chassis. Return 1 for HPET.
- * Else return 0 to use TSC.
+ * If clusters > 2, then should be multi-chassis.
* May have to revisit this when multi-core + hyperthreaded CPUs come
* out, but AFAIK this will work even for them.
*/
@@ -1116,19 +1123,15 @@ int __init APIC_init_uniprocessor (void)
verify_local_APIC();
- connect_bsp_APIC();
-
phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
setup_local_APIC();
-#ifdef CONFIG_X86_IO_APIC
if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
+ setup_IO_APIC();
else
nr_ioapics = 0;
-#endif
setup_boot_APIC_clock();
check_nmi_watchdog();
return 0;
@@ -1137,14 +1140,17 @@ int __init APIC_init_uniprocessor (void)
static __init int setup_disableapic(char *str)
{
disable_apic = 1;
- return 1;
-}
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+ return 0;
+}
+early_param("disableapic", setup_disableapic);
+/* same as disableapic, for compatibility */
static __init int setup_nolapic(char *str)
{
- disable_apic = 1;
- return 1;
+ return setup_disableapic(str);
}
+early_param("nolapic", setup_nolapic);
static __init int setup_noapictimer(char *str)
{
@@ -1177,11 +1183,5 @@ static __init int setup_apicpmtimer(char *s)
}
__setup("apicpmtimer", setup_apicpmtimer);
-/* dummy parsing: see setup.c */
-
-__setup("disableapic", setup_disableapic);
-__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */
-
__setup("noapictimer", setup_noapictimer);
-/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */
diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
index 38834bbbae11..96687e2beb2c 100644
--- a/arch/x86_64/kernel/asm-offsets.c
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -4,6 +4,7 @@
* and format the required data.
*/
+#include <linux/crypto.h>
#include <linux/sched.h>
#include <linux/stddef.h>
#include <linux/errno.h>
@@ -68,5 +69,7 @@ int main(void)
DEFINE(pbe_next, offsetof(struct pbe, next));
BLANK();
DEFINE(TSS_ist, offsetof(struct tss_struct, ist));
+ BLANK();
+ DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
return 0;
}
diff --git a/arch/x86_64/kernel/audit.c b/arch/x86_64/kernel/audit.c
new file mode 100644
index 000000000000..21f33387bef3
--- /dev/null
+++ b/arch/x86_64/kernel/audit.c
@@ -0,0 +1,64 @@
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/audit.h>
+#include <asm/unistd.h>
+
+static unsigned dir_class[] = {
+#include <asm-generic/audit_dir_write.h>
+~0U
+};
+
+static unsigned read_class[] = {
+#include <asm-generic/audit_read.h>
+~0U
+};
+
+static unsigned write_class[] = {
+#include <asm-generic/audit_write.h>
+~0U
+};
+
+static unsigned chattr_class[] = {
+#include <asm-generic/audit_change_attr.h>
+~0U
+};
+
+int audit_classify_syscall(int abi, unsigned syscall)
+{
+#ifdef CONFIG_IA32_EMULATION
+ extern int ia32_classify_syscall(unsigned);
+ if (abi == AUDIT_ARCH_I386)
+ return ia32_classify_syscall(syscall);
+#endif
+ switch(syscall) {
+ case __NR_open:
+ return 2;
+ case __NR_openat:
+ return 3;
+ case __NR_execve:
+ return 5;
+ default:
+ return 0;
+ }
+}
+
+static int __init audit_classes_init(void)
+{
+#ifdef CONFIG_IA32_EMULATION
+ extern __u32 ia32_dir_class[];
+ extern __u32 ia32_write_class[];
+ extern __u32 ia32_read_class[];
+ extern __u32 ia32_chattr_class[];
+ audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class);
+ audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class);
+ audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class);
+ audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class);
+#endif
+ audit_register_class(AUDIT_CLASS_WRITE, write_class);
+ audit_register_class(AUDIT_CLASS_READ, read_class);
+ audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class);
+ audit_register_class(AUDIT_CLASS_CHATTR, chattr_class);
+ return 0;
+}
+
+__initcall(audit_classes_init);
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index 4e6c3b729e39..3525f884af82 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -23,6 +23,7 @@
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/mach_apic.h>
+#include <asm/kdebug.h>
/* This keeps a track of which one is crashing cpu. */
static int crashing_cpu;
@@ -68,7 +69,7 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
* for the data I pass, and I need tags
* on the data to indicate what information I have
* squirrelled away. ELF notes happen to provide
- * all of that that no need to invent something new.
+ * all of that, no need to invent something new.
*/
buf = (u32*)per_cpu_ptr(crash_notes, cpu);
@@ -95,15 +96,25 @@ static void crash_save_self(struct pt_regs *regs)
#ifdef CONFIG_SMP
static atomic_t waiting_for_crash_ipi;
-static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+static int crash_nmi_callback(struct notifier_block *self,
+ unsigned long val, void *data)
{
+ struct pt_regs *regs;
+ int cpu;
+
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_OK;
+
+ regs = ((struct die_args *)data)->regs;
+ cpu = raw_smp_processor_id();
+
/*
* Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
if (cpu == crashing_cpu)
- return 1;
+ return NOTIFY_STOP;
local_irq_disable();
crash_save_this_cpu(regs, cpu);
@@ -111,14 +122,14 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
for(;;)
- asm("hlt");
+ halt();
return 1;
}
static void smp_send_nmi_allbutself(void)
{
- send_IPI_allbutself(APIC_DM_NMI);
+ send_IPI_allbutself(NMI_VECTOR);
}
/*
@@ -127,12 +138,17 @@ static void smp_send_nmi_allbutself(void)
* cpu hotplug shouldn't matter.
*/
+static struct notifier_block crash_nmi_nb = {
+ .notifier_call = crash_nmi_callback,
+};
+
static void nmi_shootdown_cpus(void)
{
unsigned long msecs;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- set_nmi_callback(crash_nmi_callback);
+ if (register_die_notifier(&crash_nmi_nb))
+ return; /* return what? */
/*
* Ensure the new callback function is set before sending
@@ -161,7 +177,7 @@ void machine_crash_shutdown(struct pt_regs *regs)
{
/*
* This function is only called after the system
- * has paniced or is otherwise in a critical state.
+ * has panicked or is otherwise in a critical state.
* The minimum amount of code to allow a kexec'd kernel
* to run successfully needs to happen here.
*
@@ -178,9 +194,7 @@ void machine_crash_shutdown(struct pt_regs *regs)
if(cpu_has_apic)
disable_local_APIC();
-#if defined(CONFIG_X86_IO_APIC)
disable_IO_APIC();
-#endif
crash_save_self(regs);
}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 1ef6028f721e..b3f0908668ec 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -1,7 +1,6 @@
/*
* Handle the memory map.
* The functions here do the job until bootmem takes over.
- * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $
*
* Getting sanitize_e820_map() in sync with i386 version by applying change:
* - Provisions for empty E820 memory regions (reported by certain BIOSes).
@@ -9,7 +8,6 @@
* Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
*
*/
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
@@ -18,13 +16,17 @@
#include <linux/string.h>
#include <linux/kexec.h>
#include <linux/module.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/bootsetup.h>
#include <asm/sections.h>
+struct e820map e820 __initdata;
+
/*
* PFN of last memory page.
*/
@@ -41,7 +43,7 @@ unsigned long end_pfn_map;
/*
* Last pfn which the user wants to use.
*/
-unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT;
+static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT;
extern struct resource code_resource, data_resource;
@@ -70,9 +72,8 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size)
return 1;
}
#endif
- /* kernel code + 640k memory hole (later should not be needed, but
- be paranoid for now) */
- if (last >= 640*1024 && addr < __pa_symbol(&_end)) {
+ /* kernel code */
+ if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) {
*addrp = __pa_symbol(&_end);
return 1;
}
@@ -161,59 +162,14 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi
return -1UL;
}
-/*
- * Free bootmem based on the e820 table for a node.
- */
-void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end)
-{
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long last, addr;
-
- if (ei->type != E820_RAM ||
- ei->addr+ei->size <= start ||
- ei->addr >= end)
- continue;
-
- addr = round_up(ei->addr, PAGE_SIZE);
- if (addr < start)
- addr = start;
-
- last = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (last >= end)
- last = end;
-
- if (last > addr && last-addr >= PAGE_SIZE)
- free_bootmem_node(pgdat, addr, last-addr);
- }
-}
-
/*
* Find the highest page frame number we have available
*/
unsigned long __init e820_end_of_ram(void)
{
- int i;
unsigned long end_pfn = 0;
+ end_pfn = find_max_pfn_with_active_regions();
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long start, end;
-
- start = round_up(ei->addr, PAGE_SIZE);
- end = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (start >= end)
- continue;
- if (ei->type == E820_RAM) {
- if (end > end_pfn<<PAGE_SHIFT)
- end_pfn = end>>PAGE_SHIFT;
- } else {
- if (end > end_pfn_map<<PAGE_SHIFT)
- end_pfn_map = end>>PAGE_SHIFT;
- }
- }
-
if (end_pfn > end_pfn_map)
end_pfn_map = end_pfn;
if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
@@ -223,43 +179,10 @@ unsigned long __init e820_end_of_ram(void)
if (end_pfn > end_pfn_map)
end_pfn = end_pfn_map;
+ printk("end_pfn_map = %lu\n", end_pfn_map);
return end_pfn;
}
-/*
- * Compute how much memory is missing in a range.
- * Unlike the other functions in this file the arguments are in page numbers.
- */
-unsigned long __init
-e820_hole_size(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long ram = 0;
- unsigned long start = start_pfn << PAGE_SHIFT;
- unsigned long end = end_pfn << PAGE_SHIFT;
- int i;
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long last, addr;
-
- if (ei->type != E820_RAM ||
- ei->addr+ei->size <= start ||
- ei->addr >= end)
- continue;
-
- addr = round_up(ei->addr, PAGE_SIZE);
- if (addr < start)
- addr = start;
-
- last = round_down(ei->addr + ei->size, PAGE_SIZE);
- if (last >= end)
- last = end;
-
- if (last > addr)
- ram += last - addr;
- }
- return ((end - start) - ram) >> PAGE_SHIFT;
-}
-
/*
* Mark e820 reserved areas as busy for the resource manager.
*/
@@ -294,6 +217,96 @@ void __init e820_reserve_resources(void)
}
}
+/* Mark pages corresponding to given address range as nosave */
+static void __init
+e820_mark_nosave_range(unsigned long start, unsigned long end)
+{
+ unsigned long pfn, max_pfn;
+
+ if (start >= end)
+ return;
+
+ printk("Nosave address range: %016lx - %016lx\n", start, end);
+ max_pfn = end >> PAGE_SHIFT;
+ for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++)
+ if (pfn_valid(pfn))
+ SetPageNosave(pfn_to_page(pfn));
+}
+
+/*
+ * Find the ranges of physical addresses that do not correspond to
+ * e820 RAM areas and mark the corresponding pages as nosave for software
+ * suspend and suspend to RAM.
+ *
+ * This function requires the e820 map to be sorted and without any
+ * overlapping entries and assumes the first e820 area to be RAM.
+ */
+void __init e820_mark_nosave_regions(void)
+{
+ int i;
+ unsigned long paddr;
+
+ paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE);
+ for (i = 1; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (paddr < ei->addr)
+ e820_mark_nosave_range(paddr,
+ round_up(ei->addr, PAGE_SIZE));
+
+ paddr = round_down(ei->addr + ei->size, PAGE_SIZE);
+ if (ei->type != E820_RAM)
+ e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE),
+ paddr);
+
+ if (paddr >= (end_pfn << PAGE_SHIFT))
+ break;
+ }
+}
+
+/* Walk the e820 map and register active regions within a node */
+void __init
+e820_register_active_regions(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ int i;
+ unsigned long ei_startpfn, ei_endpfn;
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT;
+ ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE)
+ >> PAGE_SHIFT;
+
+ /* Skip map entries smaller than a page */
+ if (ei_startpfn > ei_endpfn)
+ continue;
+
+ /* Check if end_pfn_map should be updated */
+ if (ei->type != E820_RAM && ei_endpfn > end_pfn_map)
+ end_pfn_map = ei_endpfn;
+
+ /* Skip if map is outside the node */
+ if (ei->type != E820_RAM ||
+ ei_endpfn <= start_pfn ||
+ ei_startpfn >= end_pfn)
+ continue;
+
+ /* Check for overlaps */
+ if (ei_startpfn < start_pfn)
+ ei_startpfn = start_pfn;
+ if (ei_endpfn > end_pfn)
+ ei_endpfn = end_pfn;
+
+ /* Obey end_user_pfn to save on memmap */
+ if (ei_startpfn >= end_user_pfn)
+ continue;
+ if (ei_endpfn > end_user_pfn)
+ ei_endpfn = end_user_pfn;
+
+ add_active_range(nid, ei_startpfn, ei_endpfn);
+ }
+}
+
/*
* Add a memory region to the kernel e820 map.
*/
@@ -514,13 +527,6 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
* If we're lucky and live on a modern system, the setup code
* will have given us a memory map that we can use to properly
* set up memory. If we aren't, we'll fake a memory map.
- *
- * We check to see that the memory map contains at least 2 elements
- * before we'll use it, because the detection code in setup.S may
- * not be perfect and most every PC known to man has two memory
- * regions: one from 0 to 640k, and one from 1mb up. (The IBM
- * thinkpad 560x, for example, does not cooperate with the memory
- * detection code.)
*/
static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
{
@@ -538,34 +544,19 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
if (start > end)
return -1;
- /*
- * Some BIOSes claim RAM in the 640k - 1M region.
- * Not right. Fix it up.
- *
- * This should be removed on Hammer which is supposed to not
- * have non e820 covered ISA mappings there, but I had some strange
- * problems so it stays for now. -AK
- */
- if (type == E820_RAM) {
- if (start < 0x100000ULL && end > 0xA0000ULL) {
- if (start < 0xA0000ULL)
- add_memory_region(start, 0xA0000ULL-start, type);
- if (end <= 0x100000ULL)
- continue;
- start = 0x100000ULL;
- size = end - start;
- }
- }
-
add_memory_region(start, size, type);
} while (biosmap++,--nr_map);
return 0;
}
-void __init setup_memory_region(void)
+void early_panic(char *msg)
{
- char *who = "BIOS-e820";
+ early_printk(msg);
+ panic(msg);
+}
+void __init setup_memory_region(void)
+{
/*
* Try to copy the BIOS-supplied E820-map.
*
@@ -573,54 +564,74 @@ void __init setup_memory_region(void)
* the next section from 1mb->appropriate_mem_k
*/
sanitize_e820_map(E820_MAP, &E820_MAP_NR);
- if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) {
- unsigned long mem_size;
-
- /* compare results from other methods and take the greater */
- if (ALT_MEM_K < EXT_MEM_K) {
- mem_size = EXT_MEM_K;
- who = "BIOS-88";
- } else {
- mem_size = ALT_MEM_K;
- who = "BIOS-e801";
- }
-
- e820.nr_map = 0;
- add_memory_region(0, LOWMEMSIZE(), E820_RAM);
- add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM);
- }
+ if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0)
+ early_panic("Cannot find a valid memory map");
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- e820_print_map(who);
+ e820_print_map("BIOS-e820");
}
-void __init parse_memopt(char *p, char **from)
-{
- end_user_pfn = memparse(p, from);
+static int __init parse_memopt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ end_user_pfn = memparse(p, &p);
end_user_pfn >>= PAGE_SHIFT;
+ return 0;
}
+early_param("mem", parse_memopt);
+
+static int userdef __initdata;
-void __init parse_memmapopt(char *p, char **from)
+static int __init parse_memmap_opt(char *p)
{
+ char *oldp;
unsigned long long start_at, mem_size;
- mem_size = memparse(p, from);
- p = *from;
+ if (!strcmp(p, "exactmap")) {
+#ifdef CONFIG_CRASH_DUMP
+ /* If we are doing a crash dump, we
+ * still need to know the real mem
+ * size before original memory map is
+ * reset.
+ */
+ saved_max_pfn = e820_end_of_ram();
+#endif
+ end_pfn_map = 0;
+ e820.nr_map = 0;
+ userdef = 1;
+ return 0;
+ }
+
+ oldp = p;
+ mem_size = memparse(p, &p);
+ if (p == oldp)
+ return -EINVAL;
if (*p == '@') {
- start_at = memparse(p+1, from);
+ start_at = memparse(p+1, &p);
add_memory_region(start_at, mem_size, E820_RAM);
} else if (*p == '#') {
- start_at = memparse(p+1, from);
+ start_at = memparse(p+1, &p);
add_memory_region(start_at, mem_size, E820_ACPI);
} else if (*p == '$') {
- start_at = memparse(p+1, from);
+ start_at = memparse(p+1, &p);
add_memory_region(start_at, mem_size, E820_RESERVED);
} else {
end_user_pfn = (mem_size >> PAGE_SHIFT);
}
- p = *from;
+ return *p == '\0' ? 0 : -EINVAL;
+}
+early_param("memmap", parse_memmap_opt);
+
+void finish_e820_parsing(void)
+{
+ if (userdef) {
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ e820_print_map("user");
+ }
}
unsigned long pci_mem_start = 0xaeedbabe;
+EXPORT_SYMBOL(pci_mem_start);
/*
* Search for the biggest gap in the low 32 bits of the e820
diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c
new file mode 100644
index 000000000000..208e38a372c1
--- /dev/null
+++ b/arch/x86_64/kernel/early-quirks.c
@@ -0,0 +1,122 @@
+/* Various workarounds for chipset bugs.
+ This code runs very early and can't use the regular PCI subsystem.
+ The entries are keyed to PCI bridges which usually identify chipsets
+ uniquely.
+ This is only for whole classes of chipsets with specific problems which
+ need early invasive action (e.g. before the timers are initialized).
+ Most PCI device specific workarounds can be done later and should be
+ in standard PCI quirks.
+ Mainboard specific bugs should be handled by DMI entries.
+ CPU specific bugs are handled in setup.c. */
+
+#include <linux/pci.h>
+#include <linux/acpi.h>
+#include <linux/pci_ids.h>
+#include <asm/pci-direct.h>
+#include <asm/proto.h>
+#include <asm/dma.h>
+
+static void via_bugs(void)
+{
+#ifdef CONFIG_IOMMU
+ if ((end_pfn > MAX_DMA32_PFN || force_iommu) &&
+ !iommu_aperture_allowed) {
+ printk(KERN_INFO
+ "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n");
+ iommu_aperture_disabled = 1;
+ }
+#endif
+}
+
+#ifdef CONFIG_ACPI
+
+static int nvidia_hpet_detected __initdata;
+
+static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
+{
+ nvidia_hpet_detected = 1;
+ return 0;
+}
+#endif
+
+static void nvidia_bugs(void)
+{
+#ifdef CONFIG_ACPI
+ /*
+ * All timer overrides on Nvidia are
+ * wrong unless HPET is enabled.
+ */
+ nvidia_hpet_detected = 0;
+ acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
+ if (nvidia_hpet_detected == 0) {
+ acpi_skip_timer_override = 1;
+ printk(KERN_INFO "Nvidia board "
+ "detected. Ignoring ACPI "
+ "timer override.\n");
+ }
+#endif
+ /* RED-PEN skip them on mptables too? */
+
+}
+
+static void ati_bugs(void)
+{
+#if 1 /* for testing */
+ printk("ATI board detected\n");
+#endif
+ /* No bugs right now */
+}
+
+struct chipset {
+ u16 vendor;
+ void (*f)(void);
+};
+
+static struct chipset early_qrk[] = {
+ { PCI_VENDOR_ID_NVIDIA, nvidia_bugs },
+ { PCI_VENDOR_ID_VIA, via_bugs },
+ { PCI_VENDOR_ID_ATI, ati_bugs },
+ {}
+};
+
+void __init early_quirks(void)
+{
+ int num, slot, func;
+
+ if (!early_pci_allowed())
+ return;
+
+ /* Poor man's PCI discovery */
+ for (num = 0; num < 32; num++) {
+ for (slot = 0; slot < 32; slot++) {
+ for (func = 0; func < 8; func++) {
+ u32 class;
+ u32 vendor;
+ u8 type;
+ int i;
+ class = read_pci_config(num,slot,func,
+ PCI_CLASS_REVISION);
+ if (class == 0xffffffff)
+ break;
+
+ if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
+ continue;
+
+ vendor = read_pci_config(num, slot, func,
+ PCI_VENDOR_ID);
+ vendor &= 0xffff;
+
+ for (i = 0; early_qrk[i].f; i++)
+ if (early_qrk[i].vendor == vendor) {
+ early_qrk[i].f();
+ return;
+ }
+
+ type = read_pci_config_byte(num, slot, func,
+ PCI_HEADER_TYPE);
+ if (!(type & 0x80))
+ break;
+ }
+ }
+ }
+}
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index b93ef5b51980..e22ecd54870d 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -2,7 +2,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/tty.h>
+#include <linux/screen_info.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
@@ -215,20 +215,16 @@ void early_printk(const char *fmt, ...)
static int __initdata keep_early;
-int __init setup_early_printk(char *opt)
+static int __init setup_early_printk(char *buf)
{
- char *space;
- char buf[256];
+ if (!buf)
+ return 0;
if (early_console_initialized)
- return 1;
-
- strlcpy(buf,opt,sizeof(buf));
- space = strchr(buf, ' ');
- if (space)
- *space = 0;
+ return 0;
+ early_console_initialized = 1;
- if (strstr(buf,"keep"))
+ if (!strcmp(buf,"keep"))
keep_early = 1;
if (!strncmp(buf, "serial", 6)) {
@@ -248,11 +244,12 @@ int __init setup_early_printk(char *opt)
early_console = &simnow_console;
keep_early = 1;
}
- early_console_initialized = 1;
register_console(early_console);
return 0;
}
+early_param("earlyprintk", setup_early_printk);
+
void __init disable_early_printk(void)
{
if (!early_console_initialized || !early_console)
@@ -266,4 +263,3 @@ void __init disable_early_printk(void)
}
}
-__setup("earlyprintk=", setup_early_printk);
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 586b34c00c48..2802524104f3 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -4,8 +4,6 @@
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
- *
- * $Id$
*/
/*
@@ -22,16 +20,25 @@
* at the top of the kernel process stack.
* - partial stack frame: partially saved registers upto R11.
* - full stack frame: Like partial stack frame, but all register saved.
- *
- * TODO:
- * - schedule it carefully for the final hardware.
+ *
+ * Some macro usage:
+ * - CFI macros are used to generate dwarf2 unwind information for better
+ * backtraces. They don't change any code.
+ * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
+ * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
+ * There are unfortunately lots of special cases where some registers
+ * are not touched. The macro is a big mess that should be cleaned up.
+ * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
+ * Gives a full stack frame.
+ * - ENTRY/END Define functions in the symbol table.
+ * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
+ * frame that is otherwise undefined after a SYSCALL
+ * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
+ * - errorentry/paranoidentry/zeroentry - Define exception entry points.
*/
-#define ASSEMBLY 1
-#include <linux/config.h>
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/smp.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/dwarf2.h>
@@ -42,13 +49,24 @@
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page.h>
+#include <asm/irqflags.h>
.code64
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
-
+
+
+.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+#ifdef CONFIG_TRACE_IRQFLAGS
+ bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ jnc 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
/*
* C code is not supposed to know about undefined top of stack. Every time
* a C function with an pt_regs argument is called from the SYSCALL based
@@ -105,6 +123,7 @@
.macro CFI_DEFAULT_STACK start=1
.if \start
CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8
.else
CFI_DEF_CFA_OFFSET SS+8
@@ -136,6 +155,10 @@
/* rdi: prev */
ENTRY(ret_from_fork)
CFI_DEFAULT_STACK
+ push kernel_eflags(%rip)
+ CFI_ADJUST_CFA_OFFSET 4
+ popf # reset kernel eflags
+ CFI_ADJUST_CFA_OFFSET -4
call schedule_tail
GET_THREAD_INFO(%rcx)
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
@@ -154,6 +177,7 @@ rff_trace:
GET_THREAD_INFO(%rcx)
jmp rff_action
CFI_ENDPROC
+END(ret_from_fork)
/*
* System call entry. Upto 6 arguments in registers are supported.
@@ -188,12 +212,17 @@ rff_trace:
ENTRY(system_call)
CFI_STARTPROC simple
- CFI_DEF_CFA rsp,0
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA rsp,PDA_STACKOFFSET
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
swapgs
movq %rsp,%gs:pda_oldrsp
movq %gs:pda_kernelstack,%rsp
+ /*
+ * No need to follow this irqs off/on section - it's straight
+ * and short:
+ */
sti
SAVE_ARGS 8,1
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
@@ -219,10 +248,15 @@ ret_from_sys_call:
sysret_check:
GET_THREAD_INFO(%rcx)
cli
+ TRACE_IRQS_OFF
movl threadinfo_flags(%rcx),%edx
andl %edi,%edx
CFI_REMEMBER_STATE
jnz sysret_careful
+ /*
+ * sysretq will re-enable interrupts:
+ */
+ TRACE_IRQS_ON
movq RIP-ARGOFFSET(%rsp),%rcx
CFI_REGISTER rip,rcx
RESTORE_ARGS 0,-ARG_SKIP,1
@@ -237,6 +271,7 @@ sysret_careful:
CFI_RESTORE_STATE
bt $TIF_NEED_RESCHED,%edx
jnc sysret_signal
+ TRACE_IRQS_ON
sti
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
@@ -247,6 +282,7 @@ sysret_careful:
/* Handle a signal */
sysret_signal:
+ TRACE_IRQS_ON
sti
testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
jz 1f
@@ -261,6 +297,7 @@ sysret_signal:
/* Use IRET because user could have changed frame. This
works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
cli
+ TRACE_IRQS_OFF
jmp int_with_check
badsys:
@@ -285,6 +322,7 @@ tracesys:
/* Use IRET because user could have changed frame */
jmp int_ret_from_sys_call
CFI_ENDPROC
+END(system_call)
/*
* Syscall return path ending with IRET.
@@ -292,6 +330,7 @@ tracesys:
*/
ENTRY(int_ret_from_sys_call)
CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8-ARGOFFSET
/*CFI_REL_OFFSET ss,SS-ARGOFFSET*/
CFI_REL_OFFSET rsp,RSP-ARGOFFSET
@@ -308,6 +347,7 @@ ENTRY(int_ret_from_sys_call)
CFI_REL_OFFSET r10,R10-ARGOFFSET
CFI_REL_OFFSET r11,R11-ARGOFFSET
cli
+ TRACE_IRQS_OFF
testl $3,CS-ARGOFFSET(%rsp)
je retint_restore_args
movl $_TIF_ALLWORK_MASK,%edi
@@ -326,6 +366,7 @@ int_with_check:
int_careful:
bt $TIF_NEED_RESCHED,%edx
jnc int_very_careful
+ TRACE_IRQS_ON
sti
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
@@ -333,10 +374,12 @@ int_careful:
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
cli
+ TRACE_IRQS_OFF
jmp int_with_check
/* handle signals and tracing -- both require a full stack frame */
int_very_careful:
+ TRACE_IRQS_ON
sti
SAVE_REST
/* Check for syscall exit trace */
@@ -350,6 +393,7 @@ int_very_careful:
CFI_ADJUST_CFA_OFFSET -8
andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
cli
+ TRACE_IRQS_OFF
jmp int_restore_rest
int_signal:
@@ -362,8 +406,10 @@ int_signal:
int_restore_rest:
RESTORE_REST
cli
+ TRACE_IRQS_OFF
jmp int_with_check
CFI_ENDPROC
+END(int_ret_from_sys_call)
/*
* Certain special system calls that need to save a complete full stack frame.
@@ -375,6 +421,7 @@ int_restore_rest:
leaq \func(%rip),%rax
leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
jmp ptregscall_common
+END(\label)
.endm
CFI_STARTPROC
@@ -404,6 +451,7 @@ ENTRY(ptregscall_common)
CFI_REL_OFFSET rip, 0
ret
CFI_ENDPROC
+END(ptregscall_common)
ENTRY(stub_execve)
CFI_STARTPROC
@@ -418,6 +466,7 @@ ENTRY(stub_execve)
RESTORE_REST
jmp int_ret_from_sys_call
CFI_ENDPROC
+END(stub_execve)
/*
* sigreturn is special because it needs to restore all registers on return.
@@ -435,12 +484,14 @@ ENTRY(stub_rt_sigreturn)
RESTORE_REST
jmp int_ret_from_sys_call
CFI_ENDPROC
+END(stub_rt_sigreturn)
/*
* initial frame state for interrupts and exceptions
*/
.macro _frame ref
CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,SS+8-\ref
/*CFI_REL_OFFSET ss,SS-\ref*/
CFI_REL_OFFSET rsp,RSP-\ref
@@ -466,29 +517,28 @@ ENTRY(stub_rt_sigreturn)
/* 0(%rsp): interrupt number */
.macro interrupt func
cld
-#ifdef CONFIG_DEBUG_INFO
- SAVE_ALL
- movq %rsp,%rdi
- /*
- * Setup a stack frame pointer. This allows gdb to trace
- * back to the original stack.
- */
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
-#else
SAVE_ARGS
leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
-#endif
+ pushq %rbp
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbp, 0
+ movq %rsp,%rbp
+ CFI_DEF_CFA_REGISTER rbp
testl $3,CS(%rdi)
je 1f
swapgs
-1: incl %gs:pda_irqcount # RED-PEN should check preempt count
- movq %gs:pda_irqstackptr,%rax
- cmoveq %rax,%rsp /*todo This needs CFI annotation! */
- pushq %rdi # save old stack
-#ifndef CONFIG_DEBUG_INFO
- CFI_ADJUST_CFA_OFFSET 8
-#endif
+ /* irqcount is used to check if a CPU is already on an interrupt
+ stack or not. While this is essentially redundant with preempt_count
+ it is a little cheaper to use a separate counter in the PDA
+ (short of moving irq_enter into assembly, which would be too
+ much work) */
+1: incl %gs:pda_irqcount
+ cmoveq %gs:pda_irqstackptr,%rsp
+ push %rbp # backlink for old unwinder
+ /*
+ * We entered an interrupt context - irqs are off:
+ */
+ TRACE_IRQS_OFF
call \func
.endm
@@ -497,17 +547,12 @@ ENTRY(common_interrupt)
interrupt do_IRQ
/* 0(%rsp): oldrsp-ARGOFFSET */
ret_from_intr:
- popq %rdi
-#ifndef CONFIG_DEBUG_INFO
- CFI_ADJUST_CFA_OFFSET -8
-#endif
cli
+ TRACE_IRQS_OFF
decl %gs:pda_irqcount
-#ifdef CONFIG_DEBUG_INFO
- movq RBP(%rdi),%rbp
+ leaveq
CFI_DEF_CFA_REGISTER rsp
-#endif
- leaq ARGOFFSET(%rdi),%rsp /*todo This needs CFI annotation! */
+ CFI_ADJUST_CFA_OFFSET -8
exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
@@ -526,9 +571,21 @@ retint_check:
CFI_REMEMBER_STATE
jnz retint_careful
retint_swapgs:
+ /*
+ * The iretq could re-enable interrupts:
+ */
+ cli
+ TRACE_IRQS_IRETQ
swapgs
+ jmp restore_args
+
retint_restore_args:
cli
+ /*
+ * The iretq could re-enable interrupts:
+ */
+ TRACE_IRQS_IRETQ
+restore_args:
RESTORE_ARGS 0,8,0
iret_label:
iretq
@@ -541,6 +598,7 @@ iret_label:
/* running with kernel gs */
bad_iret:
movq $11,%rdi /* SIGSEGV */
+ TRACE_IRQS_ON
sti
jmp do_exit
.previous
@@ -550,6 +608,7 @@ retint_careful:
CFI_RESTORE_STATE
bt $TIF_NEED_RESCHED,%edx
jnc retint_signal
+ TRACE_IRQS_ON
sti
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
@@ -558,11 +617,13 @@ retint_careful:
CFI_ADJUST_CFA_OFFSET -8
GET_THREAD_INFO(%rcx)
cli
+ TRACE_IRQS_OFF
jmp retint_check
retint_signal:
testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
jz retint_swapgs
+ TRACE_IRQS_ON
sti
SAVE_REST
movq $-1,ORIG_RAX(%rsp)
@@ -571,6 +632,7 @@ retint_signal:
call do_notify_resume
RESTORE_REST
cli
+ TRACE_IRQS_OFF
movl $_TIF_NEED_RESCHED,%edi
GET_THREAD_INFO(%rcx)
jmp retint_check
@@ -578,8 +640,7 @@ retint_signal:
#ifdef CONFIG_PREEMPT
/* Returning to kernel space. Check if we need preemption */
/* rcx: threadinfo. interrupts off. */
- .p2align
-retint_kernel:
+ENTRY(retint_kernel)
cmpl $0,threadinfo_preempt_count(%rcx)
jnz retint_restore_args
bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx)
@@ -589,14 +650,16 @@ retint_kernel:
call preempt_schedule_irq
jmp exit_intr
#endif
+
CFI_ENDPROC
+END(common_interrupt)
/*
* APIC interrupts.
*/
.macro apicinterrupt num,func
INTR_FRAME
- pushq $\num-256
+ pushq $~(\num)
CFI_ADJUST_CFA_OFFSET 8
interrupt \func
jmp ret_from_intr
@@ -605,17 +668,21 @@ retint_kernel:
ENTRY(thermal_interrupt)
apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt
+END(thermal_interrupt)
ENTRY(threshold_interrupt)
apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt
+END(threshold_interrupt)
#ifdef CONFIG_SMP
ENTRY(reschedule_interrupt)
apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
+END(reschedule_interrupt)
.macro INVALIDATE_ENTRY num
ENTRY(invalidate_interrupt\num)
apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
+END(invalidate_interrupt\num)
.endm
INVALIDATE_ENTRY 0
@@ -629,18 +696,20 @@ ENTRY(invalidate_interrupt\num)
ENTRY(call_function_interrupt)
apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
+END(call_function_interrupt)
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
ENTRY(apic_timer_interrupt)
apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt
+END(apic_timer_interrupt)
ENTRY(error_interrupt)
apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt
+END(error_interrupt)
ENTRY(spurious_interrupt)
apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt
-#endif
+END(spurious_interrupt)
/*
* Exception entry points.
@@ -667,7 +736,7 @@ ENTRY(spurious_interrupt)
/* error code is on the stack already */
/* handle NMI like exceptions that can happen everywhere */
- .macro paranoidentry sym, ist=0
+ .macro paranoidentry sym, ist=0, irqtrace=1
SAVE_ALL
cld
movl $1,%ebx
@@ -692,13 +761,80 @@ ENTRY(spurious_interrupt)
addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
.endif
cli
+ .if \irqtrace
+ TRACE_IRQS_OFF
+ .endif
.endm
-
+
+ /*
+ * "Paranoid" exit path from exception stack.
+ * Paranoid because this is used by NMIs and cannot take
+ * any kernel state for granted.
+ * We don't do kernel preemption checks here, because only
+ * NMI should be common and it does not enable IRQs and
+ * cannot get reschedule ticks.
+ *
+ * "trace" is 0 for the NMI handler only, because irq-tracing
+ * is fundamentally NMI-unsafe. (we cannot change the soft and
+ * hard flags at once, atomically)
+ */
+ .macro paranoidexit trace=1
+ /* ebx: no swapgs flag */
+paranoid_exit\trace:
+ testl %ebx,%ebx /* swapgs needed? */
+ jnz paranoid_restore\trace
+ testl $3,CS(%rsp)
+ jnz paranoid_userspace\trace
+paranoid_swapgs\trace:
+ .if \trace
+ TRACE_IRQS_IRETQ 0
+ .endif
+ swapgs
+paranoid_restore\trace:
+ RESTORE_ALL 8
+ iretq
+paranoid_userspace\trace:
+ GET_THREAD_INFO(%rcx)
+ movl threadinfo_flags(%rcx),%ebx
+ andl $_TIF_WORK_MASK,%ebx
+ jz paranoid_swapgs\trace
+ movq %rsp,%rdi /* &pt_regs */
+ call sync_regs
+ movq %rax,%rsp /* switch stack for scheduling */
+ testl $_TIF_NEED_RESCHED,%ebx
+ jnz paranoid_schedule\trace
+ movl %ebx,%edx /* arg3: thread flags */
+ .if \trace
+ TRACE_IRQS_ON
+ .endif
+ sti
+ xorl %esi,%esi /* arg2: oldset */
+ movq %rsp,%rdi /* arg1: &pt_regs */
+ call do_notify_resume
+ cli
+ .if \trace
+ TRACE_IRQS_OFF
+ .endif
+ jmp paranoid_userspace\trace
+paranoid_schedule\trace:
+ .if \trace
+ TRACE_IRQS_ON
+ .endif
+ sti
+ call schedule
+ cli
+ .if \trace
+ TRACE_IRQS_OFF
+ .endif
+ jmp paranoid_userspace\trace
+ CFI_ENDPROC
+ .endm
+
/*
* Exception entry point. This expects an error code/orig_rax on the stack
* and the exception handler in %rax.
*/
-ENTRY(error_entry)
+KPROBE_ENTRY(error_entry)
_frame RDI
/* rdi slot contains rax, oldrax contains error code */
cld
@@ -749,6 +885,7 @@ error_exit:
movl %ebx,%eax
RESTORE_REST
cli
+ TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
testl %eax,%eax
jne retint_kernel
@@ -756,6 +893,10 @@ error_exit:
movl $_TIF_WORK_MASK,%edi
andl %edi,%edx
jnz retint_careful
+ /*
+ * The iret might restore flags:
+ */
+ TRACE_IRQS_IRETQ
swapgs
RESTORE_ARGS 0,8,0
jmp iret_label
@@ -777,6 +918,7 @@ error_kernelspace:
cmpq $gs_change,RIP(%rsp)
je error_swapgs
jmp error_sti
+KPROBE_END(error_entry)
/* Reload gs selector with exception handling */
/* edi: new selector */
@@ -794,6 +936,7 @@ gs_change:
CFI_ADJUST_CFA_OFFSET -8
ret
CFI_ENDPROC
+ENDPROC(load_gs_index)
.section __ex_table,"a"
.align 8
@@ -847,9 +990,11 @@ ENTRY(kernel_thread)
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
-
+ENDPROC(kernel_thread)
child_rip:
+ pushq $0 # fake return address
+ CFI_STARTPROC
/*
* Here we are in the child and the registers are set as they were
* at kernel_thread() invocation in the parent.
@@ -860,6 +1005,8 @@ child_rip:
# exit
xorl %edi, %edi
call do_exit
+ CFI_ENDPROC
+ENDPROC(child_rip)
/*
* execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -889,19 +1036,23 @@ ENTRY(execve)
UNFAKE_STACK_FRAME
ret
CFI_ENDPROC
+ENDPROC(execve)
KPROBE_ENTRY(page_fault)
errorentry do_page_fault
- .previous .text
+KPROBE_END(page_fault)
ENTRY(coprocessor_error)
zeroentry do_coprocessor_error
+END(coprocessor_error)
ENTRY(simd_coprocessor_error)
zeroentry do_simd_coprocessor_error
+END(simd_coprocessor_error)
ENTRY(device_not_available)
zeroentry math_state_restore
+END(device_not_available)
/* runs on exception stack */
KPROBE_ENTRY(debug)
@@ -909,116 +1060,91 @@ KPROBE_ENTRY(debug)
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_debug, DEBUG_STACK
- jmp paranoid_exit
- CFI_ENDPROC
- .previous .text
+ paranoidexit
+KPROBE_END(debug)
/* runs on exception stack */
KPROBE_ENTRY(nmi)
INTR_FRAME
pushq $-1
CFI_ADJUST_CFA_OFFSET 8
- paranoidentry do_nmi
- /*
- * "Paranoid" exit path from exception stack.
- * Paranoid because this is used by NMIs and cannot take
- * any kernel state for granted.
- * We don't do kernel preemption checks here, because only
- * NMI should be common and it does not enable IRQs and
- * cannot get reschedule ticks.
- */
- /* ebx: no swapgs flag */
-paranoid_exit:
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore
- testl $3,CS(%rsp)
- jnz paranoid_userspace
-paranoid_swapgs:
- swapgs
-paranoid_restore:
- RESTORE_ALL 8
- iretq
-paranoid_userspace:
- GET_THREAD_INFO(%rcx)
- movl threadinfo_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule
- movl %ebx,%edx /* arg3: thread flags */
- sti
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- cli
- jmp paranoid_userspace
-paranoid_schedule:
- sti
- call schedule
- cli
- jmp paranoid_userspace
- CFI_ENDPROC
- .previous .text
+ paranoidentry do_nmi, 0, 0
+#ifdef CONFIG_TRACE_IRQFLAGS
+ paranoidexit 0
+#else
+ jmp paranoid_exit1
+ CFI_ENDPROC
+#endif
+KPROBE_END(nmi)
KPROBE_ENTRY(int3)
INTR_FRAME
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_int3, DEBUG_STACK
- jmp paranoid_exit
+ jmp paranoid_exit1
CFI_ENDPROC
- .previous .text
+KPROBE_END(int3)
ENTRY(overflow)
zeroentry do_overflow
+END(overflow)
ENTRY(bounds)
zeroentry do_bounds
+END(bounds)
ENTRY(invalid_op)
zeroentry do_invalid_op
+END(invalid_op)
ENTRY(coprocessor_segment_overrun)
zeroentry do_coprocessor_segment_overrun
+END(coprocessor_segment_overrun)
ENTRY(reserved)
zeroentry do_reserved
+END(reserved)
/* runs on exception stack */
ENTRY(double_fault)
XCPT_FRAME
paranoidentry do_double_fault
- jmp paranoid_exit
+ jmp paranoid_exit1
CFI_ENDPROC
+END(double_fault)
ENTRY(invalid_TSS)
errorentry do_invalid_TSS
+END(invalid_TSS)
ENTRY(segment_not_present)
errorentry do_segment_not_present
+END(segment_not_present)
/* runs on exception stack */
ENTRY(stack_segment)
XCPT_FRAME
paranoidentry do_stack_segment
- jmp paranoid_exit
+ jmp paranoid_exit1
CFI_ENDPROC
+END(stack_segment)
KPROBE_ENTRY(general_protection)
errorentry do_general_protection
- .previous .text
+KPROBE_END(general_protection)
ENTRY(alignment_check)
errorentry do_alignment_check
+END(alignment_check)
ENTRY(divide_error)
zeroentry do_divide_error
+END(divide_error)
ENTRY(spurious_interrupt_bug)
zeroentry do_spurious_interrupt_bug
+END(spurious_interrupt_bug)
#ifdef CONFIG_X86_MCE
/* runs on exception stack */
@@ -1027,22 +1153,60 @@ ENTRY(machine_check)
pushq $0
CFI_ADJUST_CFA_OFFSET 8
paranoidentry do_machine_check
- jmp paranoid_exit
+ jmp paranoid_exit1
CFI_ENDPROC
+END(machine_check)
#endif
+/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
CFI_STARTPROC
- movq %gs:pda_irqstackptr,%rax
- movq %rsp,%rdx
- CFI_DEF_CFA_REGISTER rdx
+ push %rbp
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbp,0
+ mov %rsp,%rbp
+ CFI_DEF_CFA_REGISTER rbp
incl %gs:pda_irqcount
- cmove %rax,%rsp
- pushq %rdx
- /*todo CFI_DEF_CFA_EXPRESSION ...*/
+ cmove %gs:pda_irqstackptr,%rsp
+ push %rbp # backlink for old unwinder
call __do_softirq
- popq %rsp
+ leaveq
CFI_DEF_CFA_REGISTER rsp
+ CFI_ADJUST_CFA_OFFSET -8
decl %gs:pda_irqcount
ret
CFI_ENDPROC
+ENDPROC(call_softirq)
+
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+ CFI_STARTPROC
+ movq %r15, R15(%rdi)
+ movq %r14, R14(%rdi)
+ xchgq %rsi, %rdx
+ movq %r13, R13(%rdi)
+ movq %r12, R12(%rdi)
+ xorl %eax, %eax
+ movq %rbp, RBP(%rdi)
+ movq %rbx, RBX(%rdi)
+ movq (%rsp), %rcx
+ movq %rax, R11(%rdi)
+ movq %rax, R10(%rdi)
+ movq %rax, R9(%rdi)
+ movq %rax, R8(%rdi)
+ movq %rax, RAX(%rdi)
+ movq %rax, RCX(%rdi)
+ movq %rax, RDX(%rdi)
+ movq %rax, RSI(%rdi)
+ movq %rax, RDI(%rdi)
+ movq %rax, ORIG_RAX(%rdi)
+ movq %rcx, RIP(%rdi)
+ leaq 8(%rsp), %rcx
+ movq $__KERNEL_CS, CS(%rdi)
+ movq %rax, EFLAGS(%rdi)
+ movq %rcx, RSP(%rdi)
+ movq $__KERNEL_DS, SS(%rdi)
+ jmpq *%rdx
+ CFI_ENDPROC
+ENDPROC(arch_unwind_init_running)
+#endif
diff --git a/arch/x86_64/kernel/functionlist b/arch/x86_64/kernel/functionlist
index 2bcebdc3eedb..01fa23580c85 100644
--- a/arch/x86_64/kernel/functionlist
+++ b/arch/x86_64/kernel/functionlist
@@ -384,7 +384,6 @@
*(.text.__end_that_request_first)
*(.text.wake_up_bit)
*(.text.unuse_mm)
-*(.text.skb_release_data)
*(.text.shrink_icache_memory)
*(.text.sched_balance_self)
*(.text.__pmd_alloc)
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 7a64ea181788..8e78a75d1866 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -8,7 +8,6 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/config.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
index 43fcf62fef0f..cdb90e671b88 100644
--- a/arch/x86_64/kernel/genapic_cluster.c
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -9,7 +9,6 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/config.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
@@ -119,7 +118,6 @@ struct genapic apic_cluster = {
.name = "clustered",
.int_delivery_mode = dest_Fixed,
.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
.target_cpus = cluster_target_cpus,
.apic_id_registered = cluster_apic_id_registered,
.init_apic_ldr = cluster_init_apic_ldr,
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index 1a2ab825be98..50ad153eaac4 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -8,7 +8,6 @@
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
*/
-#include <linux/config.h>
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
@@ -50,8 +49,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
unsigned long cfg;
unsigned long flags;
- local_save_flags(flags);
- local_irq_disable();
+ local_irq_save(flags);
/*
* Wait for idle.
@@ -78,22 +76,29 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
static void flat_send_IPI_allbutself(int vector)
{
-#ifndef CONFIG_HOTPLUG_CPU
- if (((num_online_cpus()) - 1) >= 1)
- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
+#ifdef CONFIG_HOTPLUG_CPU
+ int hotplug = 1;
#else
- cpumask_t allbutme = cpu_online_map;
+ int hotplug = 0;
+#endif
+ if (hotplug || vector == NMI_VECTOR) {
+ cpumask_t allbutme = cpu_online_map;
- cpu_clear(smp_processor_id(), allbutme);
+ cpu_clear(smp_processor_id(), allbutme);
- if (!cpus_empty(allbutme))
- flat_send_IPI_mask(allbutme, vector);
-#endif
+ if (!cpus_empty(allbutme))
+ flat_send_IPI_mask(allbutme, vector);
+ } else if (num_online_cpus() > 1) {
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
+ }
}
static void flat_send_IPI_all(int vector)
{
- __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+ if (vector == NMI_VECTOR)
+ flat_send_IPI_mask(cpu_online_map, vector);
+ else
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
}
static int flat_apic_id_registered(void)
@@ -108,17 +113,13 @@ static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
static unsigned int phys_pkg_id(int index_msb)
{
- u32 ebx;
-
- ebx = cpuid_ebx(1);
- return ((ebx >> 24) & 0xFF) >> index_msb;
+ return hard_smp_processor_id() >> index_msb;
}
struct genapic apic_flat = {
.name = "flat",
.int_delivery_mode = dest_LowestPrio,
.int_dest_mode = (APIC_DEST_LOGICAL != 0),
- .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST,
.target_cpus = flat_target_cpus,
.apic_id_registered = flat_apic_id_registered,
.init_apic_ldr = flat_init_apic_ldr,
@@ -177,7 +178,6 @@ struct genapic apic_physflat = {
.name = "physical flat",
.int_delivery_mode = dest_Fixed,
.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
- .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
.target_cpus = physflat_target_cpus,
.apic_id_registered = flat_apic_id_registered,
.init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 6df05e6034fa..1e6f80870679 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -5,8 +5,6 @@
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
* Copyright (C) 2000 Karsten Keil <kkeil@suse.de>
* Copyright (C) 2001,2002 Andi Kleen <ak@suse.de>
- *
- * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
*/
@@ -187,11 +185,15 @@ startup_64:
/* Finally jump to run C code and to be on real kernel address
* Since we are running on identity-mapped space we have to jump
- * to the full 64bit address , this is only possible as indirect
- * jump
+ * to the full 64bit address, this is only possible as indirect
+ * jump. In addition we need to ensure %cs is set so we make this
+ * a far return.
*/
movq initial_code(%rip),%rax
- jmp *%rax
+ pushq $0 # fake return address to stop unwinder
+ pushq $__KERNEL_CS # set correct cs
+ pushq %rax # target address in negative space
+ lretq
/* SMP bootup changes these two */
.align 8
@@ -370,7 +372,7 @@ ENTRY(cpu_gdt_table)
.quad 0,0 /* TSS */
.quad 0,0 /* LDT */
.quad 0,0,0 /* three TLS descriptors */
- .quad 0 /* unused */
+ .quad 0x0000f40000000000 /* node/CPU stored in limit */
gdt_end:
/* asm/segment.h:GDT_ENTRIES must match this */
/* This should be a multiple of the cache line size */
diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index cea20a66c150..9561eb3c5b5c 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -2,8 +2,6 @@
* linux/arch/x86_64/kernel/head64.c -- prepare to run common code
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
- *
- * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $
*/
#include <linux/init.h>
@@ -47,38 +45,16 @@ static void __init copy_bootdata(char *real_mode_data)
new_data = *(int *) (x86_boot_params + NEW_CL_POINTER);
if (!new_data) {
if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) {
- printk("so old bootloader that it does not support commandline?!\n");
return;
}
new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET;
- printk("old bootloader convention, maybe loadlin?\n");
}
command_line = (char *) ((u64)(new_data));
memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE);
- printk("Bootdata ok (command line is %s)\n", saved_command_line);
-}
-
-static void __init setup_boot_cpu_data(void)
-{
- unsigned int dummy, eax;
-
- /* get vendor info */
- cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level,
- (unsigned int *)&boot_cpu_data.x86_vendor_id[0],
- (unsigned int *)&boot_cpu_data.x86_vendor_id[8],
- (unsigned int *)&boot_cpu_data.x86_vendor_id[4]);
-
- /* get cpu type */
- cpuid(1, &eax, &dummy, &dummy,
- (unsigned int *) &boot_cpu_data.x86_capability);
- boot_cpu_data.x86 = (eax >> 8) & 0xf;
- boot_cpu_data.x86_model = (eax >> 4) & 0xf;
- boot_cpu_data.x86_mask = eax & 0xf;
}
void __init x86_64_start_kernel(char * real_mode_data)
{
- char *s;
int i;
for (i = 0; i < 256; i++)
@@ -86,6 +62,8 @@ void __init x86_64_start_kernel(char * real_mode_data)
asm volatile("lidt %0" :: "m" (idt_descr));
clear_bss();
+ early_printk("Kernel alive\n");
+
/*
* switch to init_level4_pgt from boot_level4_pgt
*/
@@ -100,22 +78,5 @@ void __init x86_64_start_kernel(char * real_mode_data)
#ifdef CONFIG_SMP
cpu_set(0, cpu_online_map);
#endif
- s = strstr(saved_command_line, "earlyprintk=");
- if (s != NULL)
- setup_early_printk(strchr(s, '=') + 1);
-#ifdef CONFIG_NUMA
- s = strstr(saved_command_line, "numa=");
- if (s != NULL)
- numa_setup(s+5);
-#endif
-#ifdef CONFIG_X86_IO_APIC
- if (strstr(saved_command_line, "disableapic"))
- disable_apic = 1;
-#endif
- /* You need early console to see that */
- if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE)
- panic("Kernel too big for kernel mapping\n");
-
- setup_boot_cpu_data();
start_kernel();
}
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index 44ddb1ec808d..3aa1e9bb781d 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -14,7 +14,6 @@
* the 64bit user space sees a FXSAVE frame directly.
*/
-#include <linux/config.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <asm/processor.h>
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 5ecd34ab8c2b..2dd51f364ea2 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -1,5 +1,4 @@
#include <linux/linkage.h>
-#include <linux/config.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
@@ -44,11 +43,11 @@
BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-#define BUILD_14_IRQS(x) \
+#define BUILD_15_IRQS(x) \
BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
- BI(x,c) BI(x,d)
+ BI(x,c) BI(x,d) BI(x,e)
/*
* ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -56,7 +55,6 @@
*/
BUILD_16_IRQS(0x0)
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* The IO-APIC gives us many more interrupt sources. Most of these
* are unused but an SMP system is supposed to have enough memory ...
@@ -73,13 +71,11 @@ BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
#ifdef CONFIG_PCI_MSI
- BUILD_14_IRQS(0xe)
-#endif
-
+ BUILD_15_IRQS(0xe)
#endif
#undef BUILD_16_IRQS
-#undef BUILD_14_IRQS
+#undef BUILD_15_IRQS
#undef BI
@@ -92,26 +88,24 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)
IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-#define IRQLIST_14(x) \
+#define IRQLIST_15(x) \
IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
- IRQ(x,c), IRQ(x,d)
+ IRQ(x,c), IRQ(x,d), IRQ(x,e)
void (*interrupt[NR_IRQS])(void) = {
IRQLIST_16(0x0),
-#ifdef CONFIG_X86_IO_APIC
IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3),
IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
IRQLIST_16(0xc), IRQLIST_16(0xd)
#ifdef CONFIG_PCI_MSI
- , IRQLIST_14(0xe)
+ , IRQLIST_15(0xe)
#endif
-#endif
};
#undef IRQ
@@ -129,6 +123,8 @@ void (*interrupt[NR_IRQS])(void) = {
DEFINE_SPINLOCK(i8259A_lock);
+static int i8259A_auto_eoi;
+
static void end_8259A_irq (unsigned int irq)
{
if (irq > 256) {
@@ -235,7 +231,7 @@ void make_8259A_irq(unsigned int irq)
{
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
- irq_desc[irq].handler = &i8259A_irq_type;
+ irq_desc[irq].chip = &i8259A_irq_type;
enable_irq(irq);
}
@@ -278,7 +274,7 @@ static void mask_and_ack_8259A(unsigned int irq)
* Lightweight spurious IRQ detection. We do not want
* to overdo spurious IRQ handling - it's usually a sign
* of hardware problems, so we only do the checks we can
- * do without slowing down good hardware unnecesserily.
+ * do without slowing down good hardware unnecessarily.
*
* Note that IRQ7 and IRQ15 (the two spurious IRQs
* usually resulting from the 8259A-1|2 PICs) occur
@@ -342,6 +338,8 @@ void init_8259A(int auto_eoi)
{
unsigned long flags;
+ i8259A_auto_eoi = auto_eoi;
+
spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, 0x21); /* mask all of 8259A-1 */
@@ -400,7 +398,7 @@ static void save_ELCR(char *trigger)
static int i8259A_resume(struct sys_device *dev)
{
- init_8259A(0);
+ init_8259A(i8259A_auto_eoi);
restore_ELCR(irq_trigger);
return 0;
}
@@ -454,9 +452,7 @@ void __init init_ISA_irqs (void)
{
int i;
-#ifdef CONFIG_X86_LOCAL_APIC
init_bsp_APIC();
-#endif
init_8259A(0);
for (i = 0; i < NR_IRQS; i++) {
@@ -468,12 +464,12 @@ void __init init_ISA_irqs (void)
/*
* 16 old-style INTA-cycle interrupts:
*/
- irq_desc[i].handler = &i8259A_irq_type;
+ irq_desc[i].chip = &i8259A_irq_type;
} else {
/*
* 'high' PCI IRQs filled in on demand
*/
- irq_desc[i].handler = &no_irq_type;
+ irq_desc[i].chip = &no_irq_type;
}
}
}
@@ -582,14 +578,12 @@ void __init init_IRQ(void)
set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-#ifdef CONFIG_X86_LOCAL_APIC
/* self generated IPI for local APIC timer */
set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
/* IPI vectors for APIC spurious and error interrupts */
set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-#endif
/*
* Set the clock to HZ Hz, we already have a valid
diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
index ce31d904d601..3dc5854ba21e 100644
--- a/arch/x86_64/kernel/init_task.c
+++ b/arch/x86_64/kernel/init_task.c
@@ -46,4 +46,9 @@ EXPORT_SYMBOL(init_task);
*/
DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS;
+/* Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
#define ALIGN_TO_4K __attribute__((section(".data.init_task")))
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 9cc7031b7151..0491019d4c8d 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -25,7 +25,6 @@
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>
-#include <linux/config.h>
#include <linux/smp_lock.h>
#include <linux/mc146818rtc.h>
#include <linux/acpi.h>
@@ -41,6 +40,7 @@
#include <asm/mach_apic.h>
#include <asm/acpi.h>
#include <asm/dma.h>
+#include <asm/nmi.h>
#define __apicdebuginit __init
@@ -48,7 +48,7 @@ int sis_apic_bug; /* not actually supported, dummy for compile */
static int no_timer_check;
-int disable_timer_pin_1 __initdata;
+static int disable_timer_pin_1 __initdata;
int timer_over_8254 __initdata = 0;
@@ -56,6 +56,7 @@ int timer_over_8254 __initdata = 0;
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
static DEFINE_SPINLOCK(ioapic_lock);
+static DEFINE_SPINLOCK(vector_lock);
/*
* # of IRQ routing registers
@@ -110,6 +111,33 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
FINAL; \
}
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return eu.entry;
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+ union entry_union eu;
+ eu.entry = e;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
#ifdef CONFIG_SMP
static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
{
@@ -195,13 +223,9 @@ static void unmask_IO_APIC_irq (unsigned int irq)
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
- unsigned long flags;
/* Check delivery_mode to be sure we're not clearing an SMI pin */
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry = ioapic_read_entry(apic, pin);
if (entry.delivery_mode == dest_SMI)
return;
/*
@@ -209,10 +233,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
*/
memset(&entry, 0, sizeof(entry));
entry.mask = 1;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry);
}
static void clear_IO_APIC (void)
@@ -224,14 +245,6 @@ static void clear_IO_APIC (void)
clear_IO_APIC_pin(apic, pin);
}
-/*
- * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
- * specific CPU-side IRQs.
- */
-
-#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
int skip_ioapic_setup;
int ioapic_force;
@@ -240,18 +253,17 @@ int ioapic_force;
static int __init disable_ioapic_setup(char *str)
{
skip_ioapic_setup = 1;
- return 1;
+ return 0;
}
+early_param("noapic", disable_ioapic_setup);
-static int __init enable_ioapic_setup(char *str)
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
{
- ioapic_force = 1;
- skip_ioapic_setup = 0;
+ disable_timer_pin_1 = 1;
return 1;
}
-
-__setup("noapic", disable_ioapic_setup);
-__setup("apic", enable_ioapic_setup);
+__setup("disable_timer_pin_1", disable_timer_pin_setup);
static int __init setup_disable_8254_timer(char *s)
{
@@ -267,135 +279,6 @@ static int __init setup_enable_8254_timer(char *s)
__setup("disable_8254_timer", setup_disable_8254_timer);
__setup("enable_8254_timer", setup_enable_8254_timer);
-#include <asm/pci-direct.h>
-#include <linux/pci_ids.h>
-#include <linux/pci.h>
-
-
-#ifdef CONFIG_ACPI
-
-static int nvidia_hpet_detected __initdata;
-
-static int __init nvidia_hpet_check(unsigned long phys, unsigned long size)
-{
- nvidia_hpet_detected = 1;
- return 0;
-}
-#endif
-
-/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC
- off. Check for an Nvidia or VIA PCI bridge and turn it off.
- Use pci direct infrastructure because this runs before the PCI subsystem.
-
- Can be overwritten with "apic"
-
- And another hack to disable the IOMMU on VIA chipsets.
-
- ... and others. Really should move this somewhere else.
-
- Kludge-O-Rama. */
-void __init check_ioapic(void)
-{
- int num,slot,func;
- /* Poor man's PCI discovery */
- for (num = 0; num < 32; num++) {
- for (slot = 0; slot < 32; slot++) {
- for (func = 0; func < 8; func++) {
- u32 class;
- u32 vendor;
- u8 type;
- class = read_pci_config(num,slot,func,
- PCI_CLASS_REVISION);
- if (class == 0xffffffff)
- break;
-
- if ((class >> 16) != PCI_CLASS_BRIDGE_PCI)
- continue;
-
- vendor = read_pci_config(num, slot, func,
- PCI_VENDOR_ID);
- vendor &= 0xffff;
- switch (vendor) {
- case PCI_VENDOR_ID_VIA:
-#ifdef CONFIG_GART_IOMMU
- if ((end_pfn > MAX_DMA32_PFN ||
- force_iommu) &&
- !iommu_aperture_allowed) {
- printk(KERN_INFO
- "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n");
- iommu_aperture_disabled = 1;
- }
-#endif
- return;
- case PCI_VENDOR_ID_NVIDIA:
-#ifdef CONFIG_ACPI
- /*
- * All timer overrides on Nvidia are
- * wrong unless HPET is enabled.
- */
- nvidia_hpet_detected = 0;
- acpi_table_parse(ACPI_HPET,
- nvidia_hpet_check);
- if (nvidia_hpet_detected == 0) {
- acpi_skip_timer_override = 1;
- printk(KERN_INFO "Nvidia board "
- "detected. Ignoring ACPI "
- "timer override.\n");
- }
-#endif
- /* RED-PEN skip them on mptables too? */
- return;
-
- /* This should be actually default, but
- for 2.6.16 let's do it for ATI only where
- it's really needed. */
- case PCI_VENDOR_ID_ATI:
- if (timer_over_8254 == 1) {
- timer_over_8254 = 0;
- printk(KERN_INFO
- "ATI board detected. Disabling timer routing over 8254.\n");
- }
- return;
- }
-
-
- /* No multi-function device? */
- type = read_pci_config_byte(num,slot,func,
- PCI_HEADER_TYPE);
- if (!(type & 0x80))
- break;
- }
- }
- }
-}
-
-static int __init ioapic_pirq_setup(char *str)
-{
- int i, max;
- int ints[MAX_PIRQS+1];
-
- get_options(str, ARRAY_SIZE(ints), ints);
-
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
-
- pirqs_enabled = 1;
- apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n");
- max = MAX_PIRQS;
- if (ints[0] < MAX_PIRQS)
- max = ints[0];
-
- for (i = 0; i < max; i++) {
- apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
- /*
- * PIRQs are mapped upside down, usually.
- */
- pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
- }
- return 1;
-}
-
-__setup("pirq=", ioapic_pirq_setup);
/*
* Find the IRQ entry number of a certain pin.
@@ -424,9 +307,7 @@ static int __init find_isa_irq_pin(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mpc_srcbus;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
+ if (test_bit(lbus, mp_bus_not_pci) &&
(mp_irqs[i].mpc_irqtype == type) &&
(mp_irqs[i].mpc_srcbusirq == irq))
@@ -442,9 +323,7 @@ static int __init find_isa_irq_apic(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mpc_srcbus;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_MCA) &&
+ if (test_bit(lbus, mp_bus_not_pci) &&
(mp_irqs[i].mpc_irqtype == type) &&
(mp_irqs[i].mpc_srcbusirq == irq))
break;
@@ -484,7 +363,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
break;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
+ if (!test_bit(lbus, mp_bus_not_pci) &&
!mp_irqs[i].mpc_irqtype &&
(bus == lbus) &&
(slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
@@ -507,27 +386,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
return best_guess;
}
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < 16) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
-#define default_EISA_polarity(idx) (0)
-
/* ISA interrupts are always polarity zero edge triggered,
* when listed as conforming in the MP table. */
@@ -540,12 +398,6 @@ static int EISA_ELCR(unsigned int irq)
#define default_PCI_trigger(idx) (1)
#define default_PCI_polarity(idx) (1)
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) (0)
-
static int __init MPBIOS_polarity(int idx)
{
int bus = mp_irqs[idx].mpc_srcbus;
@@ -557,38 +409,11 @@ static int __init MPBIOS_polarity(int idx)
switch (mp_irqs[idx].mpc_irqflag & 3)
{
case 0: /* conforms, ie. bus-type dependent polarity */
- {
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- {
- polarity = default_ISA_polarity(idx);
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- polarity = default_EISA_polarity(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- polarity = default_PCI_polarity(idx);
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- polarity = default_MCA_polarity(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- }
+ if (test_bit(bus, mp_bus_not_pci))
+ polarity = default_ISA_polarity(idx);
+ else
+ polarity = default_PCI_polarity(idx);
break;
- }
case 1: /* high active */
{
polarity = 0;
@@ -626,38 +451,11 @@ static int MPBIOS_trigger(int idx)
switch ((mp_irqs[idx].mpc_irqflag>>2) & 3)
{
case 0: /* conforms, ie. bus-type dependent */
- {
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- {
- trigger = default_ISA_trigger(idx);
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- trigger = default_EISA_trigger(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- trigger = default_PCI_trigger(idx);
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- trigger = default_MCA_trigger(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- trigger = 1;
- break;
- }
- }
+ if (test_bit(bus, mp_bus_not_pci))
+ trigger = default_ISA_trigger(idx);
+ else
+ trigger = default_PCI_trigger(idx);
break;
- }
case 1: /* edge */
{
trigger = 0;
@@ -763,49 +561,17 @@ static int pin_2_irq(int idx, int apic, int pin)
if (mp_irqs[idx].mpc_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- case MP_BUS_EISA:
- case MP_BUS_MCA:
- {
- irq = mp_irqs[idx].mpc_srcbusirq;
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /*
- * PCI IRQs are mapped in order
- */
- i = irq = 0;
- while (i < apic)
- irq += nr_ioapic_registers[i++];
- irq += pin;
- irq = gsi_irq_sharing(irq);
- break;
- }
- default:
- {
- printk(KERN_ERR "unknown bus type %d.\n",bus);
- irq = 0;
- break;
- }
- }
- BUG_ON(irq >= NR_IRQS);
-
- /*
- * PCI IRQ command line redirection. Yes, limits are hardcoded.
- */
- if ((pin >= 16) && (pin <= 23)) {
- if (pirq_entries[pin-16] != -1) {
- if (!pirq_entries[pin-16]) {
- apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16);
- } else {
- irq = pirq_entries[pin-16];
- apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n",
- pin-16, irq);
- }
- }
+ if (test_bit(bus, mp_bus_not_pci)) {
+ irq = mp_irqs[idx].mpc_srcbusirq;
+ } else {
+ /*
+ * PCI IRQs are mapped in order
+ */
+ i = irq = 0;
+ while (i < apic)
+ irq += nr_ioapic_registers[i++];
+ irq += pin;
+ irq = gsi_irq_sharing(irq);
}
BUG_ON(irq >= NR_IRQS);
return irq;
@@ -834,10 +600,17 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
int assign_irq_vector(int irq)
{
static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
+ unsigned long flags;
+ int vector;
BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
- if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
+
+ spin_lock_irqsave(&vector_lock, flags);
+
+ if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
+ spin_unlock_irqrestore(&vector_lock, flags);
return IO_APIC_VECTOR(irq);
+ }
next:
current_vector += 8;
if (current_vector == IA32_SYSCALL_VECTOR)
@@ -849,11 +622,14 @@ next:
current_vector = FIRST_DEVICE_VECTOR + offset;
}
- vector_irq[current_vector] = irq;
+ vector = current_vector;
+ vector_irq[vector] = irq;
if (irq != AUTO_ASSIGN)
- IO_APIC_VECTOR(irq) = current_vector;
+ IO_APIC_VECTOR(irq) = vector;
+
+ spin_unlock_irqrestore(&vector_lock, flags);
- return current_vector;
+ return vector;
}
extern void (*interrupt[NR_IRQS])(void);
@@ -864,23 +640,18 @@ static struct hw_interrupt_type ioapic_edge_type;
#define IOAPIC_EDGE 0
#define IOAPIC_LEVEL 1
-static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
{
- if (use_pci_vector() && !platform_legacy_irq(irq)) {
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL)
- irq_desc[vector].handler = &ioapic_level_type;
- else
- irq_desc[vector].handler = &ioapic_edge_type;
- set_intr_gate(vector, interrupt[vector]);
- } else {
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL)
- irq_desc[irq].handler = &ioapic_level_type;
- else
- irq_desc[irq].handler = &ioapic_edge_type;
- set_intr_gate(vector, interrupt[irq]);
- }
+ unsigned idx;
+
+ idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
+
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
+ irq_desc[idx].chip = &ioapic_level_type;
+ else
+ irq_desc[idx].chip = &ioapic_edge_type;
+ set_intr_gate(vector, interrupt[idx]);
}
static void __init setup_IO_APIC_irqs(void)
@@ -937,9 +708,9 @@ static void __init setup_IO_APIC_irqs(void)
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
+ ioapic_write_entry(apic, pin, entry);
+
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
set_native_irq_info(irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -981,7 +752,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
* The timer IRQ doesn't have to know that behind the
* scene we have a 8259A-master in AEOI mode ...
*/
- irq_desc[0].handler = &ioapic_edge_type;
+ irq_desc[0].chip = &ioapic_edge_type;
/*
* Add it to the IO-APIC irq-routing table:
@@ -1077,10 +848,7 @@ void __apicdebuginit print_IO_APIC(void)
for (i = 0; i <= reg_01.bits.entries; i++) {
struct IO_APIC_route_entry entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
- *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry = ioapic_read_entry(apic, i);
printk(KERN_DEBUG " %02x %03X %02X ",
i,
@@ -1275,9 +1043,6 @@ static void __init enable_IO_APIC(void)
irq_2_pin[i].pin = -1;
irq_2_pin[i].next = 0;
}
- if (!pirqs_enabled)
- for (i = 0; i < MAX_PIRQS; i++)
- pirq_entries[i] = -1;
/*
* The number of IO-APIC IRQ registers (== #pins):
@@ -1293,11 +1058,7 @@ static void __init enable_IO_APIC(void)
/* See if any of the pins is in ExtINT mode */
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
struct IO_APIC_route_entry entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
+ entry = ioapic_read_entry(apic, pin);
/* If the interrupt line is enabled and in ExtInt mode
* I have found the pin where the i8259 is connected.
@@ -1349,7 +1110,6 @@ void disable_IO_APIC(void)
*/
if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
- unsigned long flags;
memset(&entry, 0, sizeof(entry));
entry.mask = 0; /* Enabled */
@@ -1366,84 +1126,13 @@ void disable_IO_APIC(void)
/*
* Add it to the IO-APIC irq-routing table:
*/
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
- *(((int *)&entry)+1));
- io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
- *(((int *)&entry)+0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
disconnect_bsp_APIC(ioapic_i8259.pin != -1);
}
/*
- * function to set the IO-APIC physical IDs based on the
- * values stored in the MPC table.
- *
- * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
- */
-
-static void __init setup_ioapic_ids_from_mpc (void)
-{
- union IO_APIC_reg_00 reg_00;
- int apic;
- int i;
- unsigned char old_id;
- unsigned long flags;
-
- /*
- * Set the IOAPIC ID to the value stored in the MPC table.
- */
- for (apic = 0; apic < nr_ioapics; apic++) {
-
- /* Read the register 0 value */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- old_id = mp_ioapics[apic].mpc_apicid;
-
-
- printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
-
-
- /*
- * We need to adjust the IRQ routing table
- * if the ID changed.
- */
- if (old_id != mp_ioapics[apic].mpc_apicid)
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].mpc_dstapic == old_id)
- mp_irqs[i].mpc_dstapic
- = mp_ioapics[apic].mpc_apicid;
-
- /*
- * Read the right value from the MPC table and
- * write it into the ID register.
- */
- apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
- mp_ioapics[apic].mpc_apicid);
-
- reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0, reg_00.raw);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- /*
- * Sanity check
- */
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_00.raw = io_apic_read(apic, 0);
- spin_unlock_irqrestore(&ioapic_lock, flags);
- if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
- printk("could not set ID!\n");
- else
- apic_printk(APIC_VERBOSE," ok.\n");
- }
-}
-
-/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
*
@@ -1611,6 +1300,13 @@ static void set_ioapic_affinity_vector (unsigned int vector,
#endif // CONFIG_SMP
#endif // CONFIG_PCI_MSI
+/*
+ * Retrigger callback shared by the edge and level IO-APIC irq types:
+ * re-raise @irq by sending this CPU an IPI on the vector currently
+ * assigned to it.  Always returns 1.
+ */
+static int ioapic_retrigger(unsigned int irq)
+{
+	int vector = IO_APIC_VECTOR(irq);
+
+	send_IPI_self(vector);
+	return 1;
+}
+
/*
* Level and edge triggered IO-APIC interrupts need different handling,
* so we use two separate IRQ descriptors. Edge triggered IRQs can be
@@ -1631,6 +1327,7 @@ static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
#ifdef CONFIG_SMP
.set_affinity = set_ioapic_affinity,
#endif
+ .retrigger = ioapic_retrigger,
};
static struct hw_interrupt_type ioapic_level_type __read_mostly = {
@@ -1644,6 +1341,7 @@ static struct hw_interrupt_type ioapic_level_type __read_mostly = {
#ifdef CONFIG_SMP
.set_affinity = set_ioapic_affinity,
#endif
+ .retrigger = ioapic_retrigger,
};
static inline void init_IO_APIC_traps(void)
@@ -1678,7 +1376,7 @@ static inline void init_IO_APIC_traps(void)
make_8259A_irq(irq);
else
/* Strange. Oh, well.. */
- irq_desc[irq].handler = &no_irq_type;
+ irq_desc[irq].chip = &no_irq_type;
}
}
}
@@ -1895,7 +1593,7 @@ static inline void check_timer(void)
apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
disable_8259A_irq(0);
- irq_desc[0].handler = &lapic_irq_type;
+ irq_desc[0].chip = &lapic_irq_type;
apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
enable_8259A_irq(0);
@@ -1949,11 +1647,6 @@ void __init setup_IO_APIC(void)
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
- /*
- * Set up the IO-APIC IRQ routing table.
- */
- if (!acpi_ioapic)
- setup_ioapic_ids_from_mpc();
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
@@ -1972,17 +1665,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
{
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
- unsigned long flags;
int i;
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
- *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
- *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
+ *entry = ioapic_read_entry(dev->id, i);
return 0;
}
@@ -2004,11 +1692,9 @@ static int ioapic_resume(struct sys_device *dev)
reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
- io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
- io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
- }
spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
+ ioapic_write_entry(dev->id, i, entry[i]);
return 0;
}
@@ -2062,19 +1748,6 @@ device_initcall(ioapic_init_sysfs);
#define IO_APIC_MAX_ID 0xFE
-int __init io_apic_get_version (int ioapic)
-{
- union IO_APIC_reg_01 reg_01;
- unsigned long flags;
-
- spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(ioapic, 1);
- spin_unlock_irqrestore(&ioapic_lock, flags);
-
- return reg_01.bits.version;
-}
-
-
int __init io_apic_get_redir_entries (int ioapic)
{
union IO_APIC_reg_01 reg_01;
@@ -2133,10 +1806,10 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
if (!ioapic && (irq < 16))
disable_8259A_irq(irq);
+ ioapic_write_entry(ioapic, pin, entry);
+
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
- set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
+ set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
return 0;
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index b81614970ecc..fe063d3cfe42 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -56,6 +56,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
+ set_thread_flag(TIF_IO_BITMAP);
}
/*
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index d8bd0b345b1e..b3677e6ccc6e 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -20,10 +20,29 @@
#include <asm/idle.h>
atomic_t irq_err_count;
-#ifdef CONFIG_X86_IO_APIC
-#ifdef APIC_MISMATCH_DEBUG
-atomic_t irq_mis_count;
-#endif
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+/*
+ * Probabilistic stack overflow check.
+ *
+ * Only the process stack is checked, because everything else runs on
+ * the big interrupt stacks.  Checking reliably is too expensive, so the
+ * check is only made when an interrupt arrives.
+ *
+ * A warning is printed when the interrupted stack pointer lies inside
+ * the current thread's stack area but within 128 bytes of the
+ * thread_info structure at its base.  Warnings are rate limited to one
+ * per 60 seconds.
+ */
+static inline void stack_overflow_check(struct pt_regs *regs)
+{
+	static unsigned long warned = -60*HZ;
+	u64 curbase = (u64) current->thread_info;
+
+	/* Not on the current thread's stack at all: nothing to check. */
+	if (regs->rsp < curbase || regs->rsp > curbase + THREAD_SIZE)
+		return;
+
+	/* Still comfortably above thread_info. */
+	if (regs->rsp >= curbase + sizeof(struct thread_info) + 128)
+		return;
+
+	/* Already complained within the last minute. */
+	if (!time_after(jiffies, warned + 60*HZ))
+		return;
+
+	printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
+		current->comm, curbase, regs->rsp);
+	show_stack(NULL,NULL);
+	warned = jiffies;
+}
#endif
/*
@@ -39,7 +58,7 @@ int show_interrupts(struct seq_file *p, void *v)
if (i == 0) {
seq_printf(p, " ");
for_each_online_cpu(j)
- seq_printf(p, "CPU%d ",j);
+ seq_printf(p, "CPU%-8d",j);
seq_putc(p, '\n');
}
@@ -55,7 +74,7 @@ int show_interrupts(struct seq_file *p, void *v)
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
#endif
- seq_printf(p, " %14s", irq_desc[i].handler->typename);
+ seq_printf(p, " %14s", irq_desc[i].chip->typename);
seq_printf(p, " %s", action->name);
for (action=action->next; action; action = action->next)
@@ -68,18 +87,11 @@ skip:
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
seq_putc(p, '\n');
-#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
for_each_online_cpu(j)
seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
seq_putc(p, '\n');
-#endif
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-#ifdef CONFIG_X86_IO_APIC
-#ifdef APIC_MISMATCH_DEBUG
- seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
-#endif
-#endif
}
return 0;
}
@@ -91,12 +103,20 @@ skip:
*/
asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
{
- /* high bits used in ret_from_ code */
- unsigned irq = regs->orig_rax & 0xff;
+ /* high bit used in ret_from_ code */
+ unsigned irq = ~regs->orig_rax;
+
+ if (unlikely(irq >= NR_IRQS)) {
+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
+ __FUNCTION__, irq);
+ BUG();
+ }
exit_idle();
irq_enter();
-
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ stack_overflow_check(regs);
+#endif
__do_IRQ(irq, regs);
irq_exit();
@@ -114,13 +134,13 @@ void fixup_irqs(cpumask_t map)
if (irq == 2)
continue;
- cpus_and(mask, irq_affinity[irq], map);
+ cpus_and(mask, irq_desc[irq].affinity, map);
if (any_online_cpu(mask) == NR_CPUS) {
printk("Breaking affinity for irq %i\n", irq);
mask = map;
}
- if (irq_desc[irq].handler->set_affinity)
- irq_desc[irq].handler->set_affinity(irq, mask);
+ if (irq_desc[irq].chip->set_affinity)
+ irq_desc[irq].chip->set_affinity(irq, mask);
else if (irq_desc[irq].action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
@@ -145,8 +165,10 @@ asmlinkage void do_softirq(void)
local_irq_save(flags);
pending = local_softirq_pending();
/* Switch to interrupt stack */
- if (pending)
+ if (pending) {
call_softirq();
+ WARN_ON_ONCE(softirq_count());
+ }
local_irq_restore(flags);
}
EXPORT_SYMBOL(do_softirq);
diff --git a/arch/x86_64/kernel/k8.c b/arch/x86_64/kernel/k8.c
new file mode 100644
index 000000000000..6416682d33d0
--- /dev/null
+++ b/arch/x86_64/kernel/k8.c
@@ -0,0 +1,118 @@
+/*
+ * Shared support code for AMD K8 northbridges and derivatives.
+ * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
+ */
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <asm/k8.h>
+
+/*
+ * Number of northbridge devices counted by cache_k8_northbridges().
+ * A non-zero value also makes later cache_k8_northbridges() calls
+ * return immediately.
+ */
+int num_k8_northbridges;
+EXPORT_SYMBOL(num_k8_northbridges);
+
+/*
+ * One dword per cached northbridge, read from PCI config offset 0x9c
+ * by cache_k8_northbridges(); k8_flush_garts() writes it back with
+ * bit 0 set.
+ */
+static u32 *flush_words;
+
+/*
+ * PCI IDs treated as K8 northbridges.  The empty entry terminates the
+ * table: early_is_k8_nb() walks it until it finds a zero vendor.
+ */
+struct pci_device_id k8_nb_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
+ {}
+};
+EXPORT_SYMBOL(k8_nb_ids);
+
+/*
+ * NULL-terminated array of the matching devices, allocated and filled
+ * by cache_k8_northbridges().
+ */
+struct pci_dev **k8_northbridges;
+EXPORT_SYMBOL(k8_northbridges);
+
+/*
+ * Return the next PCI device after @dev that matches k8_nb_ids, or NULL
+ * once the device list is exhausted.  Callers in this file pass NULL to
+ * start the walk and feed the previous result back in to continue it.
+ */
+static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+{
+	for (;;) {
+		dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+		if (!dev || pci_match_id(k8_nb_ids, dev))
+			return dev;
+	}
+}
+
+/*
+ * Build the cached tables of K8 northbridges.
+ *
+ * Counts the devices matching k8_nb_ids, allocates k8_northbridges
+ * (NULL terminated) and flush_words, then records each device together
+ * with the dword found at its PCI config offset 0x9c.
+ *
+ * Returns 0 on success, or immediately if northbridges were already
+ * cached, and -ENOMEM if an allocation fails.  On failure the cached
+ * state is reset, so a later call retries instead of reporting success
+ * with unusable tables.
+ */
+int cache_k8_northbridges(void)
+{
+	int i;
+	struct pci_dev *dev;
+
+	if (num_k8_northbridges)
+		return 0;
+
+	/* First pass: count the matching devices. */
+	dev = NULL;
+	while ((dev = next_k8_northbridge(dev)) != NULL)
+		num_k8_northbridges++;
+
+	/* One extra slot for the NULL terminator. */
+	k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
+				  GFP_KERNEL);
+	if (!k8_northbridges)
+		goto err_reset;
+
+	flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
+	if (!flush_words)
+		goto err_free_nb;
+
+	/* Second pass: record each device and its flush word. */
+	dev = NULL;
+	i = 0;
+	while ((dev = next_k8_northbridge(dev)) != NULL) {
+		k8_northbridges[i] = dev;
+		/*
+		 * Use the same index as the device slot.  Bumping i before
+		 * this read would skip flush_words[0] and write one element
+		 * past the end of the array.
+		 */
+		pci_read_config_dword(dev, 0x9c, &flush_words[i]);
+		i++;
+	}
+	k8_northbridges[i] = NULL;
+	return 0;
+
+err_free_nb:
+	kfree(k8_northbridges);
+	k8_northbridges = NULL;
+err_reset:
+	/* Leave no stale count behind for the early-return check above. */
+	num_k8_northbridges = 0;
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+
+/*
+ * Early check whether a raw vendor/device dword names a K8 northbridge.
+ * @device: vendor ID in the low 16 bits, device ID in the high 16 bits.
+ *
+ * Returns 1 if the pair is listed in k8_nb_ids, 0 otherwise.
+ * Ignores subdevice/subvendor but as far as I can figure out
+ * they're useless anyways.
+ */
+int __init early_is_k8_nb(u32 device)
+{
+	const u32 vendor_id = device & 0xffff;
+	const u32 device_id = device >> 16;
+	struct pci_device_id *id = k8_nb_ids;
+
+	/* The table ends at the first entry with a zero vendor. */
+	while (id->vendor) {
+		if (id->vendor == vendor_id && id->device == device_id)
+			return 1;
+		id++;
+	}
+	return 0;
+}
+
+/*
+ * Request a flush on every cached northbridge and wait until each one
+ * has completed it.  Prints a message if no northbridge was flushed.
+ */
+void k8_flush_garts(void)
+{
+	static DEFINE_SPINLOCK(gart_lock);
+	unsigned long flags;
+	int nb, flushed = 0;
+
+	/* Avoid races between AGP and IOMMU. In theory it's not needed
+	   but I'm not sure if the hardware won't lose flush requests
+	   when another is pending. This whole thing is so expensive anyways
+	   that it doesn't matter to serialize more. -AK */
+	spin_lock_irqsave(&gart_lock, flags);
+
+	/* Start the flush: write the saved word back with bit 0 set. */
+	for (nb = 0; nb < num_k8_northbridges; nb++, flushed++)
+		pci_write_config_dword(k8_northbridges[nb], 0x9c,
+				       flush_words[nb] | 1);
+
+	/* Make sure the hardware actually executed the flush:
+	   poll each northbridge until bit 0 reads back clear. */
+	for (nb = 0; nb < num_k8_northbridges; nb++) {
+		u32 w;
+
+		pci_read_config_dword(k8_northbridges[nb], 0x9c, &w);
+		while (w & 1) {
+			cpu_relax();
+			pci_read_config_dword(k8_northbridges[nb], 0x9c, &w);
+		}
+	}
+
+	spin_unlock_irqrestore(&gart_lock, flags);
+
+	if (!flushed)
+		printk("nothing to flush?\n");
+}
+EXPORT_SYMBOL_GPL(k8_flush_garts);
+
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index fa1d19ca700a..ffc73ac72485 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -31,7 +31,6 @@
* Added function return probes functionality
*/
-#include <linux/config.h>
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/string.h>
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 25ac8a3faae6..0497e3bd5bff 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -15,6 +15,15 @@
#include <asm/mmu_context.h>
#include <asm/io.h>
+/*
+ * Statically allocated, page-aligned tables of 512 u64 entries each.
+ * machine_kexec() passes both the physical (__pa) and virtual address
+ * of every one of them to relocate_kernel() through page_list[].
+ */
+#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
+static u64 kexec_pgd[512] PAGE_ALIGNED;
+static u64 kexec_pud0[512] PAGE_ALIGNED;
+static u64 kexec_pmd0[512] PAGE_ALIGNED;
+static u64 kexec_pte0[512] PAGE_ALIGNED;
+static u64 kexec_pud1[512] PAGE_ALIGNED;
+static u64 kexec_pmd1[512] PAGE_ALIGNED;
+static u64 kexec_pte1[512] PAGE_ALIGNED;
+
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
unsigned long end_addr;
@@ -144,32 +153,19 @@ static void load_segments(void)
);
}
-typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
- unsigned long control_code_buffer,
- unsigned long start_address,
- unsigned long pgtable) ATTRIB_NORET;
-
-const extern unsigned char relocate_new_kernel[];
-const extern unsigned long relocate_new_kernel_size;
-
int machine_kexec_prepare(struct kimage *image)
{
- unsigned long start_pgtable, control_code_buffer;
+ unsigned long start_pgtable;
int result;
/* Calculate the offsets */
start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
- control_code_buffer = start_pgtable + PAGE_SIZE;
/* Setup the identity mapped 64bit page table */
result = init_pgtable(image, start_pgtable);
if (result)
return result;
- /* Place the code in the reboot code buffer */
- memcpy(__va(control_code_buffer), relocate_new_kernel,
- relocate_new_kernel_size);
-
return 0;
}
@@ -184,37 +180,40 @@ void machine_kexec_cleanup(struct kimage *image)
*/
NORET_TYPE void machine_kexec(struct kimage *image)
{
- unsigned long page_list;
- unsigned long control_code_buffer;
- unsigned long start_pgtable;
- relocate_new_kernel_t rnk;
+ unsigned long page_list[PAGES_NR];
+ void *control_page;
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
- /* Calculate the offsets */
- page_list = image->head;
- start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
- control_code_buffer = start_pgtable + PAGE_SIZE;
-
- /* Set the low half of the page table to my identity mapped
- * page table for kexec. Leave the high half pointing at the
- * kernel pages. Don't bother to flush the global pages
- * as that will happen when I fully switch to my identity mapped
- * page table anyway.
- */
- memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
- __flush_tlb();
-
-
- /* The segment registers are funny things, they are
- * automatically loaded from a table, in memory wherever you
- * set them to a specific selector, but this table is never
- * accessed again unless you set the segment to a different selector.
- *
- * The more common model are caches where the behide
- * the scenes work is done, but is also dropped at arbitrary
- * times.
+ control_page = page_address(image->control_code_page) + PAGE_SIZE;
+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+ page_list[PA_CONTROL_PAGE] = __pa(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+ page_list[PA_PGD] = __pa(kexec_pgd);
+ page_list[VA_PGD] = (unsigned long)kexec_pgd;
+ page_list[PA_PUD_0] = __pa(kexec_pud0);
+ page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
+ page_list[PA_PMD_0] = __pa(kexec_pmd0);
+ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+ page_list[PA_PTE_0] = __pa(kexec_pte0);
+ page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+ page_list[PA_PUD_1] = __pa(kexec_pud1);
+ page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
+ page_list[PA_PMD_1] = __pa(kexec_pmd1);
+ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+ page_list[PA_PTE_1] = __pa(kexec_pte1);
+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+ page_list[PA_TABLE_PAGE] =
+ (unsigned long)__pa(page_address(image->control_code_page));
+
+ /* The segment registers are funny things, they have both a
+ * visible and an invisible part. Whenever the visible part is
+ * set to a specific selector, the invisible part is loaded
+ * from a table in memory. At no other time is the
+ * descriptor table in memory accessed.
*
* I take advantage of this here by force loading the
* segments, before I zap the gdt with an invalid value.
@@ -225,7 +224,36 @@ NORET_TYPE void machine_kexec(struct kimage *image)
*/
set_gdt(phys_to_virt(0),0);
set_idt(phys_to_virt(0),0);
+
/* now call it */
- rnk = (relocate_new_kernel_t) control_code_buffer;
- (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+ image->start);
+}
+
+/*
+ * Parse the "crashkernel=size@addr" boot parameter, which specifies the
+ * location to reserve for a crash kernel.  By reserving this memory we
+ * guarantee that Linux never sets it up as a DMA target.  Useful for
+ * holding code to do something appropriate after a kernel panic.
+ *
+ * Returns -EINVAL if @arg is NULL or no size could be parsed, and 0
+ * otherwise.  crashk_res is only filled in when an "@addr" part follows
+ * the size.
+ */
+static int __init setup_crashkernel(char *arg)
+{
+	unsigned long size, base;
+	char *end;
+
+	if (!arg)
+		return -EINVAL;
+
+	size = memparse(arg, &end);
+	if (end == arg)
+		return -EINVAL;
+
+	/* Without an explicit base address nothing is recorded. */
+	if (*end != '@')
+		return 0;
+
+	base = memparse(end + 1, &end);
+	/* FIXME: Do I want a sanity check to validate the
+	 * memory range? Yes you do, but it's too early for
+	 * e820 -AK */
+	crashk_res.start = base;
+	crashk_res.end = base + size - 1;
+	return 0;
}
+early_param("crashkernel", setup_crashkernel);
+
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index c69fc43cee7b..bbea88801d88 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -182,7 +182,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
goto out2;
memset(&m, 0, sizeof(struct mce));
- m.cpu = safe_smp_processor_id();
+ m.cpu = smp_processor_id();
rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
if (!(m.mcgstatus & MCG_STATUS_RIPV))
kill_it = 1;
@@ -274,6 +274,33 @@ void do_machine_check(struct pt_regs * regs, long error_code)
atomic_dec(&mce_entry);
}
+#ifdef CONFIG_X86_MCE_INTEL
+/**
+ * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
+ * @cpu: The CPU on which the event occurred.
+ * @status: Event status information
+ *
+ * This function should be called by the thermal interrupt after the
+ * event has been processed and the decision was made to log the event
+ * further.
+ *
+ * The status parameter will be saved to the 'status' field of 'struct mce'
+ * and historically has been the register value of the
+ * MSR_IA32_THERMAL_STATUS (Intel) msr.
+ */
+void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+{
+ struct mce m;
+
+ /* Start from an all-zero record; only the fields below are set. */
+ memset(&m, 0, sizeof(m));
+ m.cpu = cpu;
+ m.bank = MCE_THERMAL_BANK;
+ m.status = status;
+ /* Stamp the record via rdtscll() before handing it to mce_log(). */
+ rdtscll(m.tsc);
+ mce_log(&m);
+}
+#endif /* CONFIG_X86_MCE_INTEL */
+
/*
* Periodic polling timer for "silent" machine check errors.
*/
@@ -562,7 +589,7 @@ static struct sysdev_class mce_sysclass = {
set_kset_name("machinecheck"),
};
-static DEFINE_PER_CPU(struct sys_device, device_mce);
+DEFINE_PER_CPU(struct sys_device, device_mce);
/* Why are there no generic functions for this? */
#define ACCESSOR(name, var, start) \
@@ -615,7 +642,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
}
#ifdef CONFIG_HOTPLUG_CPU
-static __cpuinit void mce_remove_device(unsigned int cpu)
+static void mce_remove_device(unsigned int cpu)
{
int i;
@@ -626,7 +653,6 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
sysdev_unregister(&per_cpu(device_mce,cpu));
}
-#endif
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
@@ -638,11 +664,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_ONLINE:
mce_create_device(cpu);
break;
-#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
mce_remove_device(cpu);
break;
-#endif
}
return NOTIFY_OK;
}
@@ -650,6 +674,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
static struct notifier_block mce_cpu_notifier = {
.notifier_call = mce_cpu_callback,
};
+#endif
static __init int mce_init_device(void)
{
@@ -664,7 +689,7 @@ static __init int mce_init_device(void)
mce_create_device(i);
}
- register_cpu_notifier(&mce_cpu_notifier);
+ register_hotcpu_notifier(&mce_cpu_notifier);
misc_register(&mce_log_device);
return err;
}
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index d13b241ad094..883fe747f64c 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -1,5 +1,5 @@
/*
- * (c) 2005 Advanced Micro Devices, Inc.
+ * (c) 2005, 2006 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
@@ -8,9 +8,10 @@
*
* Support : jacob.shin@amd.com
*
- * MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F.
- * MC4_MISC0 exists per physical processor.
+ * April 2006
+ * - added support for AMD Family 0x10 processors
*
+ * All MC4_MISCi registers are shared between multi-cores
*/
#include <linux/cpu.h>
@@ -29,32 +30,45 @@
#include <asm/percpu.h>
#include <asm/idle.h>
-#define PFX "mce_threshold: "
-#define VERSION "version 1.00.9"
-#define NR_BANKS 5
-#define THRESHOLD_MAX 0xFFF
-#define INT_TYPE_APIC 0x00020000
-#define MASK_VALID_HI 0x80000000
-#define MASK_LVTOFF_HI 0x00F00000
-#define MASK_COUNT_EN_HI 0x00080000
-#define MASK_INT_TYPE_HI 0x00060000
-#define MASK_OVERFLOW_HI 0x00010000
+#define PFX "mce_threshold: "
+#define VERSION "version 1.1.1"
+#define NR_BANKS 6
+#define NR_BLOCKS 9
+#define THRESHOLD_MAX 0xFFF
+#define INT_TYPE_APIC 0x00020000
+#define MASK_VALID_HI 0x80000000
+#define MASK_LVTOFF_HI 0x00F00000
+#define MASK_COUNT_EN_HI 0x00080000
+#define MASK_INT_TYPE_HI 0x00060000
+#define MASK_OVERFLOW_HI 0x00010000
#define MASK_ERR_COUNT_HI 0x00000FFF
-#define MASK_OVERFLOW 0x0001000000000000L
+#define MASK_BLKPTR_LO 0xFF000000
+#define MCG_XBLK_ADDR 0xC0000400
-struct threshold_bank {
+struct threshold_block {
+ unsigned int block;
+ unsigned int bank;
unsigned int cpu;
- u8 bank;
- u8 interrupt_enable;
+ u32 address;
+ u16 interrupt_enable;
u16 threshold_limit;
struct kobject kobj;
+ struct list_head miscj;
};
-static struct threshold_bank threshold_defaults = {
+/* defaults used early on boot */
+static struct threshold_block threshold_defaults = {
.interrupt_enable = 0,
.threshold_limit = THRESHOLD_MAX,
};
+struct threshold_bank {
+ struct kobject kobj;
+ struct threshold_block *blocks;
+ cpumask_t cpus;
+};
+static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
+
#ifdef CONFIG_SMP
static unsigned char shared_bank[NR_BANKS] = {
0, 0, 0, 0, 1
@@ -68,12 +82,12 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
*/
/* must be called with correct cpu affinity */
-static void threshold_restart_bank(struct threshold_bank *b,
+static void threshold_restart_bank(struct threshold_block *b,
int reset, u16 old_limit)
{
u32 mci_misc_hi, mci_misc_lo;
- rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
+ rdmsr(b->address, mci_misc_lo, mci_misc_hi);
if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
reset = 1; /* limit cannot be lower than err count */
@@ -94,35 +108,57 @@ static void threshold_restart_bank(struct threshold_bank *b,
(mci_misc_hi &= ~MASK_INT_TYPE_HI);
mci_misc_hi |= MASK_COUNT_EN_HI;
- wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi);
+ wrmsr(b->address, mci_misc_lo, mci_misc_hi);
}
+/* cpu init entry point, called from mce.c with preempt off */
void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
{
- int bank;
- u32 mci_misc_lo, mci_misc_hi;
+ unsigned int bank, block;
unsigned int cpu = smp_processor_id();
+ u32 low = 0, high = 0, address = 0;
for (bank = 0; bank < NR_BANKS; ++bank) {
- rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi);
+ for (block = 0; block < NR_BLOCKS; ++block) {
+ if (block == 0)
+ address = MSR_IA32_MC0_MISC + bank * 4;
+ else if (block == 1)
+ address = MCG_XBLK_ADDR
+ + ((low & MASK_BLKPTR_LO) >> 21);
+ else
+ ++address;
+
+ if (rdmsr_safe(address, &low, &high))
+ continue;
- /* !valid, !counter present, bios locked */
- if (!(mci_misc_hi & MASK_VALID_HI) ||
- !(mci_misc_hi & MASK_VALID_HI >> 1) ||
- (mci_misc_hi & MASK_VALID_HI >> 2))
- continue;
+ if (!(high & MASK_VALID_HI)) {
+ if (block)
+ continue;
+ else
+ break;
+ }
- per_cpu(bank_map, cpu) |= (1 << bank);
+ if (!(high & MASK_VALID_HI >> 1) ||
+ (high & MASK_VALID_HI >> 2))
+ continue;
+ if (!block)
+ per_cpu(bank_map, cpu) |= (1 << bank);
#ifdef CONFIG_SMP
- if (shared_bank[bank] && cpu_core_id[cpu])
- continue;
+ if (shared_bank[bank] && c->cpu_core_id)
+ break;
#endif
+ high &= ~MASK_LVTOFF_HI;
+ high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20;
+ wrmsr(address, low, high);
- setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20);
- threshold_defaults.cpu = cpu;
- threshold_defaults.bank = bank;
- threshold_restart_bank(&threshold_defaults, 0, 0);
+ setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD,
+ THRESHOLD_APIC_VECTOR,
+ K8_APIC_EXT_INT_MSG_FIX, 0);
+
+ threshold_defaults.address = address;
+ threshold_restart_bank(&threshold_defaults, 0, 0);
+ }
}
}
@@ -137,8 +173,9 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
*/
asmlinkage void mce_threshold_interrupt(void)
{
- int bank;
+ unsigned int bank, block;
struct mce m;
+ u32 low = 0, high = 0, address = 0;
ack_APIC_irq();
exit_idle();
@@ -150,15 +187,42 @@ asmlinkage void mce_threshold_interrupt(void)
/* assume first bank caused it */
for (bank = 0; bank < NR_BANKS; ++bank) {
- m.bank = MCE_THRESHOLD_BASE + bank;
- rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc);
+ for (block = 0; block < NR_BLOCKS; ++block) {
+ if (block == 0)
+ address = MSR_IA32_MC0_MISC + bank * 4;
+ else if (block == 1)
+ address = MCG_XBLK_ADDR
+ + ((low & MASK_BLKPTR_LO) >> 21);
+ else
+ ++address;
+
+ if (rdmsr_safe(address, &low, &high))
+ continue;
- if (m.misc & MASK_OVERFLOW) {
- mce_log(&m);
- goto out;
+ if (!(high & MASK_VALID_HI)) {
+ if (block)
+ continue;
+ else
+ break;
+ }
+
+ if (!(high & MASK_VALID_HI >> 1) ||
+ (high & MASK_VALID_HI >> 2))
+ continue;
+
+ if (high & MASK_OVERFLOW_HI) {
+ rdmsrl(address, m.misc);
+ rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
+ m.status);
+ m.bank = K8_MCE_THRESHOLD_BASE
+ + bank * NR_BLOCKS
+ + block;
+ mce_log(&m);
+ goto out;
+ }
}
}
- out:
+out:
irq_exit();
}
@@ -166,20 +230,12 @@ asmlinkage void mce_threshold_interrupt(void)
* Sysfs Interface
*/
-static struct sysdev_class threshold_sysclass = {
- set_kset_name("threshold"),
-};
-
-static DEFINE_PER_CPU(struct sys_device, device_threshold);
-
struct threshold_attr {
- struct attribute attr;
- ssize_t(*show) (struct threshold_bank *, char *);
- ssize_t(*store) (struct threshold_bank *, const char *, size_t count);
+ struct attribute attr;
+ ssize_t(*show) (struct threshold_block *, char *);
+ ssize_t(*store) (struct threshold_block *, const char *, size_t count);
};
-static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
-
static cpumask_t affinity_set(unsigned int cpu)
{
cpumask_t oldmask = current->cpus_allowed;
@@ -194,15 +250,15 @@ static void affinity_restore(cpumask_t oldmask)
set_cpus_allowed(current, oldmask);
}
-#define SHOW_FIELDS(name) \
- static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \
- { \
- return sprintf(buf, "%lx\n", (unsigned long) b->name); \
- }
+#define SHOW_FIELDS(name) \
+static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
+{ \
+ return sprintf(buf, "%lx\n", (unsigned long) b->name); \
+}
SHOW_FIELDS(interrupt_enable)
SHOW_FIELDS(threshold_limit)
-static ssize_t store_interrupt_enable(struct threshold_bank *b,
+static ssize_t store_interrupt_enable(struct threshold_block *b,
const char *buf, size_t count)
{
char *end;
@@ -219,7 +275,7 @@ static ssize_t store_interrupt_enable(struct threshold_bank *b,
return end - buf;
}
-static ssize_t store_threshold_limit(struct threshold_bank *b,
+static ssize_t store_threshold_limit(struct threshold_block *b,
const char *buf, size_t count)
{
char *end;
@@ -242,18 +298,18 @@ static ssize_t store_threshold_limit(struct threshold_bank *b,
return end - buf;
}
-static ssize_t show_error_count(struct threshold_bank *b, char *buf)
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
{
u32 high, low;
cpumask_t oldmask;
oldmask = affinity_set(b->cpu);
- rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */
+ rdmsr(b->address, low, high);
affinity_restore(oldmask);
return sprintf(buf, "%x\n",
(high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
}
-static ssize_t store_error_count(struct threshold_bank *b,
+static ssize_t store_error_count(struct threshold_block *b,
const char *buf, size_t count)
{
cpumask_t oldmask;
@@ -269,13 +325,13 @@ static ssize_t store_error_count(struct threshold_bank *b,
.store = _store, \
};
-#define ATTR_FIELDS(name) \
- static struct threshold_attr name = \
+#define RW_ATTR(name) \
+static struct threshold_attr name = \
THRESHOLD_ATTR(name, 0644, show_## name, store_## name)
-ATTR_FIELDS(interrupt_enable);
-ATTR_FIELDS(threshold_limit);
-ATTR_FIELDS(error_count);
+RW_ATTR(interrupt_enable);
+RW_ATTR(threshold_limit);
+RW_ATTR(error_count);
static struct attribute *default_attrs[] = {
&interrupt_enable.attr,
@@ -284,12 +340,12 @@ static struct attribute *default_attrs[] = {
NULL
};
-#define to_bank(k) container_of(k,struct threshold_bank,kobj)
-#define to_attr(a) container_of(a,struct threshold_attr,attr)
+#define to_block(k) container_of(k, struct threshold_block, kobj)
+#define to_attr(a) container_of(a, struct threshold_attr, attr)
static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
{
- struct threshold_bank *b = to_bank(kobj);
+ struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
ret = a->show ? a->show(b, buf) : -EIO;
@@ -299,7 +355,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
static ssize_t store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
- struct threshold_bank *b = to_bank(kobj);
+ struct threshold_block *b = to_block(kobj);
struct threshold_attr *a = to_attr(attr);
ssize_t ret;
ret = a->store ? a->store(b, buf, count) : -EIO;
@@ -316,69 +372,174 @@ static struct kobj_type threshold_ktype = {
.default_attrs = default_attrs,
};
+static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
+ unsigned int bank,
+ unsigned int block,
+ u32 address)
+{
+ int err;
+ u32 low, high;
+ struct threshold_block *b = NULL;
+
+ if ((bank >= NR_BANKS) || (block >= NR_BLOCKS))
+ return 0;
+
+ if (rdmsr_safe(address, &low, &high))
+ goto recurse;
+
+ if (!(high & MASK_VALID_HI)) {
+ if (block)
+ goto recurse;
+ else
+ return 0;
+ }
+
+ if (!(high & MASK_VALID_HI >> 1) ||
+ (high & MASK_VALID_HI >> 2))
+ goto recurse;
+
+ b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+ memset(b, 0, sizeof(struct threshold_block));
+
+ b->block = block;
+ b->bank = bank;
+ b->cpu = cpu;
+ b->address = address;
+ b->interrupt_enable = 0;
+ b->threshold_limit = THRESHOLD_MAX;
+
+ INIT_LIST_HEAD(&b->miscj);
+
+ if (per_cpu(threshold_banks, cpu)[bank]->blocks)
+ list_add(&b->miscj,
+ &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
+ else
+ per_cpu(threshold_banks, cpu)[bank]->blocks = b;
+
+ kobject_set_name(&b->kobj, "misc%i", block);
+ b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj;
+ b->kobj.ktype = &threshold_ktype;
+ err = kobject_register(&b->kobj);
+ if (err)
+ goto out_free;
+recurse:
+ if (!block) {
+ address = (low & MASK_BLKPTR_LO) >> 21;
+ if (!address)
+ return 0;
+ address += MCG_XBLK_ADDR;
+ } else
+ ++address;
+
+ err = allocate_threshold_blocks(cpu, bank, ++block, address);
+ if (err)
+ goto out_free;
+
+ return err;
+
+out_free:
+ if (b) {
+ kobject_unregister(&b->kobj);
+ kfree(b);
+ }
+ return err;
+}
+
/* symlinks sibling shared banks to first core. first core owns dir/files. */
-static __cpuinit int threshold_create_bank(unsigned int cpu, int bank)
+static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
- int err = 0;
+ int i, err = 0;
struct threshold_bank *b = NULL;
+ cpumask_t oldmask = CPU_MASK_NONE;
+ char name[32];
+
+ sprintf(name, "threshold_bank%i", bank);
#ifdef CONFIG_SMP
- if (cpu_core_id[cpu] && shared_bank[bank]) { /* symlink */
- char name[16];
- unsigned lcpu = first_cpu(cpu_core_map[cpu]);
- if (cpu_core_id[lcpu])
- goto out; /* first core not up yet */
+ if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */
+ i = first_cpu(cpu_core_map[cpu]);
+
+ /* first core not up yet */
+ if (cpu_data[i].cpu_core_id)
+ goto out;
+
+ /* already linked */
+ if (per_cpu(threshold_banks, cpu)[bank])
+ goto out;
+
+ b = per_cpu(threshold_banks, i)[bank];
- b = per_cpu(threshold_banks, lcpu)[bank];
if (!b)
goto out;
- sprintf(name, "bank%i", bank);
- err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj,
+
+ err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,
&b->kobj, name);
if (err)
goto out;
+
+ b->cpus = cpu_core_map[cpu];
per_cpu(threshold_banks, cpu)[bank] = b;
goto out;
}
#endif
- b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL);
+ b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
err = -ENOMEM;
goto out;
}
memset(b, 0, sizeof(struct threshold_bank));
- b->cpu = cpu;
- b->bank = bank;
- b->interrupt_enable = 0;
- b->threshold_limit = THRESHOLD_MAX;
- kobject_set_name(&b->kobj, "bank%i", bank);
- b->kobj.parent = &per_cpu(device_threshold, cpu).kobj;
- b->kobj.ktype = &threshold_ktype;
-
+ kobject_set_name(&b->kobj, "threshold_bank%i", bank);
+ b->kobj.parent = &per_cpu(device_mce, cpu).kobj;
+#ifndef CONFIG_SMP
+ b->cpus = CPU_MASK_ALL;
+#else
+ b->cpus = cpu_core_map[cpu];
+#endif
err = kobject_register(&b->kobj);
- if (err) {
- kfree(b);
- goto out;
- }
+ if (err)
+ goto out_free;
+
per_cpu(threshold_banks, cpu)[bank] = b;
- out:
+
+ oldmask = affinity_set(cpu);
+ err = allocate_threshold_blocks(cpu, bank, 0,
+ MSR_IA32_MC0_MISC + bank * 4);
+ affinity_restore(oldmask);
+
+ if (err)
+ goto out_free;
+
+ for_each_cpu_mask(i, b->cpus) {
+ if (i == cpu)
+ continue;
+
+ err = sysfs_create_link(&per_cpu(device_mce, i).kobj,
+ &b->kobj, name);
+ if (err)
+ goto out;
+
+ per_cpu(threshold_banks, i)[bank] = b;
+ }
+
+ goto out;
+
+out_free:
+ per_cpu(threshold_banks, cpu)[bank] = NULL;
+ kfree(b);
+out:
return err;
}
/* create dir/files for all valid threshold banks */
static __cpuinit int threshold_create_device(unsigned int cpu)
{
- int bank;
+ unsigned int bank;
int err = 0;
- per_cpu(device_threshold, cpu).id = cpu;
- per_cpu(device_threshold, cpu).cls = &threshold_sysclass;
- err = sysdev_register(&per_cpu(device_threshold, cpu));
- if (err)
- goto out;
-
for (bank = 0; bank < NR_BANKS; ++bank) {
if (!(per_cpu(bank_map, cpu) & 1 << bank))
continue;
@@ -386,7 +547,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
if (err)
goto out;
}
- out:
+out:
return err;
}
@@ -397,89 +558,76 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
* of shared sysfs dir/files, and rest of the cores will be symlinked to it.
*/
-/* cpu hotplug call removes all symlinks before first core dies */
-static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
+static void deallocate_threshold_block(unsigned int cpu,
+ unsigned int bank)
{
+ struct threshold_block *pos = NULL;
+ struct threshold_block *tmp = NULL;
+ struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
+
+ if (!head)
+ return;
+
+ list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
+ kobject_unregister(&pos->kobj);
+ list_del(&pos->miscj);
+ kfree(pos);
+ }
+
+ kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
+ per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
+}
+
+static void threshold_remove_bank(unsigned int cpu, int bank)
+{
+ int i = 0;
struct threshold_bank *b;
- char name[16];
+ char name[32];
b = per_cpu(threshold_banks, cpu)[bank];
+
if (!b)
return;
- if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) {
- sprintf(name, "bank%i", bank);
- sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name);
+
+ if (!b->blocks)
+ goto free_out;
+
+ sprintf(name, "threshold_bank%i", bank);
+
+ /* sibling symlink */
+ if (shared_bank[bank] && b->blocks->cpu != cpu) {
+ sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
- } else {
- kobject_unregister(&b->kobj);
- kfree(per_cpu(threshold_banks, cpu)[bank]);
+ return;
}
-}
-static __cpuinit void threshold_remove_device(unsigned int cpu)
-{
- int bank;
-
- for (bank = 0; bank < NR_BANKS; ++bank) {
- if (!(per_cpu(bank_map, cpu) & 1 << bank))
+ /* remove all sibling symlinks before unregistering */
+ for_each_cpu_mask(i, b->cpus) {
+ if (i == cpu)
continue;
- threshold_remove_bank(cpu, bank);
+
+ sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
+ per_cpu(threshold_banks, i)[bank] = NULL;
}
- sysdev_unregister(&per_cpu(device_threshold, cpu));
-}
-/* link all existing siblings when first core comes up */
-static __cpuinit int threshold_create_symlinks(unsigned int cpu)
-{
- int bank, err = 0;
- unsigned int lcpu = 0;
+ deallocate_threshold_block(cpu, bank);
- if (cpu_core_id[cpu])
- return 0;
- for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
- if (lcpu == cpu)
- continue;
- for (bank = 0; bank < NR_BANKS; ++bank) {
- if (!(per_cpu(bank_map, cpu) & 1 << bank))
- continue;
- if (!shared_bank[bank])
- continue;
- err = threshold_create_bank(lcpu, bank);
- }
- }
- return err;
+free_out:
+ kobject_unregister(&b->kobj);
+ kfree(b);
+ per_cpu(threshold_banks, cpu)[bank] = NULL;
}
-/* remove all symlinks before first core dies. */
-static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
+static void threshold_remove_device(unsigned int cpu)
{
- int bank;
- unsigned int lcpu = 0;
- if (cpu_core_id[cpu])
- return;
- for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
- if (lcpu == cpu)
+ unsigned int bank;
+
+ for (bank = 0; bank < NR_BANKS; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & 1 << bank))
continue;
- for (bank = 0; bank < NR_BANKS; ++bank) {
- if (!(per_cpu(bank_map, cpu) & 1 << bank))
- continue;
- if (!shared_bank[bank])
- continue;
- threshold_remove_bank(lcpu, bank);
- }
+ threshold_remove_bank(cpu, bank);
}
}
-#else /* !CONFIG_HOTPLUG_CPU */
-static __cpuinit void threshold_create_symlinks(unsigned int cpu)
-{
-}
-static __cpuinit void threshold_remove_symlinks(unsigned int cpu)
-{
-}
-static void threshold_remove_device(unsigned int cpu)
-{
-}
-#endif
/* get notified when a cpu comes on/off */
static int threshold_cpu_callback(struct notifier_block *nfb,
@@ -494,13 +642,6 @@ static int threshold_cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_ONLINE:
threshold_create_device(cpu);
- threshold_create_symlinks(cpu);
- break;
- case CPU_DOWN_PREPARE:
- threshold_remove_symlinks(cpu);
- break;
- case CPU_DOWN_FAILED:
- threshold_create_symlinks(cpu);
break;
case CPU_DEAD:
threshold_remove_device(cpu);
@@ -515,26 +656,20 @@ static int threshold_cpu_callback(struct notifier_block *nfb,
static struct notifier_block threshold_cpu_notifier = {
.notifier_call = threshold_cpu_callback,
};
+#endif /* CONFIG_HOTPLUG_CPU */
static __init int threshold_init_device(void)
{
- int err;
- int lcpu = 0;
-
- err = sysdev_class_register(&threshold_sysclass);
- if (err)
- goto out;
+ unsigned lcpu = 0;
/* to hit CPUs online before the notifier is up */
for_each_online_cpu(lcpu) {
- err = threshold_create_device(lcpu);
+ int err = threshold_create_device(lcpu);
if (err)
- goto out;
+ return err;
}
- register_cpu_notifier(&threshold_cpu_notifier);
-
- out:
- return err;
+ register_hotcpu_notifier(&threshold_cpu_notifier);
+ return 0;
}
device_initcall(threshold_init_device);
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 8f533d2c40cb..6551505d8a2c 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -11,36 +11,21 @@
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/idle.h>
-
-static DEFINE_PER_CPU(unsigned long, next_check);
+#include <asm/therm_throt.h>
asmlinkage void smp_thermal_interrupt(void)
{
- struct mce m;
+ __u64 msr_val;
ack_APIC_irq();
exit_idle();
irq_enter();
- if (time_before(jiffies, __get_cpu_var(next_check)))
- goto done;
-
- __get_cpu_var(next_check) = jiffies + HZ*300;
- memset(&m, 0, sizeof(m));
- m.cpu = smp_processor_id();
- m.bank = MCE_THERMAL_BANK;
- rdtscll(m.tsc);
- rdmsrl(MSR_IA32_THERM_STATUS, m.status);
- if (m.status & 0x1) {
- printk(KERN_EMERG
- "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu);
- add_taint(TAINT_MACHINE_CHECK);
- } else {
- printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu);
- }
- mce_log(&m);
-done:
+ rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+ if (therm_throt_process(msr_val & 1))
+ mce_log_therm_throt_event(smp_processor_id(), msr_val);
+
irq_exit();
}
@@ -92,6 +77,9 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n",
cpu, tm2 ? "TM2" : "TM1");
+
+ /* enable thermal throttle processing */
+ atomic_set(&therm_throt_en, 1);
return;
}
diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c
index bac195c74bcc..9d0958ff547f 100644
--- a/arch/x86_64/kernel/module.c
+++ b/arch/x86_64/kernel/module.c
@@ -145,26 +145,38 @@ int apply_relocate(Elf_Shdr *sechdrs,
return -ENOSYS;
}
-extern void apply_alternatives(void *start, void *end);
-
int module_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
+ const Elf_Shdr *sechdrs,
+ struct module *me)
{
- const Elf_Shdr *s;
+ const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
- /* look for .altinstructions to patch */
- for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
- void *seg;
- if (strcmp(".altinstructions", secstrings + s->sh_name))
- continue;
- seg = (void *)s->sh_addr;
- apply_alternatives(seg, seg + s->sh_size);
- }
+ for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
+ if (!strcmp(".text", secstrings + s->sh_name))
+ text = s;
+ if (!strcmp(".altinstructions", secstrings + s->sh_name))
+ alt = s;
+ if (!strcmp(".smp_locks", secstrings + s->sh_name))
+ locks= s;
+ }
+
+ if (alt) {
+ /* patch .altinstructions */
+ void *aseg = (void *)alt->sh_addr;
+ apply_alternatives(aseg, aseg + alt->sh_size);
+ }
+ if (locks && text) {
+ void *lseg = (void *)locks->sh_addr;
+ void *tseg = (void *)text->sh_addr;
+ alternatives_smp_module_add(me, me->name,
+ lseg, lseg + locks->sh_size,
+ tseg, tseg + text->sh_size);
+ }
return 0;
}
void module_arch_cleanup(struct module *mod)
{
+ alternatives_smp_module_del(mod);
}
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 083da7e606b1..b8d53dfa9931 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -16,7 +16,6 @@
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/delay.h>
-#include <linux/config.h>
#include <linux/bootmem.h>
#include <linux/smp_lock.h>
#include <linux/kernel_stat.h>
@@ -42,8 +41,7 @@ int acpi_found_madt;
* Various Linux-internal data structures created from the
* MP-table.
*/
-unsigned char apic_version [MAX_APICS];
-unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
static int mp_current_pci_id = 0;
@@ -57,7 +55,6 @@ struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
int mp_irq_entries;
int nr_ioapics;
-int pic_mode;
unsigned long mp_lapic_addr = 0;
@@ -72,19 +69,6 @@ unsigned disabled_cpus __initdata;
/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
-/* ACPI MADT entry parsing functions */
-#ifdef CONFIG_ACPI
-extern struct acpi_boot_flags acpi_boot;
-#ifdef CONFIG_X86_LOCAL_APIC
-extern int acpi_parse_lapic (acpi_table_entry_header *header);
-extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header);
-extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
-#endif /*CONFIG_X86_LOCAL_APIC*/
-#ifdef CONFIG_X86_IO_APIC
-extern int acpi_parse_ioapic (acpi_table_entry_header *header);
-#endif /*CONFIG_X86_IO_APIC*/
-#endif /*CONFIG_ACPI*/
-
u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
@@ -109,24 +93,20 @@ static int __init mpf_checksum(unsigned char *mp, int len)
static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
{
int cpu;
- unsigned char ver;
cpumask_t tmp_map;
+ char *bootup_cpu = "";
if (!(m->mpc_cpuflag & CPU_ENABLED)) {
disabled_cpus++;
return;
}
-
- printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n",
- m->mpc_apicid,
- (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8,
- (m->mpc_cpufeature & CPU_MODEL_MASK)>>4,
- m->mpc_apicver);
-
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
- Dprintk(" Bootup CPU\n");
+ bootup_cpu = " (Bootup-CPU)";
boot_cpu_id = m->mpc_apicid;
}
+
+ printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
+
if (num_processors >= NR_CPUS) {
printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
" Processor ignored.\n", NR_CPUS);
@@ -137,24 +117,7 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
cpus_complement(tmp_map, cpu_present_map);
cpu = first_cpu(tmp_map);
-#if MAX_APICS < 255
- if ((int)m->mpc_apicid > MAX_APICS) {
- printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
- m->mpc_apicid, MAX_APICS);
- return;
- }
-#endif
- ver = m->mpc_apicver;
-
physid_set(m->mpc_apicid, phys_cpu_present_map);
- /*
- * Validate version
- */
- if (ver == 0x0) {
- printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid);
- ver = 0x10;
- }
- apic_version[m->mpc_apicid] = ver;
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
/*
* bios_cpu_apicid is required to have processors listed
@@ -179,37 +142,42 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
if (strncmp(str, "ISA", 3) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
- } else if (strncmp(str, "EISA", 4) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+ set_bit(m->mpc_busid, mp_bus_not_pci);
} else if (strncmp(str, "PCI", 3) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+ clear_bit(m->mpc_busid, mp_bus_not_pci);
mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
mp_current_pci_id++;
- } else if (strncmp(str, "MCA", 3) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
} else {
printk(KERN_ERR "Unknown bustype %s\n", str);
}
}
+static int bad_ioapic(unsigned long address)
+{
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+ }
+ if (!address) {
+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+ " found in table, skipping!\n");
+ return 1;
+ }
+ return 0;
+}
+
static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
{
if (!(m->mpc_flags & MPC_APIC_USABLE))
return;
- printk("I/O APIC #%d Version %d at 0x%X.\n",
- m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n",
- MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
- }
- if (!m->mpc_apicaddr) {
- printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
- " found in MP table, skipping!\n");
+ printk("I/O APIC #%d at 0x%X.\n",
+ m->mpc_apicid, m->mpc_apicaddr);
+
+ if (bad_ioapic(m->mpc_apicaddr))
return;
- }
+
mp_ioapics[nr_ioapics] = *m;
nr_ioapics++;
}
@@ -233,19 +201,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
- /*
- * Well it seems all SMP boards in existence
- * use ExtINT/LVT1 == LINT0 and
- * NMI/LVT2 == LINT1 - the following check
- * will show us if this assumptions is false.
- * Until then we do not have to add baggage.
- */
- if ((m->mpc_irqtype == mp_ExtINT) &&
- (m->mpc_destapiclint != 0))
- BUG();
- if ((m->mpc_irqtype == mp_NMI) &&
- (m->mpc_destapiclint != 1))
- BUG();
}
/*
@@ -259,7 +214,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
unsigned char *mpt=((unsigned char *)mpc)+count;
if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
- printk("SMP mptable: bad signature [%c%c%c%c]!\n",
+ printk("MPTABLE: bad signature [%c%c%c%c]!\n",
mpc->mpc_signature[0],
mpc->mpc_signature[1],
mpc->mpc_signature[2],
@@ -267,31 +222,31 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
return 0;
}
if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
- printk("SMP mptable: checksum error!\n");
+ printk("MPTABLE: checksum error!\n");
return 0;
}
if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
- printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
+ printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
mpc->mpc_spec);
return 0;
}
if (!mpc->mpc_lapic) {
- printk(KERN_ERR "SMP mptable: null local APIC address!\n");
+ printk(KERN_ERR "MPTABLE: null local APIC address!\n");
return 0;
}
memcpy(str,mpc->mpc_oem,8);
- str[8]=0;
- printk(KERN_INFO "OEM ID: %s ",str);
+ str[8] = 0;
+ printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
memcpy(str,mpc->mpc_productid,12);
- str[12]=0;
- printk("Product ID: %s ",str);
+ str[12] = 0;
+ printk("MPTABLE: Product ID: %s ",str);
- printk("APIC at: 0x%X\n",mpc->mpc_lapic);
+ printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
/* save the local APIC address, it might be non-default */
if (!acpi_lapic)
- mp_lapic_addr = mpc->mpc_lapic;
+ mp_lapic_addr = mpc->mpc_lapic;
/*
* Now process the configuration blocks.
@@ -303,7 +258,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
struct mpc_config_processor *m=
(struct mpc_config_processor *)mpt;
if (!acpi_lapic)
- MP_processor_info(m);
+ MP_processor_info(m);
mpt += sizeof(*m);
count += sizeof(*m);
break;
@@ -322,8 +277,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
struct mpc_config_ioapic *m=
(struct mpc_config_ioapic *)mpt;
MP_ioapic_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
break;
}
case MP_INTSRC:
@@ -332,8 +287,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
(struct mpc_config_intsrc *)mpt;
MP_intsrc_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
break;
}
case MP_LINTSRC:
@@ -341,15 +296,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
struct mpc_config_lintsrc *m=
(struct mpc_config_lintsrc *)mpt;
MP_lintsrc_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
+ mpt += sizeof(*m);
+ count += sizeof(*m);
break;
}
}
}
clustered_apic_check();
if (!num_processors)
- printk(KERN_ERR "SMP mptable: no processors registered!\n");
+ printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
}
@@ -445,13 +400,10 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
* 2 CPUs, numbered 0 & 1.
*/
processor.mpc_type = MP_PROCESSOR;
- /* Either an integrated APIC or a discrete 82489DX. */
- processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ processor.mpc_apicver = 0;
processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) |
- boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_cpufeature = 0;
+ processor.mpc_featureflag = 0;
processor.mpc_reserved[0] = 0;
processor.mpc_reserved[1] = 0;
for (i = 0; i < 2; i++) {
@@ -470,14 +422,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
case 5:
memcpy(bus.mpc_bustype, "ISA ", 6);
break;
- case 2:
- case 6:
- case 3:
- memcpy(bus.mpc_bustype, "EISA ", 6);
- break;
- case 4:
- case 7:
- memcpy(bus.mpc_bustype, "MCA ", 6);
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
@@ -488,7 +432,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
ioapic.mpc_type = MP_IOAPIC;
ioapic.mpc_apicid = 2;
- ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+ ioapic.mpc_apicver = 0;
ioapic.mpc_flags = MPC_APIC_USABLE;
ioapic.mpc_apicaddr = 0xFEC00000;
MP_ioapic_info(&ioapic);
@@ -531,13 +475,6 @@ void __init get_smp_config (void)
printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
- if (mpf->mpf_feature2 & (1<<7)) {
- printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
- pic_mode = 1;
- } else {
- printk(KERN_INFO " Virtual Wire compatibility mode.\n");
- pic_mode = 0;
- }
/*
* Now see if we need to read further.
@@ -617,7 +554,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
return 0;
}
-void __init find_intel_smp (void)
+void __init find_smp_config(void)
{
unsigned int address;
@@ -634,9 +571,7 @@ void __init find_intel_smp (void)
smp_scan_config(0xF0000,0x10000))
return;
/*
- * If it is an SMP machine we should know now, unless the
- * configuration is in an EISA/MCA bus machine with an
- * extended bios data area.
+ * If it is an SMP machine we should know now.
*
* there is a real-mode segmented pointer pointing to the
* 4K EBDA area at 0x40E, calculate and scan it here.
@@ -657,69 +592,41 @@ void __init find_intel_smp (void)
printk(KERN_INFO "No mptable found.\n");
}
-/*
- * - Intel MP Configuration Table
- */
-void __init find_smp_config (void)
-{
-#ifdef CONFIG_X86_LOCAL_APIC
- find_intel_smp();
-#endif
-}
-
-
/* --------------------------------------------------------------------------
ACPI-based MP Configuration
-------------------------------------------------------------------------- */
#ifdef CONFIG_ACPI
-void __init mp_register_lapic_address (
- u64 address)
+void __init mp_register_lapic_address(u64 address)
{
mp_lapic_addr = (unsigned long) address;
-
set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-
if (boot_cpu_id == -1U)
boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
-
- Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
}
-
-void __cpuinit mp_register_lapic (
- u8 id,
- u8 enabled)
+void __cpuinit mp_register_lapic (u8 id, u8 enabled)
{
struct mpc_config_processor processor;
int boot_cpu = 0;
- if (id >= MAX_APICS) {
- printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
- id, MAX_APICS);
- return;
- }
-
- if (id == boot_cpu_physical_apicid)
+ if (id == boot_cpu_id)
boot_cpu = 1;
processor.mpc_type = MP_PROCESSOR;
processor.mpc_apicid = id;
- processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
+ processor.mpc_apicver = 0;
processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+ processor.mpc_cpufeature = 0;
+ processor.mpc_featureflag = 0;
processor.mpc_reserved[0] = 0;
processor.mpc_reserved[1] = 0;
MP_processor_info(&processor);
}
-#ifdef CONFIG_X86_IO_APIC
-
#define MP_ISA_BUS 0
#define MP_MAX_IOAPIC_PIN 127
@@ -730,11 +637,9 @@ static struct mp_ioapic_routing {
u32 pin_programmed[4];
} mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic (
- int gsi)
+static int mp_find_ioapic(int gsi)
{
- int i = 0;
+ int i = 0;
/* Find the IOAPIC that manages this GSI. */
for (i = 0; i < nr_ioapics; i++) {
@@ -744,28 +649,15 @@ static int mp_find_ioapic (
}
printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-
return -1;
}
-
-void __init mp_register_ioapic (
- u8 id,
- u32 address,
- u32 gsi_base)
+void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
{
- int idx = 0;
+ int idx = 0;
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!address) {
- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
- " found in MADT table, skipping!\n");
+ if (bad_ioapic(address))
return;
- }
idx = nr_ioapics++;
@@ -775,7 +667,7 @@ void __init mp_register_ioapic (
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
mp_ioapics[idx].mpc_apicid = id;
- mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+ mp_ioapics[idx].mpc_apicver = 0;
/*
* Build basic IRQ lookup table to facilitate gsi->io_apic lookups
@@ -786,21 +678,15 @@ void __init mp_register_ioapic (
mp_ioapic_routing[idx].gsi_end = gsi_base +
io_apic_get_redir_entries(idx);
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
"GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
- mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+ mp_ioapics[idx].mpc_apicaddr,
mp_ioapic_routing[idx].gsi_start,
mp_ioapic_routing[idx].gsi_end);
-
- return;
}
-
-void __init mp_override_legacy_irq (
- u8 bus_irq,
- u8 polarity,
- u8 trigger,
- u32 gsi)
+void __init
+mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
struct mpc_config_intsrc intsrc;
int ioapic = -1;
@@ -838,22 +724,18 @@ void __init mp_override_legacy_irq (
mp_irqs[mp_irq_entries] = intsrc;
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!\n");
-
- return;
}
-
-void __init mp_config_acpi_legacy_irqs (void)
+void __init mp_config_acpi_legacy_irqs(void)
{
struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
+ int i = 0;
+ int ioapic = -1;
/*
* Fabricate the legacy ISA bus (bus #31).
*/
- mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
- Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
/*
* Locate the IOAPIC that manages the ISA IRQs (0-15).
@@ -906,24 +788,22 @@ void __init mp_config_acpi_legacy_irqs (void)
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!\n");
}
-
- return;
}
#define MAX_GSI_NUM 4096
int mp_register_gsi(u32 gsi, int triggering, int polarity)
{
- int ioapic = -1;
- int ioapic_pin = 0;
- int idx, bit = 0;
- static int pci_irq = 16;
+ int ioapic = -1;
+ int ioapic_pin = 0;
+ int idx, bit = 0;
+ static int pci_irq = 16;
/*
* Mapping between Global System Interrupts, which
* represent all possible interrupts, to the IRQs
* assigned to actual devices.
*/
- static int gsi_to_irq[MAX_GSI_NUM];
+ static int gsi_to_irq[MAX_GSI_NUM];
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
@@ -997,6 +877,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
return gsi;
}
-
-#endif /*CONFIG_X86_IO_APIC*/
#endif /*CONFIG_ACPI*/
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 4e6357fe0ec3..7af9cb3e2d99 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -12,14 +12,9 @@
* Mikael Pettersson : PM converted to driver model. Disable/enable API.
*/
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/nmi.h>
@@ -27,77 +22,148 @@
#include <linux/kprobes.h>
#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
#include <asm/nmi.h>
-#include <asm/msr.h>
#include <asm/proto.h>
#include <asm/kdebug.h>
-#include <asm/local.h>
#include <asm/mce.h>
+#include <asm/intel_arch_perfmon.h>
-/*
- * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
- * - it may be reserved by some other driver, or not
- * - when not reserved by some other driver, it may be used for
- * the NMI watchdog, or not
- *
- * This is maintained separately from nmi_active because the NMI
- * watchdog may also be driven from the I/O APIC timer.
+int unknown_nmi_panic;
+int nmi_watchdog_enabled;
+int panic_on_unrecovered_nmi;
+
+/* perfctr_nmi_owner tracks the ownership of the perfctr registers;
+ * evntsel_nmi_owner tracks the ownership of the event selection registers.
+ * - different performance counters / event selections may be reserved for
+ * different subsystems; this reservation system just tries to coordinate
+ * things a little
+ */
+static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner);
+static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]);
+
+/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
+ * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
*/
-static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
-static unsigned int lapic_nmi_owner;
-#define LAPIC_NMI_WATCHDOG (1<<0)
-#define LAPIC_NMI_RESERVED (1<<1)
+#define NMI_MAX_COUNTER_BITS 66
/* nmi_active:
- * +1: the lapic NMI watchdog is active, but can be disabled
- * 0: the lapic NMI watchdog has not been set up, and cannot
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
* be enabled
- * -1: the lapic NMI watchdog is disabled, but can be enabled
+ * 0: the lapic NMI watchdog is disabled, but can be enabled
*/
-int nmi_active; /* oprofile uses this */
+atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
int panic_on_timeout;
unsigned int nmi_watchdog = NMI_DEFAULT;
static unsigned int nmi_hz = HZ;
-static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
-static unsigned int nmi_p4_cccr_val;
-/* Note that these events don't tick when the CPU idles. This means
- the frequency varies with CPU load. */
+struct nmi_watchdog_ctlblk {
+ int enabled;
+ u64 check_bit;
+ unsigned int cccr_msr;
+ unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
+ unsigned int evntsel_msr; /* the MSR to select the events to handle */
+};
+static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+/* local prototypes */
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
-#define MSR_P4_MISC_ENABLE 0x1A0
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
-#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
-#define MSR_P4_PERFCTR0 0x300
-#define MSR_P4_CCCR0 0x360
-#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
-#define P4_ESCR_OS (1<<3)
-#define P4_ESCR_USR (1<<2)
-#define P4_CCCR_OVF_PMI0 (1<<26)
-#define P4_CCCR_OVF_PMI1 (1<<27)
-#define P4_CCCR_THRESHOLD(N) ((N)<<20)
-#define P4_CCCR_COMPLEMENT (1<<19)
-#define P4_CCCR_COMPARE (1<<18)
-#define P4_CCCR_REQUIRED (3<<16)
-#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
-#define P4_CCCR_ENABLE (1<<12)
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- CRU_ESCR0 (with any non-null event selector) through a complemented
- max threshold. [IA32-Vol3, Section 14.9.9] */
-#define MSR_P4_IQ_COUNTER0 0x30C
-#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
-#define P4_NMI_IQ_CCCR0 \
- (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
- P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+ /* returns the bit offset of the performance counter register */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ return (msr - MSR_K7_PERFCTR0);
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return (msr - MSR_ARCH_PERFMON_PERFCTR0);
+ else
+ return (msr - MSR_P4_BPU_PERFCTR0);
+ }
+ return 0;
+}
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+ /* returns the bit offset of the event selection register */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ return (msr - MSR_K7_EVNTSEL0);
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
+ else
+ return (msr - MSR_P4_BSU_ESCR0);
+ }
+ return 0;
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+/* checks an msr for availability */
+int avail_to_resrv_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
+ return 1;
+ return 0;
+}
+
+void release_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
+}
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)))
+ return 1;
+ return 0;
+}
+
+void release_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner));
+}
static __cpuinit inline int nmi_known_cpu(void)
{
@@ -105,13 +171,16 @@ static __cpuinit inline int nmi_known_cpu(void)
case X86_VENDOR_AMD:
return boot_cpu_data.x86 == 15;
case X86_VENDOR_INTEL:
- return boot_cpu_data.x86 == 15;
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return 1;
+ else
+ return (boot_cpu_data.x86 == 15);
}
return 0;
}
/* Run after command line and cpu_init init, but before all other checks */
-void __cpuinit nmi_watchdog_default(void)
+void nmi_watchdog_default(void)
{
if (nmi_watchdog != NMI_DEFAULT)
return;
@@ -129,7 +198,7 @@ void __cpuinit nmi_watchdog_default(void)
static __init void nmi_cpu_busy(void *data)
{
volatile int *endflag = data;
- local_irq_enable();
+ local_irq_enable_in_hardirq();
/* Intentionally don't use cpu_relax here. This is
to make sure that the performance counter really ticks,
even if there is a simulator or similar that catches the
@@ -147,6 +216,12 @@ int __init check_nmi_watchdog (void)
int *counts;
int cpu;
+ if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
+ return 0;
+
+ if (!atomic_read(&nmi_active))
+ return 0;
+
counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
if (!counts)
return -1;
@@ -164,26 +239,43 @@ int __init check_nmi_watchdog (void)
mdelay((10*1000)/nmi_hz); // wait 10 ticks
for_each_online_cpu(cpu) {
+ if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
+ continue;
if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
- endflag = 1;
printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
cpu,
counts[cpu],
cpu_pda(cpu)->__nmi_count);
- nmi_active = 0;
- lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
- nmi_perfctr_msr = 0;
- kfree(counts);
- return -1;
+ per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
+ atomic_dec(&nmi_active);
}
}
+ if (!atomic_read(&nmi_active)) {
+ kfree(counts);
+ atomic_set(&nmi_active, -1);
+ return -1;
+ }
endflag = 1;
printk("OK.\n");
/* now that we know it works we can reduce NMI frequency to
something more reasonable; makes a difference in some configs */
- if (nmi_watchdog == NMI_LOCAL_APIC)
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
nmi_hz = 1;
+ /*
+ * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
+ * are writable, with higher bits sign extending from bit 31.
+ * So, we can only program the counter with 31 bit values, and the
+ * 32nd bit (bit 31) must be 1 for bits 32 and above to be 1.
+ * Find the appropriate nmi_hz
+ */
+ if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
+ ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
+ nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1;
+ }
+ }
kfree(counts);
return 0;
@@ -203,8 +295,11 @@ int __init setup_nmi_watchdog(char *str)
get_option(&str, &nmi);
- if (nmi >= NMI_INVALID)
+ if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
return 0;
+
+ if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
+ return 0; /* no lapic support */
nmi_watchdog = nmi;
return 1;
}
@@ -213,77 +308,52 @@ __setup("nmi_watchdog=", setup_nmi_watchdog);
static void disable_lapic_nmi_watchdog(void)
{
- if (nmi_active <= 0)
+ BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+ if (atomic_read(&nmi_active) <= 0)
return;
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- wrmsr(MSR_K7_EVNTSEL0, 0, 0);
- break;
- case X86_VENDOR_INTEL:
- if (boot_cpu_data.x86 == 15) {
- wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
- wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
- }
- break;
- }
- nmi_active = -1;
- /* tell do_nmi() and others that we're not active any more */
- nmi_watchdog = 0;
-}
-static void enable_lapic_nmi_watchdog(void)
-{
- if (nmi_active < 0) {
- nmi_watchdog = NMI_LOCAL_APIC;
- touch_nmi_watchdog();
- setup_apic_nmi_watchdog();
- }
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
+
+ BUG_ON(atomic_read(&nmi_active) != 0);
}
-int reserve_lapic_nmi(void)
+static void enable_lapic_nmi_watchdog(void)
{
- unsigned int old_owner;
+ BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
- spin_lock(&lapic_nmi_owner_lock);
- old_owner = lapic_nmi_owner;
- lapic_nmi_owner |= LAPIC_NMI_RESERVED;
- spin_unlock(&lapic_nmi_owner_lock);
- if (old_owner & LAPIC_NMI_RESERVED)
- return -EBUSY;
- if (old_owner & LAPIC_NMI_WATCHDOG)
- disable_lapic_nmi_watchdog();
- return 0;
-}
+ /* are we already enabled */
+ if (atomic_read(&nmi_active) != 0)
+ return;
-void release_lapic_nmi(void)
-{
- unsigned int new_owner;
+ /* are we lapic aware */
+ if (nmi_known_cpu() <= 0)
+ return;
- spin_lock(&lapic_nmi_owner_lock);
- new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
- lapic_nmi_owner = new_owner;
- spin_unlock(&lapic_nmi_owner_lock);
- if (new_owner & LAPIC_NMI_WATCHDOG)
- enable_lapic_nmi_watchdog();
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+ touch_nmi_watchdog();
}
void disable_timer_nmi_watchdog(void)
{
- if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
+ BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+ if (atomic_read(&nmi_active) <= 0)
return;
disable_irq(0);
- unset_nmi_callback();
- nmi_active = -1;
- nmi_watchdog = NMI_NONE;
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
+
+ BUG_ON(atomic_read(&nmi_active) != 0);
}
void enable_timer_nmi_watchdog(void)
{
- if (nmi_active < 0) {
- nmi_watchdog = NMI_IO_APIC;
+ BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+ if (atomic_read(&nmi_active) == 0) {
touch_nmi_watchdog();
- nmi_active = 1;
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
enable_irq(0);
}
}
@@ -294,15 +364,20 @@ static int nmi_pm_active; /* nmi_active before suspend */
static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
- nmi_pm_active = nmi_active;
- disable_lapic_nmi_watchdog();
+ /* only CPU0 goes here, other CPUs should be offline */
+ nmi_pm_active = atomic_read(&nmi_active);
+ stop_apic_nmi_watchdog(NULL);
+ BUG_ON(atomic_read(&nmi_active) != 0);
return 0;
}
static int lapic_nmi_resume(struct sys_device *dev)
{
- if (nmi_pm_active > 0)
- enable_lapic_nmi_watchdog();
+ /* only CPU0 goes here, other CPUs should be offline */
+ if (nmi_pm_active > 0) {
+ setup_apic_nmi_watchdog(NULL);
+ touch_nmi_watchdog();
+ }
return 0;
}
@@ -321,7 +396,13 @@ static int __init init_lapic_nmi_sysfs(void)
{
int error;
- if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
+ /* should really be a BUG_ON but b/c this is an
+ * init call, it just doesn't work. -dcz
+ */
+ if (nmi_watchdog != NMI_LOCAL_APIC)
+ return 0;
+
+ if ( atomic_read(&nmi_active) < 0 )
return 0;
error = sysdev_class_register(&nmi_sysclass);
@@ -339,106 +420,332 @@ late_initcall(init_lapic_nmi_sysfs);
* Original code written by Keith Owens.
*/
-static void clear_msr_range(unsigned int base, unsigned int n)
-{
- unsigned int i;
+/* Note that these events don't tick when the CPU idles. This means
+ the frequency varies with CPU load. */
- for(i = 0; i < n; ++i)
- wrmsr(base+i, 0, 0);
-}
+#define K7_EVNTSEL_ENABLE (1 << 22)
+#define K7_EVNTSEL_INT (1 << 20)
+#define K7_EVNTSEL_OS (1 << 17)
+#define K7_EVNTSEL_USR (1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
+#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-static void setup_k7_watchdog(void)
+static int setup_k7_watchdog(void)
{
- int i;
+ unsigned int perfctr_msr, evntsel_msr;
unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- nmi_perfctr_msr = MSR_K7_PERFCTR0;
+ perfctr_msr = MSR_K7_PERFCTR0;
+ evntsel_msr = MSR_K7_EVNTSEL0;
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
- for(i = 0; i < 4; ++i) {
- /* Simulator may not support it */
- if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) {
- nmi_perfctr_msr = 0;
- return;
- }
- wrmsrl(MSR_K7_PERFCTR0+i, 0UL);
- }
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ /* Simulator may not support it */
+ if (checking_wrmsrl(evntsel_msr, 0UL))
+ goto fail2;
+ wrmsrl(perfctr_msr, 0UL);
evntsel = K7_EVNTSEL_INT
| K7_EVNTSEL_OS
| K7_EVNTSEL_USR
| K7_NMI_EVENT;
- wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
- wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz));
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd->check_bit = 1ULL<<63;
+ return 1;
+fail2:
+ release_evntsel_nmi(evntsel_msr);
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
}
+static void stop_k7_watchdog(void)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
+}
+
+/* Note that these events don't tick when the CPU idles. This means
+ the frequency varies with CPU load. */
+
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
+#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
+#define P4_ESCR_OS (1<<3)
+#define P4_ESCR_USR (1<<2)
+#define P4_CCCR_OVF_PMI0 (1<<26)
+#define P4_CCCR_OVF_PMI1 (1<<27)
+#define P4_CCCR_THRESHOLD(N) ((N)<<20)
+#define P4_CCCR_COMPLEMENT (1<<19)
+#define P4_CCCR_COMPARE (1<<18)
+#define P4_CCCR_REQUIRED (3<<16)
+#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
+#define P4_CCCR_ENABLE (1<<12)
+#define P4_CCCR_OVF (1<<31)
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ CRU_ESCR0 (with any non-null event selector) through a complemented
+ max threshold. [IA32-Vol3, Section 14.9.9] */
static int setup_p4_watchdog(void)
{
+ unsigned int perfctr_msr, evntsel_msr, cccr_msr;
+ unsigned int evntsel, cccr_val;
unsigned int misc_enable, dummy;
+ unsigned int ht_num;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
+ rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
return 0;
- nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
- nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
#ifdef CONFIG_SMP
- if (smp_num_siblings == 2)
- nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
+ /* detect which hyperthread we are on */
+ if (smp_num_siblings == 2) {
+ unsigned int ebx, apicid;
+
+ ebx = cpuid_ebx(1);
+ apicid = (ebx >> 24) & 0xff;
+ ht_num = apicid & 1;
+ } else
#endif
-
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
- clear_msr_range(0x3F1, 2);
- /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
- docs doesn't fully define it, so leave it alone for now. */
- if (boot_cpu_data.x86_model >= 0x3) {
- /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
- clear_msr_range(0x3A0, 26);
- clear_msr_range(0x3BC, 3);
+ ht_num = 0;
+
+ /* performance counters are shared resources
+ * assign each hyperthread its own set
+ * (re-use the ESCR0 register, seems safe
+ * and keeps the cccr_val the same)
+ */
+ if (!ht_num) {
+ /* logical cpu 0 */
+ perfctr_msr = MSR_P4_IQ_PERFCTR0;
+ evntsel_msr = MSR_P4_CRU_ESCR0;
+ cccr_msr = MSR_P4_IQ_CCCR0;
+ cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
} else {
- clear_msr_range(0x3A0, 31);
+ /* logical cpu 1 */
+ perfctr_msr = MSR_P4_IQ_PERFCTR1;
+ evntsel_msr = MSR_P4_CRU_ESCR0;
+ cccr_msr = MSR_P4_IQ_CCCR1;
+ cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
}
- clear_msr_range(0x3C0, 6);
- clear_msr_range(0x3C8, 6);
- clear_msr_range(0x3E0, 2);
- clear_msr_range(MSR_P4_CCCR0, 18);
- clear_msr_range(MSR_P4_PERFCTR0, 18);
-
- wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
- Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz));
- wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz));
+
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
+
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ evntsel = P4_ESCR_EVENT_SELECT(0x3F)
+ | P4_ESCR_OS
+ | P4_ESCR_USR;
+
+ cccr_val |= P4_CCCR_THRESHOLD(15)
+ | P4_CCCR_COMPLEMENT
+ | P4_CCCR_COMPARE
+ | P4_CCCR_REQUIRED;
+
+ wrmsr(evntsel_msr, evntsel, 0);
+ wrmsr(cccr_msr, cccr_val, 0);
+ wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
apic_write(APIC_LVTPC, APIC_DM_NMI);
- wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
+ cccr_val |= P4_CCCR_ENABLE;
+ wrmsr(cccr_msr, cccr_val, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = cccr_msr;
+ wd->check_bit = 1ULL<<39;
return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
}
-void setup_apic_nmi_watchdog(void)
+static void stop_p4_watchdog(void)
{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 15)
- return;
- if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
- return;
- setup_k7_watchdog();
- break;
- case X86_VENDOR_INTEL:
- if (boot_cpu_data.x86 != 15)
- return;
- if (!setup_p4_watchdog())
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ wrmsr(wd->cccr_msr, 0, 0);
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
+}
+
+#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
+static int setup_intel_arch_watchdog(void)
+{
+ unsigned int ebx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int perfctr_msr, evntsel_msr;
+ unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Unhalted Core Cycles Event or not.
+ * NOTE: Corresponding bit = 0 in ebx indicates event present.
+ */
+ cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+ (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+ goto fail;
+
+ perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
+ evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
+
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
+
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ wrmsrl(perfctr_msr, 0UL);
+
+ evntsel = ARCH_PERFMON_EVENTSEL_INT
+ | ARCH_PERFMON_EVENTSEL_OS
+ | ARCH_PERFMON_EVENTSEL_USR
+ | ARCH_PERFMON_NMI_EVENT_SEL
+ | ARCH_PERFMON_NMI_EVENT_UMASK;
+
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd->check_bit = 1ULL << (eax.split.bit_width - 1);
+ return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
+}
+
+static void stop_intel_arch_watchdog(void)
+{
+ unsigned int ebx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Unhalted Core Cycles Event or not.
+ * NOTE: Corresponding bit = 0 in ebx indicates event present.
+ */
+ cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+ (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+ return;
+
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
+}
+
+void setup_apic_nmi_watchdog(void *unused)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /* only support LOCAL and IO APICs for now */
+ if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+ (nmi_watchdog != NMI_IO_APIC))
+ return;
+
+ if (wd->enabled == 1)
+ return;
+
+ /* cheap hack to support suspend/resume */
+ /* if cpu0 is not active neither should the other cpus */
+ if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
+ return;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
+ return;
+ if (!setup_k7_watchdog())
+ return;
+ break;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ if (!setup_intel_arch_watchdog())
+ return;
+ break;
+ }
+ if (!setup_p4_watchdog())
+ return;
+ break;
+ default:
return;
- break;
+ }
+ }
+ wd->enabled = 1;
+ atomic_inc(&nmi_active);
+}
+
+void stop_apic_nmi_watchdog(void *unused)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /* only support LOCAL and IO APICs for now */
+ if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+ (nmi_watchdog != NMI_IO_APIC))
+ return;
- default:
+ if (wd->enabled == 0)
return;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (strstr(boot_cpu_data.x86_model_id, "Screwdriver"))
+ return;
+ stop_k7_watchdog();
+ break;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ stop_intel_arch_watchdog();
+ break;
+ }
+ stop_p4_watchdog();
+ break;
+ default:
+ return;
+ }
}
- lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
- nmi_active = 1;
+ wd->enabled = 0;
+ atomic_dec(&nmi_active);
}
/*
@@ -471,83 +778,108 @@ void touch_nmi_watchdog (void)
touch_softlockup_watchdog();
}
-void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
{
int sum;
int touched = 0;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+ u64 dummy;
+ int rc=0;
+
+ /* check for other users first */
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP) {
+ rc = 1;
+ touched = 1;
+ }
sum = read_pda(apic_timer_irqs);
if (__get_cpu_var(nmi_touch)) {
__get_cpu_var(nmi_touch) = 0;
touched = 1;
}
+
#ifdef CONFIG_X86_MCE
/* Could check oops_in_progress here too, but it's safer
not too */
if (atomic_read(&mce_entry) > 0)
touched = 1;
#endif
+ /* if the apic timer isn't firing, this cpu isn't doing much */
if (!touched && __get_cpu_var(last_irq_sum) == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ...
*/
local_inc(&__get_cpu_var(alert_counter));
- if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP) {
- local_set(&__get_cpu_var(alert_counter), 0);
- return;
- }
- die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs);
- }
+ if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz)
+ die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs,
+ panic_on_timeout);
} else {
__get_cpu_var(last_irq_sum) = sum;
local_set(&__get_cpu_var(alert_counter), 0);
}
- if (nmi_perfctr_msr) {
- if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- }
- wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+
+ /* see if the nmi watchdog went off */
+ if (wd->enabled) {
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ rdmsrl(wd->perfctr_msr, dummy);
+ if (dummy & wd->check_bit){
+ /* this wasn't a watchdog timer interrupt */
+ goto done;
+ }
+
+ /* only Intel uses the cccr msr */
+ if (wd->cccr_msr != 0) {
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ rdmsrl(wd->cccr_msr, dummy);
+ dummy &= ~P4_CCCR_OVF;
+ wrmsrl(wd->cccr_msr, dummy);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
+ /*
+ * ArchPerfmon/Core Duo needs to re-unmask
+ * the apic vector
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ }
+ /* start the cycle over again */
+ wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));
+ rc = 1;
+ } else if (nmi_watchdog == NMI_IO_APIC) {
+ /* don't know how to accurately check for this.
+ * just assume it was a watchdog timer interrupt
+ * This matches the old behaviour.
+ */
+ rc = 1;
+ } else
+ printk(KERN_WARNING "Unknown enabled NMI hardware?!\n");
}
+done:
+ return rc;
}
-static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
- return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
{
- int cpu = safe_smp_processor_id();
-
nmi_enter();
add_pda(__nmi_count,1);
- if (!rcu_dereference(nmi_callback)(regs, cpu))
- default_do_nmi(regs);
+ default_do_nmi(regs);
nmi_exit();
}
-void set_nmi_callback(nmi_callback_t callback)
+int do_nmi_callback(struct pt_regs * regs, int cpu)
{
- vmalloc_sync_all();
- rcu_assign_pointer(nmi_callback, callback);
-}
-
-void unset_nmi_callback(void)
-{
- nmi_callback = dummy_nmi_callback;
+#ifdef CONFIG_SYSCTL
+ if (unknown_nmi_panic)
+ return unknown_nmi_panic_callback(regs, cpu);
+#endif
+ return 0;
}
#ifdef CONFIG_SYSCTL
@@ -557,36 +889,42 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
unsigned char reason = get_nmi_reason();
char buf[64];
- if (!(reason & 0xc0)) {
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(buf,regs);
- }
+ sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+ die_nmi(buf, regs, 1); /* Always panic here */
return 0;
}
/*
- * proc handler for /proc/sys/kernel/unknown_nmi_panic
+ * proc handler for /proc/sys/kernel/nmi
*/
-int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file,
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
void __user *buffer, size_t *length, loff_t *ppos)
{
int old_state;
- old_state = unknown_nmi_panic;
+ nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+ old_state = nmi_watchdog_enabled;
proc_dointvec(table, write, file, buffer, length, ppos);
- if (!!old_state == !!unknown_nmi_panic)
+ if (!!old_state == !!nmi_watchdog_enabled)
return 0;
- if (unknown_nmi_panic) {
- if (reserve_lapic_nmi() < 0) {
- unknown_nmi_panic = 0;
- return -EBUSY;
- } else {
- set_nmi_callback(unknown_nmi_panic_callback);
- }
+ if (atomic_read(&nmi_active) < 0) {
+ printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+ return -EIO;
+ }
+
+ /* if nmi_watchdog is not set yet, then set it */
+ nmi_watchdog_default();
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ if (nmi_watchdog_enabled)
+ enable_lapic_nmi_watchdog();
+ else
+ disable_lapic_nmi_watchdog();
} else {
- release_lapic_nmi();
- unset_nmi_callback();
+ printk( KERN_WARNING
+ "NMI watchdog doesn't know what hardware to touch\n");
+ return -EIO;
}
return 0;
}
@@ -595,8 +933,12 @@ int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
-EXPORT_SYMBOL(reserve_lapic_nmi);
-EXPORT_SYMBOL(release_lapic_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+EXPORT_SYMBOL(release_perfctr_nmi);
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+EXPORT_SYMBOL(release_evntsel_nmi);
EXPORT_SYMBOL(disable_timer_nmi_watchdog);
EXPORT_SYMBOL(enable_timer_nmi_watchdog);
EXPORT_SYMBOL(touch_nmi_watchdog);
diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c
new file mode 100644
index 000000000000..cfb09b07ae99
--- /dev/null
+++ b/arch/x86_64/kernel/pci-calgary.c
@@ -0,0 +1,1069 @@
+/*
+ * Derived from arch/powerpc/kernel/iommu.c
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/dma-mapping.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/pci_ids.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <asm/proto.h>
+#include <asm/calgary.h>
+#include <asm/tce.h>
+#include <asm/pci-direct.h>
+#include <asm/system.h>
+#include <asm/dma.h>
+
+#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1
+#define PCI_VENDOR_DEVICE_ID_CALGARY \
+ (PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16)
+
+/* we need these for register space address calculation */
+#define START_ADDRESS 0xfe000000
+#define CHASSIS_BASE 0
+#define ONE_BASED_CHASSIS_NUM 1
+
+/* register offsets inside the host bridge space */
+#define PHB_CSR_OFFSET 0x0110
+#define PHB_PLSSR_OFFSET 0x0120
+#define PHB_CONFIG_RW_OFFSET 0x0160
+#define PHB_IOBASE_BAR_LOW 0x0170
+#define PHB_IOBASE_BAR_HIGH 0x0180
+#define PHB_MEM_1_LOW 0x0190
+#define PHB_MEM_1_HIGH 0x01A0
+#define PHB_IO_ADDR_SIZE 0x01B0
+#define PHB_MEM_1_SIZE 0x01C0
+#define PHB_MEM_ST_OFFSET 0x01D0
+#define PHB_AER_OFFSET 0x0200
+#define PHB_CONFIG_0_HIGH 0x0220
+#define PHB_CONFIG_0_LOW 0x0230
+#define PHB_CONFIG_0_END 0x0240
+#define PHB_MEM_2_LOW 0x02B0
+#define PHB_MEM_2_HIGH 0x02C0
+#define PHB_MEM_2_SIZE_HIGH 0x02D0
+#define PHB_MEM_2_SIZE_LOW 0x02E0
+#define PHB_DOSHOLE_OFFSET 0x08E0
+
+/* PHB_CONFIG_RW */
+#define PHB_TCE_ENABLE 0x20000000
+#define PHB_SLOT_DISABLE 0x1C000000
+#define PHB_DAC_DISABLE 0x01000000
+#define PHB_MEM2_ENABLE 0x00400000
+#define PHB_MCSR_ENABLE 0x00100000
+/* TAR (Table Address Register) */
+#define TAR_SW_BITS 0x0000ffffffff800fUL
+#define TAR_VALID 0x0000000000000008UL
+/* CSR (Channel/DMA Status Register) */
+#define CSR_AGENT_MASK 0xffe0ffff
+
+#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
+#define MAX_NUM_CHASSIS 8 /* max number of chassis */
+/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
+#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
+#define PHBS_PER_CALGARY 4
+
+/* register offsets in Calgary's internal register space */
+static const unsigned long tar_offsets[] = {
+ 0x0580 /* TAR0 */,
+ 0x0588 /* TAR1 */,
+ 0x0590 /* TAR2 */,
+ 0x0598 /* TAR3 */
+};
+
+static const unsigned long split_queue_offsets[] = {
+ 0x4870 /* SPLIT QUEUE 0 */,
+ 0x5870 /* SPLIT QUEUE 1 */,
+ 0x6870 /* SPLIT QUEUE 2 */,
+ 0x7870 /* SPLIT QUEUE 3 */
+};
+
+static const unsigned long phb_offsets[] = {
+ 0x8000 /* PHB0 */,
+ 0x9000 /* PHB1 */,
+ 0xA000 /* PHB2 */,
+ 0xB000 /* PHB3 */
+};
+
+unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED;
+static int translate_empty_slots __read_mostly = 0;
+static int calgary_detected __read_mostly = 0;
+
+/* Per-bus bookkeeping; bus_info[] below is indexed by PCI bus number. */
+struct calgary_bus_info {
+ /* TCE table memory for this bus, assigned in detect_calgary() */
+ void *tce_space;
+ /* set to 1 by the "calgary=disable=<bus>" boot option */
+ unsigned char translation_disabled;
+ /* PHB index within its Calgary chip, or -1 when none was assigned */
+ signed char phbid;
+};
+
+static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
+
+static void tce_cache_blast(struct iommu_table *tbl);
+
+/* enable this to stress test the chip's TCE cache */
+#ifdef CONFIG_IOMMU_DEBUG
+int debugging __read_mostly = 1;
+
+/*
+ * Walk bits [start, end) of bitmap and return the index of the first bit
+ * whose value differs from expected, or ~0UL when every bit matches.
+ */
+static inline unsigned long verify_bit_range(unsigned long* bitmap,
+	int expected, unsigned long start, unsigned long end)
+{
+	unsigned long bit;
+
+	BUG_ON(start >= end);
+
+	for (bit = start; bit < end; bit++) {
+		if (!!test_bit(bit, bitmap) != expected)
+			return bit;
+	}
+
+	/* no mismatch anywhere in the range */
+	return ~0UL;
+}
+#else /* debugging is disabled */
+int debugging __read_mostly = 0;
+
+/* Without CONFIG_IOMMU_DEBUG nothing is checked: always report "all ok". */
+static inline unsigned long verify_bit_range(unsigned long* bitmap,
+	int expected, unsigned long start, unsigned long end)
+{
+	return ~0UL;
+}
+#endif /* CONFIG_IOMMU_DEBUG */
+
+/*
+ * Number of pages touched by the byte range [dma, dma + dmalen): the
+ * distance from the page containing dma to the page-aligned end.
+ */
+static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
+{
+	unsigned long first_page = dma & PAGE_MASK;
+	unsigned long aligned_end = PAGE_ALIGN(dma + dmalen);
+	unsigned int span = aligned_end - first_page;
+
+	return span >> PAGE_SHIFT;
+}
+
+/* Return 1 when translation is enabled for the bus dev sits on, else 0. */
+static inline int translate_phb(struct pci_dev* dev)
+{
+	return !bus_info[dev->bus->number].translation_disabled;
+}
+
+/*
+ * Mark the IOMMU entries covering npages pages starting at start_addr as
+ * allocated so the allocator never hands them out.
+ *
+ * Requests that start beyond the table are ignored; requests that run past
+ * the end of the table are truncated to the table size.
+ */
+static void iommu_range_reserve(struct iommu_table *tbl,
+	unsigned long start_addr, unsigned int npages)
+{
+	unsigned long index;
+	unsigned long end;
+	unsigned long badbit;
+
+	index = start_addr >> PAGE_SHIFT;
+
+	/* bail out if we're asked to reserve a region we don't cover */
+	if (index >= tbl->it_size)
+		return;
+
+	end = index + npages;
+	if (end > tbl->it_size) /* don't go off the table */
+		end = tbl->it_size;
+
+	badbit = verify_bit_range(tbl->it_map, 0, index, end);
+	if (badbit != ~0UL) {
+		if (printk_ratelimit())
+			printk(KERN_ERR "Calgary: entry already allocated at "
+			       "0x%lx tbl %p dma 0x%lx npages %u\n",
+			       badbit, tbl, start_addr, npages);
+	}
+
+	/*
+	 * Set only the clamped range: using the caller's npages here would
+	 * write past the end of it_map when the region overruns the table.
+	 */
+	set_bit_string(tbl->it_map, index, end - index);
+}
+
+/*
+ * Find and claim npages consecutive free entries in tbl's bitmap.
+ *
+ * The search starts at the table's hint; if that fails the TCE cache is
+ * blasted and the search restarts from entry 0. Returns the first entry
+ * of the claimed run, or bad_dma_address when the table is full (after
+ * panicking if panic_on_overflow is set).
+ */
+static unsigned long iommu_range_alloc(struct iommu_table *tbl,
+	unsigned int npages)
+{
+	unsigned long slot;
+
+	BUG_ON(npages == 0);
+
+	slot = find_next_zero_string(tbl->it_map, tbl->it_hint,
+				     tbl->it_size, npages);
+	if (slot == ~0UL) {
+		/* nothing past the hint: flush and rescan from the start */
+		tce_cache_blast(tbl);
+		slot = find_next_zero_string(tbl->it_map, 0,
+					     tbl->it_size, npages);
+	}
+
+	if (slot == ~0UL) {
+		printk(KERN_WARNING "Calgary: IOMMU full.\n");
+		if (panic_on_overflow)
+			panic("Calgary: fix the allocator.\n");
+		return bad_dma_address;
+	}
+
+	set_bit_string(tbl->it_map, slot, npages);
+	tbl->it_hint = slot + npages;
+	BUG_ON(tbl->it_hint > tbl->it_size);
+
+	return slot;
+}
+
+/*
+ * Allocate npages table entries under the table lock, program the TCEs
+ * for vaddr and return the resulting DMA address (entry number combined
+ * with vaddr's offset within its page). Returns bad_dma_address when no
+ * entries are available.
+ */
+static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr,
+	unsigned int npages, int direction)
+{
+	unsigned long slot, irqflags;
+	dma_addr_t handle;
+
+	spin_lock_irqsave(&tbl->it_lock, irqflags);
+
+	slot = iommu_range_alloc(tbl, npages);
+	if (unlikely(slot == bad_dma_address)) {
+		spin_unlock_irqrestore(&tbl->it_lock, irqflags);
+		printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
+		       "iommu %p\n", npages, tbl);
+		return bad_dma_address;
+	}
+
+	/* DMA address = table entry plus offset inside the first page */
+	handle = (slot << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
+
+	/* put the TCEs in the HW table */
+	tce_build(tbl, slot, npages, (unsigned long)vaddr & PAGE_MASK,
+		  direction);
+
+	spin_unlock_irqrestore(&tbl->it_lock, irqflags);
+
+	return handle;
+}
+
+/*
+ * Release npages table entries starting at the entry that maps dma_addr:
+ * clear the TCEs, then clear the bitmap bits. Takes no lock itself;
+ * iommu_free() and calgary_unmap_sg() call it with tbl->it_lock held.
+ */
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+	unsigned int npages)
+{
+	unsigned long first = dma_addr >> PAGE_SHIFT;
+	unsigned long mismatch;
+
+	BUG_ON(first + npages > tbl->it_size);
+
+	tce_free(tbl, first, npages);
+
+	/* every bit being freed is expected to be set */
+	mismatch = verify_bit_range(tbl->it_map, 1, first, first + npages);
+	if (mismatch != ~0UL && printk_ratelimit())
+		printk(KERN_ERR "Calgary: bit is off at 0x%lx "
+		       "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
+		       mismatch, tbl, dma_addr, first, npages);
+
+	__clear_bit_string(tbl->it_map, first, npages);
+}
+
+/* Locked wrapper: run __iommu_free() with tbl->it_lock held, IRQs off. */
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+	unsigned int npages)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&tbl->it_lock, irqflags);
+	__iommu_free(tbl, dma_addr, npages);
+	spin_unlock_irqrestore(&tbl->it_lock, irqflags);
+}
+
+/*
+ * Free the table entries of up to nelems scatterlist elements, stopping
+ * early at the first element whose dma_length is 0.
+ */
+static void __calgary_unmap_sg(struct iommu_table *tbl,
+	struct scatterlist *sglist, int nelems, int direction)
+{
+	struct scatterlist *s;
+
+	for (s = sglist; nelems--; s++) {
+		dma_addr_t addr = s->dma_address;
+		unsigned int len = s->dma_length;
+
+		/* a zero length marks the end of the mapped elements */
+		if (len == 0)
+			break;
+
+		__iommu_free(tbl, addr, num_dma_pages(addr, len));
+	}
+}
+
+/*
+ * unmap_sg DMA op: release the mappings of a scatterlist. Does nothing
+ * when translation is disabled for the device's bus.
+ */
+void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
+	int nelems, int direction)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct iommu_table *tbl = pdev->bus->self->sysdata;
+	unsigned long irqflags;
+
+	if (translate_phb(pdev)) {
+		spin_lock_irqsave(&tbl->it_lock, irqflags);
+		__calgary_unmap_sg(tbl, sglist, nelems, direction);
+		spin_unlock_irqrestore(&tbl->it_lock, irqflags);
+	}
+}
+
+/*
+ * Map a scatterlist for a bus without translation: each element's DMA
+ * address is simply the bus address of its page plus offset. Returns
+ * nelems.
+ */
+static int calgary_nontranslate_map_sg(struct device* dev,
+	struct scatterlist *sg, int nelems, int direction)
+{
+	struct scatterlist *s;
+	int i;
+
+	for (i = 0, s = sg; i < nelems; i++, s++) {
+		BUG_ON(!s->page);
+		s->dma_address = virt_to_bus(page_address(s->page) +s->offset);
+		s->dma_length = s->length;
+	}
+
+	return nelems;
+}
+
+/*
+ * map_sg DMA op: allocate IOMMU entries and build TCEs for every element
+ * of sg. Buses with translation disabled are handed to
+ * calgary_nontranslate_map_sg(). Returns nelems on success; on allocation
+ * failure everything mapped so far is undone, every element is marked
+ * with bad_dma_address / length 0, and 0 is returned.
+ */
+int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+ int nelems, int direction)
+{
+ struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+ unsigned long flags;
+ unsigned long vaddr;
+ unsigned int npages;
+ unsigned long entry;
+ int i;
+
+ if (!translate_phb(to_pci_dev(dev)))
+ return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
+
+ /* the whole list is mapped under a single hold of the table lock */
+ spin_lock_irqsave(&tbl->it_lock, flags);
+
+ for (i = 0; i < nelems; i++ ) {
+ struct scatterlist *s = &sg[i];
+ BUG_ON(!s->page);
+
+ vaddr = (unsigned long)page_address(s->page) + s->offset;
+ npages = num_dma_pages(vaddr, s->length);
+
+ entry = iommu_range_alloc(tbl, npages);
+ if (entry == bad_dma_address) {
+ /* makes sure unmap knows to stop */
+ s->dma_length = 0;
+ goto error;
+ }
+
+ s->dma_address = (entry << PAGE_SHIFT) | s->offset;
+
+ /* insert into HW table */
+ tce_build(tbl, entry, npages, vaddr & PAGE_MASK,
+ direction);
+
+ s->dma_length = s->length;
+ }
+
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
+
+ return nelems;
+error:
+ /*
+ * Still holding the lock: free the elements mapped before the failure
+ * (the unmap helper stops at the zero-length element set above), then
+ * invalidate every element so callers cannot use stale addresses.
+ */
+ __calgary_unmap_sg(tbl, sg, nelems, direction);
+ for (i = 0; i < nelems; i++) {
+ sg[i].dma_address = bad_dma_address;
+ sg[i].dma_length = 0;
+ }
+ spin_unlock_irqrestore(&tbl->it_lock, flags);
+ return 0;
+}
+
+/*
+ * map_single DMA op: map size bytes at vaddr for DMA. With translation
+ * enabled the range goes through the IOMMU table (which may yield
+ * bad_dma_address); otherwise the plain bus address is returned.
+ */
+dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
+	size_t size, int direction)
+{
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+	unsigned int npages = num_dma_pages((unsigned long)vaddr, size);
+
+	if (!translate_phb(to_pci_dev(dev)))
+		return virt_to_bus(vaddr);
+
+	return iommu_alloc(tbl, vaddr, npages, direction);
+}
+
+/*
+ * unmap_single DMA op: release the table entries behind dma_handle.
+ * Does nothing when translation is disabled for the device's bus.
+ */
+void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
+	size_t size, int direction)
+{
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+
+	if (translate_phb(to_pci_dev(dev))) {
+		unsigned int npages = num_dma_pages(dma_handle, size);
+
+		iommu_free(tbl, dma_handle, npages);
+	}
+}
+
+/*
+ * alloc_coherent DMA op: allocate zeroed pages covering size bytes and
+ * store their DMA address in *dma_handle. Returns the kernel virtual
+ * address, or NULL when either the page allocation or the IOMMU mapping
+ * fails.
+ */
+void* calgary_alloc_coherent(struct device *dev, size_t size,
+	dma_addr_t *dma_handle, gfp_t flag)
+{
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+	unsigned int npages, order;
+	dma_addr_t handle;
+	void *buf;
+
+	size = PAGE_ALIGN(size); /* size rounded up to full pages */
+	npages = size >> PAGE_SHIFT;
+	order = get_order(size);
+
+	/* alloc enough pages (and possibly more) */
+	buf = (void *)__get_free_pages(flag, order);
+	if (!buf)
+		return NULL;
+	memset(buf, 0, size);
+
+	if (!translate_phb(to_pci_dev(dev))) {
+		/* non translated slot */
+		*dma_handle = virt_to_bus(buf);
+		return buf;
+	}
+
+	/* set up tces to cover the allocated range */
+	handle = iommu_alloc(tbl, buf, npages, DMA_BIDIRECTIONAL);
+	if (handle == bad_dma_address) {
+		free_pages((unsigned long)buf, order);
+		return NULL;
+	}
+
+	*dma_handle = handle;
+	return buf;
+}
+
+/* DMA operations table; calgary_iommu_init() installs it as dma_ops. */
+static struct dma_mapping_ops calgary_dma_ops = {
+ .alloc_coherent = calgary_alloc_coherent,
+ .map_single = calgary_map_single,
+ .unmap_single = calgary_unmap_single,
+ .map_sg = calgary_map_sg,
+ .unmap_sg = calgary_unmap_sg,
+};
+
+/* Look up the PHB id recorded for PCI bus number num. */
+static inline int busno_to_phbid(unsigned char num)
+{
+	const struct calgary_bus_info *info = &bus_info[num];
+
+	return info->phbid;
+}
+
+/* Register offset of the split queue belonging to bus num's PHB. */
+static inline unsigned long split_queue_offset(unsigned char num)
+{
+	return split_queue_offsets[(size_t)busno_to_phbid(num)];
+}
+
+/* Register offset of the TAR belonging to bus num's PHB. */
+static inline unsigned long tar_offset(unsigned char num)
+{
+	return tar_offsets[(size_t)busno_to_phbid(num)];
+}
+
+/* Base offset of the register block belonging to bus num's PHB. */
+static inline unsigned long phb_offset(unsigned char num)
+{
+	return phb_offsets[(size_t)busno_to_phbid(num)];
+}
+
+/* Form a register address by OR-ing offset into the mapped base bar. */
+static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset)
+{
+	return (void __iomem*)(((unsigned long)bar) | offset);
+}
+
+/*
+ * Invalidate the TCE cache of the PHB behind tbl.
+ *
+ * Sequence: save and clear the AER register to stop arbitration, read
+ * PLSSR so the write is known to have landed, poll the split queue (at
+ * most 100 reads) until its low byte reads 0xff, rewrite the TAR with
+ * tbl->tar_val, then restore AER and read it back to flush.
+ */
+static void tce_cache_blast(struct iommu_table *tbl)
+{
+	u64 val;
+	u32 aer;
+	int i = 0;
+	void __iomem *bbar = tbl->bbar;
+	void __iomem *target;
+
+	/* disable arbitration on the bus */
+	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
+	aer = readl(target);
+	writel(0, target);
+
+	/* read plssr to ensure it got there */
+	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET);
+	val = readl(target);
+
+	/* poll split queues until all DMA activity is done */
+	target = calgary_reg(bbar, split_queue_offset(tbl->it_busno));
+	do {
+		val = readq(target);
+		i++;
+	} while ((val & 0xff) != 0xff && i < 100);
+
+	/*
+	 * Warn only when the queue really is still busy. Testing the loop
+	 * counter instead would also warn when the final poll succeeded.
+	 */
+	if ((val & 0xff) != 0xff)
+		printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
+		       "continuing anyway\n");
+
+	/* invalidate TCE cache */
+	target = calgary_reg(bbar, tar_offset(tbl->it_busno));
+	writeq(tbl->tar_val, target);
+
+	/* enable arbitration */
+	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET);
+	writel(aer, target);
+	(void)readl(target); /* flush */
+}
+
+/*
+ * Reserve the IOMMU entries for [start, limit], after rounding limit up
+ * to the next 1MB boundary.
+ */
+static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start,
+	u64 limit)
+{
+	u64 end = (limit | 0xfffff) + 1;
+	unsigned int numpages = (end - start) >> PAGE_SHIFT;
+
+	iommu_range_reserve(dev->sysdata, start, numpages);
+}
+
+/*
+ * Read this PHB's MEM_1 LOW/HIGH/SIZE registers (big-endian, 32 bits
+ * each) and reserve the described range in the IOMMU bitmap.
+ */
+static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev)
+{
+ void __iomem *target;
+ u64 low, high, sizelow;
+ u64 start, limit;
+ struct iommu_table *tbl = dev->sysdata;
+ unsigned char busnum = dev->bus->number;
+ void __iomem *bbar = tbl->bbar;
+
+ /* peripheral MEM_1 region */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW);
+ low = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH);
+ high = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE);
+ sizelow = be32_to_cpu(readl(target));
+
+ start = (high << 32) | low;
+ /* NOTE(review): the SIZE register value is passed on as the region's
+ * limit address, not added to start - confirm against the chip spec */
+ limit = sizelow;
+
+ calgary_reserve_mem_region(dev, start, limit);
+}
+
+/*
+ * If PHB_MEM2_ENABLE is set in this PHB's config register, read the
+ * MEM_2 LOW/HIGH and SIZE_LOW/SIZE_HIGH registers (big-endian, 32 bits
+ * each) and reserve the described range in the IOMMU bitmap.
+ */
+static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev)
+{
+ void __iomem *target;
+ u32 val32;
+ u64 low, high, sizelow, sizehigh;
+ u64 start, limit;
+ struct iommu_table *tbl = dev->sysdata;
+ unsigned char busnum = dev->bus->number;
+ void __iomem *bbar = tbl->bbar;
+
+ /* is it enabled? */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ if (!(val32 & PHB_MEM2_ENABLE))
+ return;
+
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW);
+ low = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH);
+ high = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW);
+ sizelow = be32_to_cpu(readl(target));
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH);
+ sizehigh = be32_to_cpu(readl(target));
+
+ /* combine the 32-bit halves into 64-bit values */
+ start = (high << 32) | low;
+ /* NOTE(review): as with MEM_1, the size registers are used as the
+ * limit address of the region - confirm against the chip spec */
+ limit = (sizehigh << 32) | sizelow;
+
+ calgary_reserve_mem_region(dev, start, limit);
+}
+
+/*
+ * some regions of the IO address space do not get translated, so we
+ * must not give devices IO addresses in those regions. The regions
+ * are the 640KB-1MB region and the two PCI peripheral memory holes.
+ * Reserve all of them in the IOMMU bitmap to avoid giving them out
+ * later. The entry covering bad_dma_address is reserved too, so a
+ * successful mapping can never be mistaken for a failure.
+ */
+static void __init calgary_reserve_regions(struct pci_dev *dev)
+{
+	unsigned int npages;
+	u64 start;
+	struct iommu_table *tbl = dev->sysdata;
+
+	/* reserve bad_dma_address in case it's a legal address */
+	iommu_range_reserve(tbl, bad_dma_address, 1);
+
+	/* avoid the BIOS/VGA first 640KB-1MB region */
+	start = (640 * 1024);
+	npages = ((1024 - 640) * 1024) >> PAGE_SHIFT;
+	iommu_range_reserve(tbl, start, npages);
+
+	/* reserve the two PCI peripheral memory regions in IO space */
+	calgary_reserve_peripheral_mem_1(dev);
+	calgary_reserve_peripheral_mem_2(dev);
+}
+
+/*
+ * Build the TCE table for dev's bus, clear it, reserve the untranslated
+ * regions and program the PHB's Table Address Register with the table's
+ * physical address and size code. Returns 0, or the error from
+ * build_tce_table().
+ */
+static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
+{
+ u64 val64;
+ u64 table_phys;
+ void __iomem *target;
+ int ret;
+ struct iommu_table *tbl;
+
+ /* build TCE tables for each PHB */
+ ret = build_tce_table(dev, bbar);
+ if (ret)
+ return ret;
+
+ /* point the table at the TCE space recorded for this bus and clear
+ * every entry */
+ tbl = dev->sysdata;
+ tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
+ tce_free(tbl, 0, tbl->it_size);
+
+ calgary_reserve_regions(dev);
+
+ /* set TARs for each PHB */
+ target = calgary_reg(bbar, tar_offset(dev->bus->number));
+ val64 = be64_to_cpu(readq(target));
+
+ /* zero out all TAR bits under sw control */
+ val64 &= ~TAR_SW_BITS;
+
+ tbl = dev->sysdata;
+ table_phys = (u64)__pa(tbl->it_base);
+ val64 |= table_phys;
+
+ /* the table size code is OR-ed into the TAR value */
+ BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M);
+ val64 |= (u64) specified_table_size;
+
+ /* keep the big-endian TAR value; tce_cache_blast() rewrites it */
+ tbl->tar_val = cpu_to_be64(val64);
+ writeq(tbl->tar_val, target);
+ readq(target); /* flush */
+
+ return 0;
+}
+
+/*
+ * Undo calgary_setup_tar() for dev's bus: clear the software-controlled
+ * TAR bits, free the allocation bitmap and the iommu_table, and forget
+ * the bus's TCE space.
+ */
+static void __init calgary_free_bus(struct pci_dev *dev)
+{
+ u64 val64;
+ struct iommu_table *tbl = dev->sysdata;
+ void __iomem *target;
+ unsigned int bitmapsz;
+
+ /* clear the software-controlled bits in the TAR */
+ target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number));
+ val64 = be64_to_cpu(readq(target));
+ val64 &= ~TAR_SW_BITS;
+ writeq(cpu_to_be64(val64), target);
+ readq(target); /* flush */
+
+ /* one bitmap bit per table entry */
+ bitmapsz = tbl->it_size / BITS_PER_BYTE;
+ free_pages((unsigned long)tbl->it_map, get_order(bitmapsz));
+ tbl->it_map = NULL;
+
+ kfree(tbl);
+ dev->sysdata = NULL;
+
+ /* Can't free bootmem allocated memory after system is up :-( */
+ bus_info[dev->bus->number].tce_space = NULL;
+}
+
+/*
+ * Timer callback (data is the struct pci_dev of the PHB): check the
+ * PHB's CSR for a DMA error. On error, log it, clear the CSR and set
+ * PHB_SLOT_DISABLE in the PHB config register; the timer is then not
+ * re-armed. Otherwise re-arm the timer for 2 seconds later.
+ */
+static void calgary_watchdog(unsigned long data)
+{
+ struct pci_dev *dev = (struct pci_dev *)data;
+ struct iommu_table *tbl = dev->sysdata;
+ void __iomem *bbar = tbl->bbar;
+ u32 val32;
+ void __iomem *target;
+
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+
+ /* If no error, the agent ID in the CSR is not valid */
+ /* CSR_AGENT_MASK (0xffe0ffff) leaves bits 16-20 out of the test */
+ if (val32 & CSR_AGENT_MASK) {
+ printk(KERN_EMERG "calgary_watchdog: DMA error on bus %d, "
+ "CSR = %#x\n", dev->bus->number, val32);
+ writel(0, target);
+
+ /* Disable bus that caused the error */
+ target = calgary_reg(bbar, phb_offset(tbl->it_busno) |
+ PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ val32 |= PHB_SLOT_DISABLE;
+ writel(cpu_to_be32(val32), target);
+ readl(target); /* flush */
+ } else {
+ /* Reset the timer */
+ mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ);
+ }
+}
+
+/*
+ * Turn on translation for dev's PHB by setting PHB_TCE_ENABLE,
+ * PHB_DAC_DISABLE and PHB_MCSR_ENABLE in its config register, then
+ * start the DMA-error watchdog timer.
+ */
+static void __init calgary_enable_translation(struct pci_dev *dev)
+{
+ u32 val32;
+ unsigned char busnum;
+ void __iomem *target;
+ void __iomem *bbar;
+ struct iommu_table *tbl;
+
+ busnum = dev->bus->number;
+ tbl = dev->sysdata;
+ bbar = tbl->bbar;
+
+ /* enable TCE in PHB Config Register */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE;
+
+ printk(KERN_INFO "Calgary: enabling translation on PHB %d\n", busnum);
+ printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this "
+ "bus.\n");
+
+ writel(cpu_to_be32(val32), target);
+ readl(target); /* flush */
+
+ /* set up calgary_watchdog() with an expiry of the current jiffies;
+ * it then re-arms itself while no error is seen */
+ init_timer(&tbl->watchdog_timer);
+ tbl->watchdog_timer.function = &calgary_watchdog;
+ tbl->watchdog_timer.data = (unsigned long)dev;
+ mod_timer(&tbl->watchdog_timer, jiffies);
+}
+
+/*
+ * Reverse calgary_enable_translation(): clear PHB_TCE_ENABLE,
+ * PHB_DAC_DISABLE and PHB_MCSR_ENABLE in the PHB config register and
+ * stop the watchdog timer.
+ */
+static void __init calgary_disable_translation(struct pci_dev *dev)
+{
+ u32 val32;
+ unsigned char busnum;
+ void __iomem *target;
+ void __iomem *bbar;
+ struct iommu_table *tbl;
+
+ busnum = dev->bus->number;
+ tbl = dev->sysdata;
+ bbar = tbl->bbar;
+
+ /* disable TCE in PHB Config Register */
+ target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET);
+ val32 = be32_to_cpu(readl(target));
+ val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE);
+
+ printk(KERN_INFO "Calgary: disabling translation on PHB %d!\n", busnum);
+ writel(cpu_to_be32(val32), target);
+ readl(target); /* flush */
+
+ /* remove the watchdog timer set up by calgary_enable_translation() */
+ del_timer_sync(&tbl->watchdog_timer);
+}
+
+/*
+ * Compute the physical address of the Calgary register space that
+ * serves dev's bus.
+ *
+ * register space address calculation as follows:
+ * FE0MB-8MB*OneBasedChassisNumber+1MB*(RioNodeId-ChassisBase)
+ * ChassisBase is always zero for x366/x260/x460
+ * RioNodeId is 2 for first Calgary, 3 for second Calgary
+ */
+static inline unsigned int __init locate_register_space(struct pci_dev *dev)
+{
+	int rionodeid = (dev->bus->number % 15 > 4) ? 3 : 2;
+	int chassis = ONE_BASED_CHASSIS_NUM + dev->bus->number / 15;
+
+	return START_ADDRESS -
+		(0x800000 * chassis) +
+		(0x100000) * (rionodeid - CHASSIS_BASE);
+}
+
+/*
+ * Set up a PHB whose translation is disabled: take a reference on dev,
+ * make it the bus's "self" device and leave it without an iommu_table.
+ */
+static void __init calgary_init_one_nontraslated(struct pci_dev *dev)
+{
+	pci_dev_get(dev);
+	dev->bus->self = dev;
+	dev->sysdata = NULL;	/* no table: the DMA ops bypass the IOMMU */
+}
+
+/*
+ * Bring up translation on one PHB: map its register space, program the
+ * TAR, take a reference on dev and enable translation. Returns 0 on
+ * success, -ENODATA when the registers cannot be mapped, or the error
+ * from calgary_setup_tar().
+ */
+static int __init calgary_init_one(struct pci_dev *dev)
+{
+	void __iomem *bbar;
+	int ret;
+
+	/* map entire 1MB of Calgary config space */
+	bbar = ioremap_nocache(locate_register_space(dev), 1024 * 1024);
+	if (!bbar)
+		return -ENODATA;
+
+	ret = calgary_setup_tar(dev, bbar);
+	if (ret) {
+		iounmap(bbar);
+		return ret;
+	}
+
+	pci_dev_get(dev);
+	dev->bus->self = dev;
+	calgary_enable_translation(dev);
+
+	return 0;
+}
+
+/*
+ * Walk the Calgary PCI devices (at most MAX_PHB_BUS_NUM of them) and
+ * initialise each one: buses with translation disabled get the
+ * non-translated setup, buses without TCE space are skipped unless
+ * translate_empty_slots is set, and the rest go through
+ * calgary_init_one(). On the first failure the devices visited so far
+ * are walked in reverse and torn down, and the error is returned.
+ *
+ * NOTE(review): ret starts as -ENODEV and is only overwritten by
+ * calgary_init_one(), so a system where every device is skipped or
+ * non-translated returns -ENODEV - confirm this is intended.
+ */
+static int __init calgary_init(void)
+{
+ int i, ret = -ENODEV;
+ struct pci_dev *dev = NULL;
+
+ for (i = 0; i < MAX_PHB_BUS_NUM; i++) {
+ dev = pci_get_device(PCI_VENDOR_ID_IBM,
+ PCI_DEVICE_ID_IBM_CALGARY,
+ dev);
+ if (!dev)
+ break;
+ if (!translate_phb(dev)) {
+ calgary_init_one_nontraslated(dev);
+ continue;
+ }
+ if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
+ continue;
+
+ ret = calgary_init_one(dev);
+ if (ret)
+ goto error;
+ }
+
+ return ret;
+
+error:
+ /*
+ * Unwind: step backwards from the failing device over the i devices
+ * handled before it.
+ * NOTE(review): the reference pci_get_device() presumably holds on the
+ * failing device does not appear to be dropped on this path - confirm.
+ */
+ for (i--; i >= 0; i--) {
+ dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM,
+ PCI_DEVICE_ID_IBM_CALGARY,
+ dev);
+ if (!dev)
+ break;
+ if (!translate_phb(dev)) {
+ /* undo calgary_init_one_nontraslated()'s pci_dev_get() */
+ pci_dev_put(dev);
+ continue;
+ }
+ if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots)
+ continue;
+
+ calgary_disable_translation(dev);
+ calgary_free_bus(dev);
+ pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
+ }
+
+ return ret;
+}
+
+/*
+ * Pick the TCE table size code: the user-specified one if any, else one
+ * derived from the amount of RAM and capped at TCE_TABLE_SIZE_8M.
+ */
+static inline int __init determine_tce_table_size(u64 ram)
+{
+	int order;
+
+	if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
+		return specified_table_size;
+
+	/*
+	 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
+	 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
+	 * larger table size has twice as many entries, so shift the
+	 * max ram address by 13 to divide by 8K and then look at the
+	 * order of the result to choose between 0-7.
+	 */
+	order = get_order(ram >> 13);
+
+	return (order > TCE_TABLE_SIZE_8M) ? TCE_TABLE_SIZE_8M : order;
+}
+
+/*
+ * Early probe for Calgary PHBs. Scans bus numbers 0..MAX_PHB_BUS_NUM-1
+ * through early PCI config reads, assigns each Calgary bus a PHB id and,
+ * for buses that have a device in slots 1-7 (or always, with
+ * translate_empty_slots), allocates TCE table space. Sets iommu_detected
+ * and calgary_detected when at least one table was allocated. If a table
+ * allocation fails, the tables of lower-numbered buses are freed and
+ * nothing is reported as detected.
+ */
+void __init detect_calgary(void)
+{
+ u32 val;
+ int bus;
+ void *tbl;
+ int calgary_found = 0;
+ int phb = -1;
+
+ /*
+ * if the user specified iommu=off or iommu=soft or we found
+ * another HW IOMMU already, bail out.
+ */
+ if (swiotlb || no_iommu || iommu_detected)
+ return;
+
+ if (!early_pci_allowed())
+ return;
+
+ specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE);
+
+ for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
+ int dev;
+ struct calgary_bus_info *info = &bus_info[bus];
+ /* -1 means no PHB id has been assigned to this bus */
+ info->phbid = -1;
+
+ if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY)
+ continue;
+
+ /*
+ * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3)
+ * it is connected to relative to the Calgary chip.
+ */
+ phb = (phb + 1) % PHBS_PER_CALGARY;
+
+ if (info->translation_disabled)
+ continue;
+
+ /*
+ * Scan the slots of the PCI bus to see if there is a device present.
+ * The parent bus will be the zeroth device, so start at 1.
+ */
+ for (dev = 1; dev < 8; dev++) {
+ val = read_pci_config(bus, dev, 0, 0);
+ if (val != 0xffffffff || translate_empty_slots) {
+ tbl = alloc_tce_table();
+ if (!tbl)
+ goto cleanup;
+ info->tce_space = tbl;
+ info->phbid = phb;
+ calgary_found = 1;
+ break;
+ }
+ }
+ }
+
+ if (calgary_found) {
+ iommu_detected = 1;
+ calgary_detected = 1;
+ printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
+ printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
+ "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
+ debugging ? "enabled" : "disabled");
+ }
+ return;
+
+cleanup:
+ /* free the tables of the buses below the one that failed */
+ /* NOTE(review): tce_space is not reset to NULL after being freed;
+ * calgary_detected stays 0 on this path, so calgary_iommu_init() bails
+ * out before it can be used */
+ for (--bus; bus >= 0; --bus) {
+ struct calgary_bus_info *info = &bus_info[bus];
+
+ if (info->tce_space)
+ free_tce_table(info->tce_space);
+ }
+}
+
+/*
+ * Switch the system over to the Calgary IOMMU. Returns -ENODEV when the
+ * IOMMU is disabled, swiotlb is in use or no Calgary was detected; the
+ * error from calgary_init() when bring-up fails; 0 once calgary_dma_ops
+ * has been installed as dma_ops.
+ */
+int __init calgary_iommu_init(void)
+{
+	int ret;
+
+	if (no_iommu || swiotlb || !calgary_detected)
+		return -ENODEV;
+
+	/* ok, we're trying to use Calgary - let's roll */
+	printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
+
+	ret = calgary_init();
+	if (!ret) {
+		force_iommu = 1;
+		dma_ops = &calgary_dma_ops;
+		return 0;
+	}
+
+	printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
+	       "falling back to no_iommu\n", ret);
+	if (end_pfn > MAX_DMA32_PFN)
+		printk(KERN_ERR "WARNING more than 4GB of memory, "
+		       "32bit PCI may malfunction.\n");
+
+	return ret;
+}
+
+/*
+ * Parse the comma-separated "calgary=" boot option. Recognised items:
+ *   64k, 128k, 256k, 512k, 1M, 2M, 4M, 8M - set specified_table_size
+ *   translate_empty_slots                 - set translate_empty_slots
+ *   disable=<bus>                         - disable translation on a bus
+ * Unrecognised items are skipped. Always returns 1.
+ */
+static int __init calgary_parse_options(char *p)
+{
+ unsigned int bridge;
+ size_t len;
+ char* endp;
+
+ while (*p) {
+ /* only the leading characters are compared */
+ if (!strncmp(p, "64k", 3))
+ specified_table_size = TCE_TABLE_SIZE_64K;
+ else if (!strncmp(p, "128k", 4))
+ specified_table_size = TCE_TABLE_SIZE_128K;
+ else if (!strncmp(p, "256k", 4))
+ specified_table_size = TCE_TABLE_SIZE_256K;
+ else if (!strncmp(p, "512k", 4))
+ specified_table_size = TCE_TABLE_SIZE_512K;
+ else if (!strncmp(p, "1M", 2))
+ specified_table_size = TCE_TABLE_SIZE_1M;
+ else if (!strncmp(p, "2M", 2))
+ specified_table_size = TCE_TABLE_SIZE_2M;
+ else if (!strncmp(p, "4M", 2))
+ specified_table_size = TCE_TABLE_SIZE_4M;
+ else if (!strncmp(p, "8M", 2))
+ specified_table_size = TCE_TABLE_SIZE_8M;
+
+ len = strlen("translate_empty_slots");
+ if (!strncmp(p, "translate_empty_slots", len))
+ translate_empty_slots = 1;
+
+ len = strlen("disable");
+ if (!strncmp(p, "disable", len)) {
+ p += len;
+ if (*p == '=')
+ ++p;
+ if (*p == '\0')
+ break;
+ /* NOTE(review): signed parse stored in an unsigned variable; a
+ * negative number wraps and is rejected by the range check below */
+ bridge = simple_strtol(p, &endp, 0);
+ /* no digits consumed: stop parsing the whole option string */
+ if (p == endp)
+ break;
+
+ if (bridge < MAX_PHB_BUS_NUM) {
+ printk(KERN_INFO "Calgary: disabling "
+ "translation for PHB 0x%x\n", bridge);
+ bus_info[bridge].translation_disabled = 1;
+ }
+ }
+
+ /* advance to the next comma-separated item, if any */
+ p = strpbrk(p, ",");
+ if (!p)
+ break;
+
+ p++; /* skip ',' */
+ }
+ return 1;
+}
+__setup("calgary=", calgary_parse_options);
diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c
index a9275c9557cf..f8d857453f8a 100644
--- a/arch/x86_64/kernel/pci-dma.c
+++ b/arch/x86_64/kernel/pci-dma.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <asm/io.h>
#include <asm/proto.h>
+#include <asm/calgary.h>
int iommu_merge __read_mostly = 0;
EXPORT_SYMBOL(iommu_merge);
@@ -33,12 +34,15 @@ int panic_on_overflow __read_mostly = 0;
int force_iommu __read_mostly= 0;
#endif
+/* Set this to 1 if there is a HW IOMMU in the system */
+int iommu_detected __read_mostly = 0;
+
/* Dummy device used for NULL arguments (normally ISA). Better would
be probably a smaller DMA mask, but this is bug-to-bug compatible
to i386. */
struct device fallback_dev = {
.bus_id = "fallback device",
- .coherent_dma_mask = 0xffffffff,
+ .coherent_dma_mask = DMA_32BIT_MASK,
.dma_mask = &fallback_dev.coherent_dma_mask,
};
@@ -77,7 +81,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
dev = &fallback_dev;
dma_mask = dev->coherent_dma_mask;
if (dma_mask == 0)
- dma_mask = 0xffffffff;
+ dma_mask = DMA_32BIT_MASK;
/* Don't invoke OOM killer */
gfp |= __GFP_NORETRY;
@@ -90,7 +94,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
larger than 16MB and in this case we have a chance of
finding fitting memory in the next higher zone first. If
not retry with true GFP_DMA. -AK */
- if (dma_mask <= 0xffffffff)
+ if (dma_mask <= DMA_32BIT_MASK)
gfp |= GFP_DMA32;
again:
@@ -111,7 +115,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* Don't use the 16MB ZONE_DMA unless absolutely
needed. It's better to use remapping first. */
- if (dma_mask < 0xffffffff && !(gfp & GFP_DMA)) {
+ if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}
@@ -166,15 +170,27 @@ void dma_free_coherent(struct device *dev, size_t size,
}
EXPORT_SYMBOL(dma_free_coherent);
+static int forbid_dac __read_mostly;
+
int dma_supported(struct device *dev, u64 mask)
{
+#ifdef CONFIG_PCI
+ if (mask > 0xffffffff && forbid_dac > 0) {
+
+
+
+ printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id);
+ return 0;
+ }
+#endif
+
if (dma_ops->dma_supported)
return dma_ops->dma_supported(dev, mask);
/* Copied from i386. Doesn't make much sense, because it will
only work for pci_alloc_coherent.
The caller just has to use GFP_DMA in this case. */
- if (mask < 0x00ffffff)
+ if (mask < DMA_24BIT_MASK)
return 0;
/* Tell the device to use SAC when IOMMU force is on. This
@@ -189,7 +205,7 @@ int dma_supported(struct device *dev, u64 mask)
SAC for these. Assume all masks <= 40 bits are of this
type. Normally this doesn't make any difference, but gives
more gentle handling of IOMMU overflow. */
- if (iommu_sac_force && (mask >= 0xffffffffffULL)) {
+ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
return 0;
}
@@ -227,52 +243,99 @@ EXPORT_SYMBOL(dma_set_mask);
allowed overwrite iommu off workarounds for specific chipsets.
soft Use software bounce buffering (default for Intel machines)
noaperture Don't touch the aperture for AGP.
+ allowdac Allow DMA >4GB
+ nodac Forbid DMA >4GB
+ panic Force panic when IOMMU overflows
*/
__init int iommu_setup(char *p)
{
- iommu_merge = 1;
-
- while (*p) {
- if (!strncmp(p,"off",3))
- no_iommu = 1;
- /* gart_parse_options has more force support */
- if (!strncmp(p,"force",5))
- force_iommu = 1;
- if (!strncmp(p,"noforce",7)) {
- iommu_merge = 0;
- force_iommu = 0;
- }
-
- if (!strncmp(p, "biomerge",8)) {
- iommu_bio_merge = 4096;
- iommu_merge = 1;
- force_iommu = 1;
- }
- if (!strncmp(p, "panic",5))
- panic_on_overflow = 1;
- if (!strncmp(p, "nopanic",7))
- panic_on_overflow = 0;
- if (!strncmp(p, "merge",5)) {
- iommu_merge = 1;
- force_iommu = 1;
- }
- if (!strncmp(p, "nomerge",7))
- iommu_merge = 0;
- if (!strncmp(p, "forcesac",8))
- iommu_sac_force = 1;
+ /* merging defaults to on; "noforce"/"nomerge" below turn it off */
+ iommu_merge = 1;
+
+ /* "iommu" given without a value */
+ if (!p)
+ return -EINVAL;
+
+ /* p walks a comma-separated list; each entry is matched by prefix */
+ while (*p) {
+ if (!strncmp(p,"off",3))
+ no_iommu = 1;
+ /* gart_parse_options has more force support */
+ if (!strncmp(p,"force",5))
+ force_iommu = 1;
+ if (!strncmp(p,"noforce",7)) {
+ iommu_merge = 0;
+ force_iommu = 0;
+ }
+
+ if (!strncmp(p, "biomerge",8)) {
+ iommu_bio_merge = 4096;
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "panic",5))
+ panic_on_overflow = 1;
+ if (!strncmp(p, "nopanic",7))
+ panic_on_overflow = 0;
+ if (!strncmp(p, "merge",5)) {
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "nomerge",7))
+ iommu_merge = 0;
+ if (!strncmp(p, "forcesac",8))
+ iommu_sac_force = 1;
+ if (!strncmp(p, "allowdac", 8))
+ forbid_dac = 0;
+ /*
+ * dma_supported() refuses masks above 32 bits only while
+ * forbid_dac > 0, so "nodac" has to store a positive value.
+ */
+ if (!strncmp(p, "nodac", 5))
+ forbid_dac = 1;
+
+#ifdef CONFIG_SWIOTLB
+ if (!strncmp(p, "soft",4))
+ swiotlb = 1;
+#endif
+
+#ifdef CONFIG_IOMMU
+ gart_parse_options(p);
+#endif
+
+ /* advance to the entry after the next ',' (or to the end) */
+ p += strcspn(p, ",");
+ if (*p == ',')
+ ++p;
+ }
+ return 0;
+}
+early_param("iommu", iommu_setup);
+
+void __init pci_iommu_alloc(void)
+{
+ /*
+ * The order of these functions is important for
+ * fall-back/fail-over reasons
+ *
+ * Each step is compiled in only when its config option is set.
+ * pci_swiotlb_init() comes last because it tests iommu_detected
+ * before enabling the software bounce buffers; presumably the
+ * hardware probes above are what set that flag - TODO confirm.
+ */
+#ifdef CONFIG_IOMMU
+ iommu_hole_init();
+#endif
+
+#ifdef CONFIG_CALGARY_IOMMU
+ detect_calgary();
+#endif
#ifdef CONFIG_SWIOTLB
- if (!strncmp(p, "soft",4))
- swiotlb = 1;
+ pci_swiotlb_init();
+#endif
+}
+
+/*
+ * Select the DMA mapping implementation: try Calgary first, then the
+ * GART IOMMU, then fall back to no_iommu_init(), which does nothing
+ * when dma_ops has already been set by an earlier step.  The return
+ * value of calgary_iommu_init() is ignored.  Always returns 0.
+ */
+static int __init pci_iommu_init(void)
+{
+#ifdef CONFIG_CALGARY_IOMMU
+ calgary_iommu_init();
#endif
-#ifdef CONFIG_GART_IOMMU
- gart_parse_options(p);
+#ifdef CONFIG_IOMMU
+ gart_iommu_init();
#endif
- p += strcspn(p, ",");
- if (*p == ',')
- ++p;
- }
- return 1;
+ no_iommu_init();
+ return 0;
}
+
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 82a7c9bfdfa0..16261a8a3303 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -10,7 +10,6 @@
* Copyright 2002 Andi Kleen, SuSE Labs.
*/
-#include <linux/config.h>
#include <linux/types.h>
#include <linux/ctype.h>
#include <linux/agp_backend.h>
@@ -32,6 +31,7 @@
#include <asm/kdebug.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
+#include <asm/k8.h>
unsigned long iommu_bus_base; /* GART remapping area (physical) */
static unsigned long iommu_size; /* size of remapping area bytes */
@@ -46,8 +46,6 @@ u32 *iommu_gatt_base; /* Remapping table */
also seen with Qlogic at least). */
int iommu_fullflush = 1;
-#define MAX_NB 8
-
/* Allocation bitmap for the remapping area */
static DEFINE_SPINLOCK(iommu_bitmap_lock);
static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */
@@ -63,13 +61,6 @@ static u32 gart_unmapped_entry;
#define to_pages(addr,size) \
(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
-#define for_all_nb(dev) \
- dev = NULL; \
- while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL)
-
-static struct pci_dev *northbridges[MAX_NB];
-static u32 northbridge_flush_word[MAX_NB];
-
#define EMERGENCY_PAGES 32 /* = 128KB */
#ifdef CONFIG_AGP
@@ -93,7 +84,7 @@ static unsigned long alloc_iommu(int size)
offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);
if (offset == -1) {
need_flush = 1;
- offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size);
+ offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);
}
if (offset != -1) {
set_bit_string(iommu_gart_bitmap, offset, size);
@@ -120,44 +111,17 @@ static void free_iommu(unsigned long offset, int size)
/*
* Use global flush state to avoid races with multiple flushers.
+ *
+ * With iommu_bitmap_lock held and interrupts disabled, call
+ * k8_flush_garts() if need_flush is set, then clear need_flush.
+ * Does nothing beyond taking the lock when no flush is pending.
*/
-static void flush_gart(struct device *dev)
+static void flush_gart(void)
{
unsigned long flags;
- int flushed = 0;
- int i, max;
-
spin_lock_irqsave(&iommu_bitmap_lock, flags);
- if (need_flush) {
- max = 0;
- for (i = 0; i < MAX_NB; i++) {
- if (!northbridges[i])
- continue;
- pci_write_config_dword(northbridges[i], 0x9c,
- northbridge_flush_word[i] | 1);
- flushed++;
- max = i;
- }
- for (i = 0; i <= max; i++) {
- u32 w;
- if (!northbridges[i])
- continue;
- /* Make sure the hardware actually executed the flush. */
- for (;;) {
- pci_read_config_dword(northbridges[i], 0x9c, &w);
- if (!(w & 1))
- break;
- cpu_relax();
- }
- }
- if (!flushed)
- printk("nothing to flush?\n");
+ if (need_flush) {
+ k8_flush_garts();
need_flush = 0;
}
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
}
-
-
#ifdef CONFIG_IOMMU_LEAK
#define SET_LEAK(x) if (iommu_leak_tab) \
@@ -266,7 +230,7 @@ static dma_addr_t gart_map_simple(struct device *dev, char *buf,
size_t size, int dir)
{
dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
- flush_gart(dev);
+ flush_gart();
return map;
}
@@ -275,8 +239,6 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
{
unsigned long phys_mem, bus;
- BUG_ON(dir == DMA_NONE);
-
if (!dev)
dev = &fallback_dev;
@@ -289,6 +251,28 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)
}
/*
+ * Free a DMA mapping.
+ *
+ * Addresses outside the remapping window, i.e. below
+ * iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE or at/after
+ * iommu_bus_base + iommu_size, are ignored.  Otherwise every GATT
+ * entry covered by [dma_addr, dma_addr + size) is reset to
+ * gart_unmapped_entry and the pages are handed back via free_iommu().
+ * dev and direction are not used here.
+ */
+void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
+ size_t size, int direction)
+{
+ unsigned long iommu_page;
+ int npages;
+ int i;
+
+ if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
+ dma_addr >= iommu_bus_base + iommu_size)
+ return;
+ iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
+ npages = to_pages(dma_addr, size);
+ for (i = 0; i < npages; i++) {
+ iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
+ CLEAR_LEAK(iommu_page + i);
+ }
+ free_iommu(iommu_page, npages);
+}
+
+/*
* Wrapper for pci_unmap_single working with scatterlists.
*/
void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
@@ -299,7 +283,7 @@ void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int di
struct scatterlist *s = &sg[i];
if (!s->dma_length || !s->length)
break;
- dma_unmap_single(dev, s->dma_address, s->dma_length, dir);
+ gart_unmap_single(dev, s->dma_address, s->dma_length, dir);
}
}
@@ -329,7 +313,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
s->dma_address = addr;
s->dma_length = s->length;
}
- flush_gart(dev);
+ flush_gart();
return nents;
}
@@ -397,7 +381,6 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
unsigned long pages = 0;
int need = 0, nextneed;
- BUG_ON(dir == DMA_NONE);
if (nents == 0)
return 0;
@@ -436,13 +419,13 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)
goto error;
out++;
- flush_gart(dev);
+ flush_gart();
if (out < nents)
sg[out].dma_length = 0;
return out;
error:
- flush_gart(NULL);
+ flush_gart();
gart_unmap_sg(dev, sg, nents, dir);
/* When it was forced or merged try again in a dumb way */
if (force_iommu || iommu_merge) {
@@ -458,28 +441,6 @@ error:
return 0;
}
-/*
- * Free a DMA mapping.
- */
-void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
- size_t size, int direction)
-{
- unsigned long iommu_page;
- int npages;
- int i;
-
- if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||
- dma_addr >= iommu_bus_base + iommu_size)
- return;
- iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
- npages = to_pages(dma_addr, size);
- for (i = 0; i < npages; i++) {
- iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
- CLEAR_LEAK(iommu_page + i);
- }
- free_iommu(iommu_page, npages);
-}
-
static int no_agp;
static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -532,10 +493,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
void *gatt;
unsigned aper_base, new_aper_base;
unsigned aper_size, gatt_size, new_aper_size;
-
+ int i;
+
printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
aper_size = aper_base = info->aper_size = 0;
- for_all_nb(dev) {
+ dev = NULL;
+ for (i = 0; i < num_k8_northbridges; i++) {
+ dev = k8_northbridges[i];
new_aper_base = read_aperture(dev, &new_aper_size);
if (!new_aper_base)
goto nommu;
@@ -558,11 +522,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
panic("Cannot allocate GATT table");
memset(gatt, 0, gatt_size);
agp_gatt_table = gatt;
-
- for_all_nb(dev) {
+
+ for (i = 0; i < num_k8_northbridges; i++) {
u32 ctl;
u32 gatt_reg;
+ dev = k8_northbridges[i];
gatt_reg = __pa(gatt) >> 12;
gatt_reg <<= 4;
pci_write_config_dword(dev, 0x98, gatt_reg);
@@ -573,7 +538,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
pci_write_config_dword(dev, 0x90, ctl);
}
- flush_gart(NULL);
+ flush_gart();
printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10);
return 0;
@@ -602,15 +567,19 @@ static struct dma_mapping_ops gart_dma_ops = {
.unmap_sg = gart_unmap_sg,
};
-static int __init pci_iommu_init(void)
+void __init gart_iommu_init(void)
{
struct agp_kern_info info;
unsigned long aper_size;
unsigned long iommu_start;
- struct pci_dev *dev;
unsigned long scratch;
long i;
+ if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) {
+ printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n");
+ return;
+ }
+
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
#else
@@ -622,7 +591,11 @@ static int __init pci_iommu_init(void)
#endif
if (swiotlb)
- return -1;
+ return;
+
+ /* Did we detect a different HW IOMMU? */
+ if (iommu_detected && !iommu_aperture)
+ return;
if (no_iommu ||
(!force_iommu && end_pfn <= MAX_DMA32_PFN) ||
@@ -634,15 +607,7 @@ static int __init pci_iommu_init(void)
"but IOMMU not available.\n"
KERN_ERR "WARNING 32bit PCI may malfunction.\n");
}
- return -1;
- }
-
- i = 0;
- for_all_nb(dev)
- i++;
- if (i > MAX_NB) {
- printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). Disabled\n", i);
- return -1;
+ return;
}
printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
@@ -707,26 +672,10 @@ static int __init pci_iommu_init(void)
for (i = EMERGENCY_PAGES; i < iommu_pages; i++)
iommu_gatt_base[i] = gart_unmapped_entry;
- for_all_nb(dev) {
- u32 flag;
- int cpu = PCI_SLOT(dev->devfn) - 24;
- if (cpu >= MAX_NB)
- continue;
- northbridges[cpu] = dev;
- pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */
- northbridge_flush_word[cpu] = flag;
- }
-
- flush_gart(NULL);
-
+ flush_gart();
dma_ops = &gart_dma_ops;
-
- return 0;
}
-/* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
-
void gart_parse_options(char *p)
{
int arg;
diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c
index 1f6ecc62061d..df09ab05a1bd 100644
--- a/arch/x86_64/kernel/pci-nommu.c
+++ b/arch/x86_64/kernel/pci-nommu.c
@@ -4,6 +4,8 @@
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/string.h>
+#include <linux/dma-mapping.h>
+
#include <asm/proto.h>
#include <asm/processor.h>
#include <asm/dma.h>
@@ -12,10 +14,11 @@ static int
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
{
if (hwdev && bus + size > *hwdev->dma_mask) {
- if (*hwdev->dma_mask >= 0xffffffffULL)
+ if (*hwdev->dma_mask >= DMA_32BIT_MASK)
printk(KERN_ERR
- "nommu_%s: overflow %Lx+%lu of device mask %Lx\n",
- name, (long long)bus, size, (long long)*hwdev->dma_mask);
+ "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
+ name, (long long)bus, size,
+ (long long)*hwdev->dma_mask);
return 0;
}
return 1;
@@ -56,7 +59,6 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
{
int i;
- BUG_ON(direction == DMA_NONE);
for (i = 0; i < nents; i++ ) {
struct scatterlist *s = &sg[i];
BUG_ON(!s->page);
@@ -89,5 +91,7 @@ void __init no_iommu_init(void)
{
if (dma_ops)
return;
+
+ force_iommu = 0; /* no HW IOMMU */
dma_ops = &nommu_dma_ops;
}
diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c
index 990ed67896f2..697f0aa794b9 100644
--- a/arch/x86_64/kernel/pci-swiotlb.c
+++ b/arch/x86_64/kernel/pci-swiotlb.c
@@ -3,7 +3,8 @@
#include <linux/pci.h>
#include <linux/cache.h>
#include <linux/module.h>
-#include <asm/dma-mapping.h>
+#include <linux/dma-mapping.h>
+
#include <asm/proto.h>
#include <asm/swiotlb.h>
#include <asm/dma.h>
@@ -31,9 +32,10 @@ struct dma_mapping_ops swiotlb_dma_ops = {
void pci_swiotlb_init(void)
{
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
- if (!iommu_aperture && !no_iommu &&
- (end_pfn > MAX_DMA32_PFN || force_iommu))
+ if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN)
swiotlb = 1;
+ if (swiotlb_force)
+ swiotlb = 1;
if (swiotlb) {
printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n");
swiotlb_init();
diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c
index bf421ed26808..7554458dc9cb 100644
--- a/arch/x86_64/kernel/pmtimer.c
+++ b/arch/x86_64/kernel/pmtimer.c
@@ -27,7 +27,7 @@
/* The I/O port the PMTMR resides at.
* The location is detected during setup_arch(),
* in arch/i386/kernel/acpi/boot.c */
-u32 pmtmr_ioport;
+u32 pmtmr_ioport __read_mostly;
/* value of the Power timer at last timer interrupt */
static u32 offset_delay;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index fb903e65e079..458006ae19f3 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -10,7 +10,6 @@
* Andi Kleen.
*
* CPU hotplug support - ashok.raj@intel.com
- * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
*/
/*
@@ -64,6 +63,7 @@ EXPORT_SYMBOL(boot_option_idle_override);
* Powermanagement idle function, if any..
*/
void (*pm_idle)(void);
+EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -80,25 +80,25 @@ void idle_notifier_unregister(struct notifier_block *n)
}
EXPORT_SYMBOL(idle_notifier_unregister);
-enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
-static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;
-
void enter_idle(void)
{
- __get_cpu_var(idle_state) = CPU_IDLE;
+ write_pda(isidle, 1);
atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
static void __exit_idle(void)
{
- __get_cpu_var(idle_state) = CPU_NOT_IDLE;
+ if (read_pda(isidle) == 0)
+ return;
+ write_pda(isidle, 0);
atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
- if (current->pid | read_pda(irqcount))
+ /* idle loop has pid 0 */
+ if (current->pid)
return;
__exit_idle();
}
@@ -111,7 +111,7 @@ static void default_idle(void)
{
local_irq_enable();
- clear_thread_flag(TIF_POLLING_NRFLAG);
+ current_thread_info()->status &= ~TS_POLLING;
smp_mb__after_clear_bit();
while (!need_resched()) {
local_irq_disable();
@@ -120,7 +120,7 @@ static void default_idle(void)
else
local_irq_enable();
}
- set_thread_flag(TIF_POLLING_NRFLAG);
+ current_thread_info()->status |= TS_POLLING;
}
/*
@@ -203,8 +203,7 @@ static inline void play_dead(void)
*/
void cpu_idle (void)
{
- set_thread_flag(TIF_POLLING_NRFLAG);
-
+ current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
while (1) {
while (!need_resched()) {
@@ -221,6 +220,9 @@ void cpu_idle (void)
play_dead();
enter_idle();
idle();
+ /* In many cases the interrupt that ended idle
+ has already called exit_idle. But some idle
+ loops can be woken up without interrupt. */
__exit_idle();
}
@@ -297,7 +299,7 @@ void __show_regs(struct pt_regs * regs)
system_utsname.version);
printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
printk_address(regs->rip);
- printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
+ printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
regs->eflags);
printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->rax, regs->rbx, regs->rcx);
@@ -335,7 +337,7 @@ void show_regs(struct pt_regs *regs)
{
printk("CPU %d:", smp_processor_id());
__show_regs(regs);
- show_trace(&regs->rsp);
+ show_trace(NULL, regs, (void *)(regs + 1));
}
/*
@@ -351,6 +353,7 @@ void exit_thread(void)
kfree(t->io_bitmap_ptr);
t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
/*
* Careful, clear this in the TSS too:
*/
@@ -365,8 +368,12 @@ void flush_thread(void)
struct task_struct *tsk = current;
struct thread_info *t = current_thread_info();
- if (t->flags & _TIF_ABI_PENDING)
+ if (t->flags & _TIF_ABI_PENDING) {
t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
+ if (t->flags & _TIF_IA32)
+ current_thread_info()->status |= TS_COMPAT;
+ }
+ t->flags &= ~_TIF_DEBUG;
tsk->thread.debugreg0 = 0;
tsk->thread.debugreg1 = 0;
@@ -459,7 +466,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
asm("mov %%es,%0" : "=m" (p->thread.es));
asm("mov %%ds,%0" : "=m" (p->thread.ds));
- if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
+ if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
@@ -467,6 +474,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
}
memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
IO_BITMAP_BYTES);
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
/*
@@ -496,6 +504,40 @@ out:
*/
#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
+/*
+ * Rarely needed part of a context switch, split out of __switch_to():
+ *  - reload debug registers 0-3, 6 and 7 from next_p when it has
+ *    TIF_DEBUG set;
+ *  - if next_p has TIF_IO_BITMAP, copy its I/O bitmap into the TSS;
+ *    otherwise, if only prev_p had one, fill that range of the TSS
+ *    bitmap with 0xff.
+ */
+static inline void __switch_to_xtra(struct task_struct *prev_p,
+ struct task_struct *next_p,
+ struct tss_struct *tss)
+{
+ struct thread_struct *prev, *next;
+
+ prev = &prev_p->thread;
+ next = &next_p->thread;
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ loaddebug(next, 0);
+ loaddebug(next, 1);
+ loaddebug(next, 2);
+ loaddebug(next, 3);
+ /* no 4 and 5 */
+ loaddebug(next, 6);
+ loaddebug(next, 7);
+ }
+
+ if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
+ /*
+ * Copy the relevant range of the IO bitmap.
+ * Normally this is 128 bytes or less:
+ */
+ memcpy(tss->io_bitmap, next->io_bitmap_ptr,
+ max(prev->io_bitmap_max, next->io_bitmap_max));
+ } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
+ /*
+ * Clear any possible leftover bits:
+ */
+ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
+ }
+}
+
/*
* switch_to(x,y) should switch tasks from x to y.
*
@@ -513,6 +555,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ /* we're going to use this soon, after a few expensive things */
+ if (next_p->fpu_counter>5)
+ prefetch(&next->i387.fxsave);
+
/*
* Reload esp0, LDT and the page table pointer:
*/
@@ -581,41 +627,29 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
And the AMD workaround requires it to be after DS reload. */
unlazy_fpu(prev_p);
write_pda(kernelstack,
- task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
-
+ (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+#ifdef CONFIG_CC_STACKPROTECTOR
+ write_pda(stack_canary, next_p->stack_canary);
/*
- * Now maybe reload the debug registers
+ * Build time only check to make sure the stack_canary is at
+ * offset 40 in the pda; this is a gcc ABI requirement
*/
- if (unlikely(next->debugreg7)) {
- loaddebug(next, 0);
- loaddebug(next, 1);
- loaddebug(next, 2);
- loaddebug(next, 3);
- /* no 4 and 5 */
- loaddebug(next, 6);
- loaddebug(next, 7);
- }
+ BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
+#endif
+ /*
+ * Now maybe reload the debug registers and handle I/O bitmaps
+ */
+ if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
+ || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
+ __switch_to_xtra(prev_p, next_p, tss);
- /*
- * Handle the IO bitmap
- */
- if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
- if (next->io_bitmap_ptr)
- /*
- * Copy the relevant range of the IO bitmap.
- * Normally this is 128 bytes or less:
- */
- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
- max(prev->io_bitmap_max, next->io_bitmap_max));
- else {
- /*
- * Clear any possible leftover bits:
- */
- memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
- }
- }
-
+ /* If the task has used fpu the last 5 timeslices, just do a full
+ * restore of the math state immediately to avoid the trap; the
+ * chances of needing FPU soon are obviously high now
+ */
+ if (next_p->fpu_counter>5)
+ math_state_restore();
return prev_p;
}
@@ -832,7 +866,7 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
unsigned long arch_align_stack(unsigned long sp)
{
- if (randomize_va_space)
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
sp -= get_random_int() % 8192;
return sp & ~0xf;
}
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index 2d50024c9f30..addc14af0c56 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -116,17 +116,17 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r
return addr;
}
-static int is_at_popf(struct task_struct *child, struct pt_regs *regs)
+static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
{
int i, copied;
- unsigned char opcode[16];
+ unsigned char opcode[15];
unsigned long addr = convert_rip_to_linear(child, regs);
copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
for (i = 0; i < copied; i++) {
switch (opcode[i]) {
- /* popf */
- case 0x9d:
+ /* popf and iret */
+ case 0x9d: case 0xcf:
return 1;
/* CHECKME: 64 65 */
@@ -138,14 +138,17 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs)
case 0x26: case 0x2e:
case 0x36: case 0x3e:
case 0x64: case 0x65:
- case 0xf0: case 0xf2: case 0xf3:
+ case 0xf2: case 0xf3:
continue;
- /* REX prefixes */
case 0x40 ... 0x4f:
+ if (regs->cs != __USER_CS)
+ /* 32-bit mode: register increment */
+ return 0;
+ /* 64-bit mode: REX prefix */
continue;
- /* CHECKME: f0, f2, f3 */
+ /* CHECKME: f2, f3 */
/*
* pushf: NOTE! We should probably not let
@@ -186,10 +189,8 @@ static void set_singlestep(struct task_struct *child)
* ..but if TF is changed by the instruction we will trace,
* don't mark it as being "us" that set it, so that we
* won't clear it by hand later.
- *
- * AK: this is not enough, LAHF and IRET can change TF in user space too.
*/
- if (is_at_popf(child, regs))
+ if (is_setting_trap_flag(child, regs))
return;
child->ptrace |= PT_DTRACE;
@@ -420,9 +421,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
break;
if (i == 4) {
- child->thread.debugreg7 = data;
+ child->thread.debugreg7 = data;
+ if (data)
+ set_tsk_thread_flag(child, TIF_DEBUG);
+ else
+ clear_tsk_thread_flag(child, TIF_DEBUG);
ret = 0;
- }
+ }
break;
}
break;
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index 57117b8beb2b..2d6769847456 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -20,6 +20,7 @@
* Power off function, if any
*/
void (*pm_power_off)(void);
+EXPORT_SYMBOL(pm_power_off);
static long no_idt[3];
static enum {
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
index d24fa9b72a2b..14e95872c6a3 100644
--- a/arch/x86_64/kernel/relocate_kernel.S
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -7,31 +7,169 @@
*/
#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/kexec.h>
- /*
- * Must be relocatable PIC code callable as a C function, that once
- * it starts can not use the previous processes stack.
- */
- .globl relocate_new_kernel
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 3)
+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+
+ .text
+ .align PAGE_ALIGNED
.code64
+ .globl relocate_kernel
+relocate_kernel:
+ /* %rdi indirection_page
+ * %rsi page_list
+ * %rdx start address
+ */
+
+ /*
+ * Each step below handles one page-table level:
+ *   %r10 = mask selecting that level's 9 index bits of the address
+ *   %cl  = shift turning the masked bits into a byte offset (index * 8)
+ *   %r11 = address being mapped
+ *   %r9  = slot in the current table (table VA + offset)
+ *   %r8  = physical address of the next level, ORed with PAGE_ATTR
+ * Mask and shift both move down by 9 bits between levels.
+ * Execution falls through into relocate_new_kernel at the end.
+ */
+
+ /* map the control page at its virtual address */
+
+ movq $0x0000ff8000000000, %r10 /* mask */
+ mov $(39 - 3), %cl /* bits to shift */
+ movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
+
+ /* PGD entry -> PUD_0 */
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PGD)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PUD_0)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ /* PUD_0 entry -> PMD_0 */
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PUD_0)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PMD_0)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ /* PMD_0 entry -> PTE_0 */
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PMD_0)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PTE_0)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ /* PTE_0 entry -> the control page itself */
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PTE_0)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ /* identity map the control page at its physical address */
+
+ movq $0x0000ff8000000000, %r10 /* mask */
+ mov $(39 - 3), %cl /* bits to shift */
+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
+
+ /* PGD entry -> PUD_1 */
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PGD)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PUD_1)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ /* PUD_1 entry -> PMD_1 */
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PUD_1)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PMD_1)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ /* PMD_1 entry -> PTE_1 */
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PMD_1)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_PTE_1)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+ shrq $9, %r10
+ sub $9, %cl
+
+ /* PTE_1 entry -> the control page itself */
+ movq %r11, %r9
+ andq %r10, %r9
+ shrq %cl, %r9
+
+ movq PTR(VA_PTE_1)(%rsi), %r8
+ addq %r8, %r9
+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+ orq $PAGE_ATTR, %r8
+ movq %r8, (%r9)
+
+
relocate_new_kernel:
- /* %rdi page_list
- * %rsi reboot_code_buffer
+ /* %rdi indirection_page
+ * %rsi page_list
* %rdx start address
- * %rcx page_table
- * %r8 arg5
- * %r9 arg6
*/
/* zero out flags, and disable interrupts */
pushq $0
popfq
- /* set a new stack at the bottom of our page... */
- lea 4096(%rsi), %rsp
+ /* get physical address of control page now */
+ /* this is impossible after page table switch */
+ movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
+
+ /* get physical address of page table now too */
+ movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
- /* store the parameters back on the stack */
- pushq %rdx /* store the start address */
+ /* switch to new set of page tables */
+ movq PTR(PA_PGD)(%rsi), %r9
+ movq %r9, %cr3
+
+ /* setup a new stack at the end of the physical control page */
+ lea 4096(%r8), %rsp
+
+ /* jump to identity mapped page */
+ addq $(identity_mapped - relocate_kernel), %r8
+ pushq %r8
+ ret
+
+identity_mapped:
+ /* store the start address on the stack */
+ pushq %rdx
/* Set cr0 to a known state:
* 31 1 == Paging enabled
@@ -136,8 +274,3 @@ relocate_new_kernel:
xorq %r15, %r15
ret
-relocate_new_kernel_end:
-
- .globl relocate_new_kernel_size
-relocate_new_kernel_size:
- .quad relocate_new_kernel_end - relocate_new_kernel
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index b8d5116d7371..fc944b5e8f4a 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -5,8 +5,6 @@
*
* Nov 2001 Dave Jones <davej@suse.de>
* Forked from i386 setup code.
- *
- * $Id$
*/
/*
@@ -23,10 +21,9 @@
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/a.out.h>
-#include <linux/tty.h>
+#include <linux/screen_info.h>
#include <linux/ioport.h>
#include <linux/delay.h>
-#include <linux/config.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/highmem.h>
@@ -47,7 +44,6 @@
#include <linux/dmi.h>
#include <linux/dma-mapping.h>
#include <linux/ctype.h>
-#include <linux/suspend.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
@@ -66,9 +62,7 @@
#include <asm/setup.h>
#include <asm/mach_apic.h>
#include <asm/numa.h>
-#include <asm/swiotlb.h>
#include <asm/sections.h>
-#include <asm/gart-mapping.h>
#include <asm/dmi.h>
/*
@@ -76,19 +70,10 @@
*/
struct cpuinfo_x86 boot_cpu_data __read_mostly;
+EXPORT_SYMBOL(boot_cpu_data);
unsigned long mmu_cr4_features;
-int acpi_disabled;
-EXPORT_SYMBOL(acpi_disabled);
-#ifdef CONFIG_ACPI
-extern int __initdata acpi_ht;
-extern acpi_interrupt_flags acpi_sci_flags;
-int __initdata acpi_force = 0;
-#endif
-
-int acpi_numa __initdata;
-
/* Boot loader ID as an integer, for the benefit of proc_dointvec */
int bootloader_type;
@@ -104,13 +89,14 @@ char dmi_alloc_data[DMI_MAX_DATA];
* Setup options
*/
struct screen_info screen_info;
+EXPORT_SYMBOL(screen_info);
struct sys_desc_table_struct {
unsigned short length;
unsigned char table[0];
};
struct edid_info edid_info;
-struct e820map e820;
+EXPORT_SYMBOL_GPL(edid_info);
extern int root_mountflags;
@@ -137,9 +123,6 @@ struct resource standard_io_resources[] = {
.flags = IORESOURCE_BUSY | IORESOURCE_IO }
};
-#define STANDARD_IO_RESOURCES \
- (sizeof standard_io_resources / sizeof standard_io_resources[0])
-
#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM)
struct resource data_resource = {
@@ -186,9 +169,6 @@ static struct resource adapter_rom_resources[] = {
.flags = IORESOURCE_ROM }
};
-#define ADAPTER_ROM_RESOURCES \
- (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
-
static struct resource video_rom_resource = {
.name = "Video ROM",
.start = 0xc0000,
@@ -259,7 +239,8 @@ static void __init probe_roms(void)
}
/* check for adapter roms on 2k boundaries */
- for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper;
+ start += 2048) {
rom = isa_bus_to_virt(start);
if (!romsignature(rom))
continue;
@@ -279,185 +260,22 @@ static void __init probe_roms(void)
}
}
-/* Check for full argument with no trailing characters */
-static int fullarg(char *p, char *arg)
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel. This option will be passed
+ * by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
{
- int l = strlen(arg);
- return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l]));
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ return end > arg ? 0 : -EINVAL;
}
-
-static __init void parse_cmdline_early (char ** cmdline_p)
-{
- char c = ' ', *to = command_line, *from = COMMAND_LINE;
- int len = 0;
- int userdef = 0;
-
- for (;;) {
- if (c != ' ')
- goto next_char;
-
-#ifdef CONFIG_SMP
- /*
- * If the BIOS enumerates physical processors before logical,
- * maxcpus=N at enumeration-time can be used to disable HT.
- */
- else if (!memcmp(from, "maxcpus=", 8)) {
- extern unsigned int maxcpus;
-
- maxcpus = simple_strtoul(from + 8, NULL, 0);
- }
-#endif
-#ifdef CONFIG_ACPI
- /* "acpi=off" disables both ACPI table parsing and interpreter init */
- if (fullarg(from,"acpi=off"))
- disable_acpi();
-
- if (fullarg(from, "acpi=force")) {
- /* add later when we do DMI horrors: */
- acpi_force = 1;
- acpi_disabled = 0;
- }
-
- /* acpi=ht just means: do ACPI MADT parsing
- at bootup, but don't enable the full ACPI interpreter */
- if (fullarg(from, "acpi=ht")) {
- if (!acpi_force)
- disable_acpi();
- acpi_ht = 1;
- }
- else if (fullarg(from, "pci=noacpi"))
- acpi_disable_pci();
- else if (fullarg(from, "acpi=noirq"))
- acpi_noirq_set();
-
- else if (fullarg(from, "acpi_sci=edge"))
- acpi_sci_flags.trigger = 1;
- else if (fullarg(from, "acpi_sci=level"))
- acpi_sci_flags.trigger = 3;
- else if (fullarg(from, "acpi_sci=high"))
- acpi_sci_flags.polarity = 1;
- else if (fullarg(from, "acpi_sci=low"))
- acpi_sci_flags.polarity = 3;
-
- /* acpi=strict disables out-of-spec workarounds */
- else if (fullarg(from, "acpi=strict")) {
- acpi_strict = 1;
- }
-#ifdef CONFIG_X86_IO_APIC
- else if (fullarg(from, "acpi_skip_timer_override"))
- acpi_skip_timer_override = 1;
-#endif
-#endif
-
- if (fullarg(from, "disable_timer_pin_1"))
- disable_timer_pin_1 = 1;
- if (fullarg(from, "enable_timer_pin_1"))
- disable_timer_pin_1 = -1;
-
- if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) {
- clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
- disable_apic = 1;
- }
-
- if (fullarg(from, "noapic"))
- skip_ioapic_setup = 1;
-
- if (fullarg(from,"apic")) {
- skip_ioapic_setup = 0;
- ioapic_force = 1;
- }
-
- if (!memcmp(from, "mem=", 4))
- parse_memopt(from+4, &from);
-
- if (!memcmp(from, "memmap=", 7)) {
- /* exactmap option is for used defined memory */
- if (!memcmp(from+7, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
- /* If we are doing a crash dump, we
- * still need to know the real mem
- * size before original memory map is
- * reset.
- */
- saved_max_pfn = e820_end_of_ram();
-#endif
- from += 8+7;
- end_pfn_map = 0;
- e820.nr_map = 0;
- userdef = 1;
- }
- else {
- parse_memmapopt(from+7, &from);
- userdef = 1;
- }
- }
-
-#ifdef CONFIG_NUMA
- if (!memcmp(from, "numa=", 5))
- numa_setup(from+5);
-#endif
-
- if (!memcmp(from,"iommu=",6)) {
- iommu_setup(from+6);
- }
-
- if (fullarg(from,"oops=panic"))
- panic_on_oops = 1;
-
- if (!memcmp(from, "noexec=", 7))
- nonx_setup(from + 7);
-
-#ifdef CONFIG_KEXEC
- /* crashkernel=size@addr specifies the location to reserve for
- * a crash kernel. By reserving this memory we guarantee
- * that linux never set's it up as a DMA target.
- * Useful for holding code to do something appropriate
- * after a kernel panic.
- */
- else if (!memcmp(from, "crashkernel=", 12)) {
- unsigned long size, base;
- size = memparse(from+12, &from);
- if (*from == '@') {
- base = memparse(from+1, &from);
- /* FIXME: Do I want a sanity check
- * to validate the memory range?
- */
- crashk_res.start = base;
- crashk_res.end = base + size - 1;
- }
- }
+early_param("elfcorehdr", setup_elfcorehdr);
#endif
-#ifdef CONFIG_PROC_VMCORE
- /* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel. This option will be passed
- * by kexec loader to the capture kernel.
- */
- else if(!memcmp(from, "elfcorehdr=", 11))
- elfcorehdr_addr = memparse(from+11, &from);
-#endif
-
-#ifdef CONFIG_HOTPLUG_CPU
- else if (!memcmp(from, "additional_cpus=", 16))
- setup_additional_cpus(from+16);
-#endif
-
- next_char:
- c = *(from++);
- if (!c)
- break;
- if (COMMAND_LINE_SIZE <= ++len)
- break;
- *(to++) = c;
- }
- if (userdef) {
- printk(KERN_INFO "user-defined physical RAM map:\n");
- e820_print_map("user");
- }
- *to = '\0';
- *cmdline_p = command_line;
-}
-
#ifndef CONFIG_NUMA
static void __init
contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
@@ -469,85 +287,12 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n",bootmap_size);
bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
- e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT);
+ e820_register_active_regions(0, start_pfn, end_pfn);
+ free_bootmem_with_active_regions(0, end_pfn);
reserve_bootmem(bootmap, bootmap_size);
}
#endif
-/* Use inline assembly to define this because the nops are defined
- as inline assembly strings in the include files and we cannot
- get them easily into strings. */
-asm("\t.data\nk8nops: "
- K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
- K8_NOP7 K8_NOP8);
-
-extern unsigned char k8nops[];
-static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
- NULL,
- k8nops,
- k8nops + 1,
- k8nops + 1 + 2,
- k8nops + 1 + 2 + 3,
- k8nops + 1 + 2 + 3 + 4,
- k8nops + 1 + 2 + 3 + 4 + 5,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
-};
-
-extern char __vsyscall_0;
-
-/* Replace instructions with better alternatives for this CPU type.
-
- This runs before SMP is initialized to avoid SMP problems with
- self modifying code. This implies that assymetric systems where
- APs have less capabilities than the boot processor are not handled.
- In this case boot with "noreplacement". */
-void apply_alternatives(void *start, void *end)
-{
- struct alt_instr *a;
- int diff, i, k;
- for (a = start; (void *)a < end; a++) {
- u8 *instr;
-
- if (!boot_cpu_has(a->cpuid))
- continue;
-
- BUG_ON(a->replacementlen > a->instrlen);
- instr = a->instr;
- /* vsyscall code is not mapped yet. resolve it manually. */
- if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END)
- instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
- __inline_memcpy(instr, a->replacement, a->replacementlen);
- diff = a->instrlen - a->replacementlen;
-
- /* Pad the rest with nops */
- for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
- k = diff;
- if (k > ASM_NOP_MAX)
- k = ASM_NOP_MAX;
- __inline_memcpy(instr + i, k8_nops[k], k);
- }
- }
-}
-
-static int no_replacement __initdata = 0;
-
-void __init alternative_instructions(void)
-{
- extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
- if (no_replacement)
- return;
- apply_alternatives(__alt_instructions, __alt_instructions_end);
-}
-
-static int __init noreplacement_setup(char *s)
-{
- no_replacement = 1;
- return 1;
-}
-
-__setup("noreplacement", noreplacement_setup);
-
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
#ifdef CONFIG_EDD_MODULE
@@ -596,103 +341,9 @@ static void discover_ebda(void)
ebda_size = 64*1024;
}
-#ifdef CONFIG_SOFTWARE_SUSPEND
-static void __init mark_nosave_page_range(unsigned long start, unsigned long end)
-{
- struct page *page;
- while (start <= end) {
- page = pfn_to_page(start);
- SetPageNosave(page);
- start++;
- }
-}
-
-static void __init e820_nosave_reserved_pages(void)
-{
- int i;
- unsigned long r_start = 0, r_end = 0;
-
- /* Assume e820 map is sorted */
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long start, end;
-
- start = round_down(ei->addr, PAGE_SIZE);
- end = round_up(ei->addr + ei->size, PAGE_SIZE);
- if (start >= end)
- continue;
- if (ei->type == E820_RESERVED)
- continue;
- r_end = start>>PAGE_SHIFT;
- /* swsusp ignores invalid pfn, ignore these pages here */
- if (r_end > end_pfn)
- r_end = end_pfn;
- if (r_end > r_start)
- mark_nosave_page_range(r_start, r_end-1);
- if (r_end >= end_pfn)
- break;
- r_start = end>>PAGE_SHIFT;
- }
-}
-
-static void __init e820_save_acpi_pages(void)
-{
- int i;
-
- /* Assume e820 map is sorted */
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long start, end;
-
- start = ei->addr, PAGE_SIZE;
- end = ei->addr + ei->size;
- if (start >= end)
- continue;
- if (ei->type != E820_ACPI && ei->type != E820_NVS)
- continue;
- /*
- * If the region is below end_pfn, it will be
- * saved/restored by swsusp follow 'RAM' type.
- */
- if (start < (end_pfn << PAGE_SHIFT))
- start = end_pfn << PAGE_SHIFT;
- if (end > start)
- swsusp_add_arch_pages(start, end);
- }
-}
-
-extern char __start_rodata, __end_rodata;
-/*
- * BIOS reserved region/hole - no save/restore
- * ACPI NVS - save/restore
- * ACPI Data - this is a little tricky, the mem could be used by OS after OS
- * reads tables from the region, but anyway save/restore the memory hasn't any
- * side effect and Linux runtime module load/unload might use it.
- * kernel rodata - no save/restore (kernel rodata isn't changed)
- */
-static int __init mark_nosave_pages(void)
-{
- unsigned long pfn_start, pfn_end;
-
- /* BIOS reserved regions & holes */
- e820_nosave_reserved_pages();
-
- /* kernel rodata */
- pfn_start = round_up(__pa_symbol(&__start_rodata), PAGE_SIZE) >> PAGE_SHIFT;
- pfn_end = round_down(__pa_symbol(&__end_rodata), PAGE_SIZE) >> PAGE_SHIFT;
- mark_nosave_page_range(pfn_start, pfn_end-1);
-
- /* record ACPI Data/NVS as saveable */
- e820_save_acpi_pages();
-
- return 0;
-}
-core_initcall(mark_nosave_pages);
-#endif
-
void __init setup_arch(char **cmdline_p)
{
- unsigned long kernel_end;
+ printk(KERN_INFO "Command line: %s\n", saved_command_line);
ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
screen_info = SCREEN_INFO;
@@ -720,16 +371,22 @@ void __init setup_arch(char **cmdline_p)
data_resource.start = virt_to_phys(&_etext);
data_resource.end = virt_to_phys(&_edata)-1;
- parse_cmdline_early(cmdline_p);
-
early_identify_cpu(&boot_cpu_data);
+ strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
+
+ parse_early_param();
+
+ finish_e820_parsing();
+
+ e820_register_active_regions(0, 0, -1UL);
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
end_pfn = e820_end_of_ram();
- num_physpages = end_pfn; /* for pfn_valid */
+ num_physpages = end_pfn;
check_efer();
@@ -749,6 +406,14 @@ void __init setup_arch(char **cmdline_p)
acpi_boot_table_init();
#endif
+ /* How many end-of-memory variables you have, grandma! */
+ max_low_pfn = end_pfn;
+ max_pfn = end_pfn;
+ high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1;
+
+ /* Remove active ranges so rediscovery with NUMA-awareness happens */
+ remove_all_active_ranges();
+
#ifdef CONFIG_ACPI_NUMA
/*
* Parse SRAT to discover nodes.
@@ -767,8 +432,8 @@ void __init setup_arch(char **cmdline_p)
(table_end - table_start) << PAGE_SHIFT);
/* reserve kernel */
- kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE);
- reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY);
+ reserve_bootmem_generic(__pa_symbol(&_text),
+ __pa_symbol(&_end) - __pa_symbol(&_text));
/*
* reserve physical page 0 - it's a special BIOS page on many boxes,
@@ -798,12 +463,10 @@ void __init setup_arch(char **cmdline_p)
*/
acpi_reserve_bootmem();
#endif
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
-#endif
#ifdef CONFIG_BLK_DEV_INITRD
if (LOADER_TYPE && INITRD_START) {
if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) {
@@ -823,14 +486,16 @@ void __init setup_arch(char **cmdline_p)
#endif
#ifdef CONFIG_KEXEC
if (crashk_res.start != crashk_res.end) {
- reserve_bootmem(crashk_res.start,
+ reserve_bootmem_generic(crashk_res.start,
crashk_res.end - crashk_res.start + 1);
}
#endif
paging_init();
- check_ioapic();
+#ifdef CONFIG_PCI
+ early_quirks();
+#endif
/*
* set this early, so we dont allocate cpu0
@@ -847,14 +512,12 @@ void __init setup_arch(char **cmdline_p)
init_cpu_to_node();
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* get boot-time SMP configuration:
*/
if (smp_found_config)
get_smp_config();
init_apic_mappings();
-#endif
/*
* Request address space for all standard RAM and ROM resources
@@ -862,22 +525,19 @@ void __init setup_arch(char **cmdline_p)
*/
probe_roms();
e820_reserve_resources();
+ e820_mark_nosave_regions();
request_resource(&iomem_resource, &video_ram_resource);
{
unsigned i;
/* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < STANDARD_IO_RESOURCES; i++)
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
request_resource(&ioport_resource, &standard_io_resources[i]);
}
e820_setup_gap();
-#ifdef CONFIG_GART_IOMMU
- iommu_hole_init();
-#endif
-
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
conswitchp = &vga_con;
@@ -962,24 +622,32 @@ static int nearby_node(int apicid)
static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
- int cpu = smp_processor_id();
unsigned bits;
#ifdef CONFIG_NUMA
+ int cpu = smp_processor_id();
int node = 0;
unsigned apicid = hard_smp_processor_id();
#endif
+ unsigned ecx = cpuid_ecx(0x80000008);
+
+ c->x86_max_cores = (ecx & 0xff) + 1;
- bits = 0;
- while ((1 << bits) < c->x86_max_cores)
- bits++;
+ /* CPU telling us the core id bits shift? */
+ bits = (ecx >> 12) & 0xF;
+
+ /* Otherwise recompute */
+ if (bits == 0) {
+ while ((1 << bits) < c->x86_max_cores)
+ bits++;
+ }
/* Low order bits define the core id (index of core in socket) */
- cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1);
+ c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
/* Convert the APIC ID into the socket ID */
- phys_proc_id[cpu] = phys_pkg_id(bits);
+ c->phys_proc_id = phys_pkg_id(bits);
#ifdef CONFIG_NUMA
- node = phys_proc_id[cpu];
+ node = c->phys_proc_id;
if (apicid_to_node[apicid] != NUMA_NO_NODE)
node = apicid_to_node[apicid];
if (!node_online(node)) {
@@ -992,7 +660,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
but in the same order as the HT nodeids.
If that doesn't result in a usable node fall back to the
path for the previous case. */
- int ht_nodeid = apicid - (phys_proc_id[0] << bits);
+ int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);
if (ht_nodeid >= 0 &&
apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = apicid_to_node[ht_nodeid];
@@ -1002,15 +670,13 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
}
numa_set_node(cpu, node);
- printk(KERN_INFO "CPU %d/%x(%d) -> Node %d -> Core %d\n",
- cpu, apicid, c->x86_max_cores, node, cpu_core_id[cpu]);
+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
#endif
#endif
}
-static int __init init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
{
- int r;
unsigned level;
#ifdef CONFIG_SMP
@@ -1043,8 +709,8 @@ static int __init init_amd(struct cpuinfo_x86 *c)
if (c->x86 >= 6)
set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability);
- r = get_model_name(c);
- if (!r) {
+ level = get_model_name(c);
+ if (!level) {
switch (c->x86) {
case 15:
/* Should distinguish Models here, but this is only
@@ -1059,13 +725,18 @@ static int __init init_amd(struct cpuinfo_x86 *c)
if (c->x86_power & (1<<8))
set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
- if (c->extended_cpuid_level >= 0x80000008) {
- c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
-
+ /* Multi core CPU? */
+ if (c->extended_cpuid_level >= 0x80000008)
amd_detect_cmp(c);
- }
- return r;
+ /* Fix cpuid4 emulation for more */
+ num_cache_leaves = 3;
+
+ /* When there is only one core no need to synchronize RDTSC */
+ if (num_possible_cpus() == 1)
+ set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
+ else
+ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
}
static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -1073,13 +744,14 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
int index_msb, core_bits;
- int cpu = smp_processor_id();
cpuid(1, &eax, &ebx, &ecx, &edx);
- if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ if (!cpu_has(c, X86_FEATURE_HT))
return;
+ if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+ goto out;
smp_num_siblings = (ebx & 0xff0000) >> 16;
@@ -1094,10 +766,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
}
index_msb = get_count_order(smp_num_siblings);
- phys_proc_id[cpu] = phys_pkg_id(index_msb);
-
- printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- phys_proc_id[cpu]);
+ c->phys_proc_id = phys_pkg_id(index_msb);
smp_num_siblings = smp_num_siblings / c->x86_max_cores;
@@ -1105,13 +774,15 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
core_bits = get_count_order(c->x86_max_cores);
- cpu_core_id[cpu] = phys_pkg_id(index_msb) &
+ c->cpu_core_id = phys_pkg_id(index_msb) &
((1 << core_bits) - 1);
-
- if (c->x86_max_cores > 1)
- printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- cpu_core_id[cpu]);
}
+out:
+ if ((c->x86_max_cores * smp_num_siblings) > 1) {
+ printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id);
+ printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id);
+ }
+
#endif
}
@@ -1120,15 +791,12 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
*/
static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
{
- unsigned int eax;
+ unsigned int eax, t;
if (c->cpuid_level < 4)
return 1;
- __asm__("cpuid"
- : "=a" (eax)
- : "0" (4), "c" (0)
- : "bx", "dx");
+ cpuid_count(4, 0, &eax, &t, &t, &t);
if (eax & 0x1f)
return ((eax >> 26) + 1);
@@ -1141,16 +809,16 @@ static void srat_detect_node(void)
#ifdef CONFIG_NUMA
unsigned node;
int cpu = smp_processor_id();
+ int apicid = hard_smp_processor_id();
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
- node = apicid_to_node[hard_smp_processor_id()];
+ node = apicid_to_node[apicid];
if (node == NUMA_NO_NODE)
node = first_node(node_online_map);
numa_set_node(cpu, node);
- if (acpi_numa > 0)
- printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
+ printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
#endif
}
@@ -1160,6 +828,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
unsigned n;
init_intel_cacheinfo(c);
+ if (c->cpuid_level > 9 ) {
+ unsigned eax = cpuid_eax(10);
+ /* Check for version and the number of counters */
+ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+ set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability);
+ }
+
n = c->extended_cpuid_level;
if (n >= 0x80000008) {
unsigned eax = cpuid_eax(0x80000008);
@@ -1177,6 +852,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+ if (c->x86 == 6)
+ set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability);
set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability);
c->x86_max_cores = intel_num_cpu_cores(c);
@@ -1251,7 +928,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
}
#ifdef CONFIG_SMP
- phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
+ c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
#endif
}
@@ -1378,7 +1055,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL,
- NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow",
+ NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",
/* Transmeta-defined */
"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -1389,14 +1066,14 @@ static int show_cpuinfo(struct seq_file *m, void *v)
/* Other (Linux-defined) */
"cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,
"constant_tsc", NULL, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* Intel-defined (#2) */
"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
- "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+ NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* VIA/Cyrix/Centaur-defined */
@@ -1459,9 +1136,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
#ifdef CONFIG_SMP
if (smp_num_siblings * c->x86_max_cores > 1) {
int cpu = c - cpu_data;
- seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]);
+ seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu]));
- seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]);
+ seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
}
#endif
@@ -1535,7 +1212,7 @@ struct seq_operations cpuinfo_op = {
.show = show_cpuinfo,
};
-#ifdef CONFIG_INPUT_PCSPKR
+#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)
#include <linux/platform_device.h>
static __init int add_pcspkr(void)
{
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 8a691fa6d393..8c4b80fe71a1 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -3,9 +3,7 @@
* Copyright (C) 1995 Linus Torvalds
* Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.
* See setup.c for older changelog.
- * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $
*/
-#include <linux/config.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
@@ -26,11 +24,12 @@
#include <asm/proto.h>
#include <asm/sections.h>
-char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
+char x86_boot_params[BOOT_PARAM_SIZE] __initdata;
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
@@ -38,6 +37,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
unsigned long __supported_pte_mask __read_mostly = ~0UL;
+EXPORT_SYMBOL(__supported_pte_mask);
static int do_not_nx __cpuinitdata = 0;
/* noexec=on|off
@@ -46,8 +46,10 @@ Control non executable mappings for 64bit processes.
on Enable(default)
off Disable
*/
-int __init nonx_setup(char *str)
+static int __init nonx_setup(char *str)
{
+ if (!str)
+ return -EINVAL;
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
do_not_nx = 0;
@@ -55,9 +57,9 @@ int __init nonx_setup(char *str)
do_not_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
- return 1;
+ return 0;
}
-__setup("noexec=", nonx_setup); /* parsed early actually */
+early_param("noexec", nonx_setup);
int force_personality32 = 0;
@@ -93,12 +95,9 @@ void __init setup_per_cpu_areas(void)
#endif
/* Copy section for each CPU (we discard the original) */
- size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
-#ifdef CONFIG_MODULES
- if (size < PERCPU_ENOUGH_ROOM)
- size = PERCPU_ENOUGH_ROOM;
-#endif
+ size = PERCPU_ENOUGH_ROOM;
+ printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
for_each_cpu_mask (i, cpu_possible_map) {
char *ptr;
@@ -122,7 +121,10 @@ void pda_init(int cpu)
/* Setup up data that may be needed in __get_free_pages early */
asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
+ /* Memory clobbers used to order PDA accessed */
+ mb();
wrmsrl(MSR_GS_BASE, pda);
+ mb();
pda->cpunumber = cpu;
pda->irqcount = -1;
@@ -178,6 +180,8 @@ void __cpuinit check_efer(void)
}
}
+unsigned long kernel_eflags;
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -189,6 +193,7 @@ void __cpuinit cpu_init (void)
{
int cpu = stack_smp_processor_id();
struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
unsigned long v;
char *estacks = NULL;
struct task_struct *me;
@@ -234,29 +239,18 @@ void __cpuinit cpu_init (void)
* set up and load the per-CPU TSS
*/
for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+ static const unsigned int order[N_EXCEPTION_STACKS] = {
+ [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+ [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+ };
if (cpu) {
- static const unsigned int order[N_EXCEPTION_STACKS] = {
- [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
- [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
- };
-
estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
if (!estacks)
panic("Cannot allocate exception stack %ld %d\n",
v, cpu);
}
- switch (v + 1) {
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- case DEBUG_STACK:
- cpu_pda(cpu)->debugstack = (unsigned long)estacks;
- estacks += DEBUG_STKSZ;
- break;
-#endif
- default:
- estacks += EXCEPTION_STKSZ;
- break;
- }
- t->ist[v] = (unsigned long)estacks;
+ estacks += PAGE_SIZE << order[v];
+ orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks;
}
t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
@@ -289,4 +283,6 @@ void __cpuinit cpu_init (void)
set_debugreg(0UL, 7);
fpu_init();
+
+ raw_local_save_flags(kernel_eflags);
}
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index e5f5ce7909a3..49ec324cd141 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -7,8 +7,6 @@
* 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
* 2000-2002 x86-64 support by Andi Kleen
- *
- * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $
*/
#include <linux/sched.h>
@@ -40,37 +38,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
sigset_t *set, struct pt_regs * regs);
asmlinkage long
-sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs)
-{
- sigset_t saveset, newset;
-
- /* XXX: Don't preclude handling different sized sigset_t's. */
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&newset, unewset, sizeof(newset)))
- return -EFAULT;
- sigdelsetmask(&newset, ~_BLOCKABLE);
-
- spin_lock_irq(&current->sighand->siglock);
- saveset = current->blocked;
- current->blocked = newset;
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-#ifdef DEBUG_SIG
- printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n",
- saveset, newset, regs, regs->rip);
-#endif
- regs->rax = -EINTR;
- while (1) {
- current->state = TASK_INTERRUPTIBLE;
- schedule();
- if (do_signal(regs, &saveset))
- return -EINTR;
- }
-}
-
-asmlinkage long
sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
struct pt_regs *regs)
{
@@ -239,7 +206,6 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
rsp = regs->rsp - 128;
/* This is the X/Open sanctioned signal stack switching. */
- /* RED-PEN: redzone on that stack? */
if (ka->sa.sa_flags & SA_ONSTACK) {
if (sas_ss_flags(rsp) == 0)
rsp = current->sas_ss_sp + current->sas_ss_size;
@@ -311,11 +277,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
#endif
/* Set up registers for signal handler */
- {
- struct exec_domain *ed = current_thread_info()->exec_domain;
- if (unlikely(ed && ed->signal_invmap && sig < 32))
- sig = ed->signal_invmap[sig];
- }
regs->rdi = sig;
/* In case the signal handler was declared without prototypes */
regs->rax = 0;
@@ -344,11 +305,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
current->comm, current->pid, frame, regs->rip, frame->pretcode);
#endif
- return 1;
+ return 0;
give_sigsegv:
force_sigsegv(sig, current);
- return 0;
+ return -EFAULT;
}
/*
@@ -411,7 +372,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
#endif
ret = setup_rt_frame(sig, ka, info, oldset, regs);
- if (ret) {
+ if (ret == 0) {
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
@@ -428,11 +389,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
-int do_signal(struct pt_regs *regs, sigset_t *oldset)
+static void do_signal(struct pt_regs *regs)
{
struct k_sigaction ka;
siginfo_t info;
int signr;
+ sigset_t *oldset;
/*
* We want the common case to go fast, which
@@ -441,9 +403,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
* if so.
*/
if (!user_mode(regs))
- return 1;
+ return;
- if (!oldset)
+ if (test_thread_flag(TIF_RESTORE_SIGMASK))
+ oldset = &current->saved_sigmask;
+ else
oldset = &current->blocked;
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
@@ -457,30 +421,46 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
set_debugreg(current->thread.debugreg7, 7);
/* Whee! Actually deliver the signal. */
- return handle_signal(signr, &info, &ka, oldset, regs);
+ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
+ /* a signal was successfully delivered; the saved
+ * sigmask will have been stored in the signal frame,
+ * and will be restored by sigreturn, so we can simply
+ * clear the TIF_RESTORE_SIGMASK flag */
+ clear_thread_flag(TIF_RESTORE_SIGMASK);
+ }
+ return;
}
/* Did we come from a system call? */
if ((long)regs->orig_rax >= 0) {
/* Restart the system call - no handlers present */
long res = regs->rax;
- if (res == -ERESTARTNOHAND ||
- res == -ERESTARTSYS ||
- res == -ERESTARTNOINTR) {
+ switch (res) {
+ case -ERESTARTNOHAND:
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
regs->rax = regs->orig_rax;
regs->rip -= 2;
- }
- if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) {
+ break;
+ case -ERESTART_RESTARTBLOCK:
regs->rax = test_thread_flag(TIF_IA32) ?
__NR_ia32_restart_syscall :
__NR_restart_syscall;
regs->rip -= 2;
+ break;
}
}
- return 0;
+
+ /* if there's no signal to deliver, we just put the saved sigmask
+ back. */
+ if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
+ clear_thread_flag(TIF_RESTORE_SIGMASK);
+ sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+ }
}
-void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags)
+void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
#ifdef DEBUG_SIG
printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n",
@@ -494,8 +474,8 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_
}
/* deal with pending signal delivery */
- if (thread_info_flags & _TIF_SIGPENDING)
- do_signal(regs,oldset);
+ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
+ do_signal(regs);
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 4a6628b14d99..4f67697f5036 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -135,10 +135,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
cpu = smp_processor_id();
/*
- * orig_rax contains the interrupt vector - 256.
+ * orig_rax contains the negated interrupt vector.
* Use that to determine where the sender put the data.
*/
- sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
+ sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START;
f = &per_cpu(flush_state, sender);
if (!cpu_isset(cpu, f->flush_cpumask))
@@ -203,7 +203,7 @@ int __cpuinit init_smp_flush(void)
{
int i;
for_each_cpu_mask(i, cpu_possible_map) {
- spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
+ spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
}
return 0;
}
@@ -224,6 +224,7 @@ void flush_tlb_current_task(void)
flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
preempt_enable();
}
+EXPORT_SYMBOL(flush_tlb_current_task);
void flush_tlb_mm (struct mm_struct * mm)
{
@@ -244,6 +245,7 @@ void flush_tlb_mm (struct mm_struct * mm)
preempt_enable();
}
+EXPORT_SYMBOL(flush_tlb_mm);
void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
{
@@ -266,6 +268,7 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
preempt_enable();
}
+EXPORT_SYMBOL(flush_tlb_page);
static void do_flush_tlb_all(void* info)
{
@@ -443,6 +446,7 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
spin_unlock(&call_lock);
return 0;
}
+EXPORT_SYMBOL(smp_call_function);
void smp_stop_cpu(void)
{
@@ -460,7 +464,7 @@ static void smp_really_stop_cpu(void *dummy)
{
smp_stop_cpu();
for (;;)
- asm("hlt");
+ halt();
}
void smp_send_stop(void)
@@ -470,7 +474,7 @@ void smp_send_stop(void)
return;
/* Don't deadlock on the call lock in panic */
if (!spin_trylock(&call_lock)) {
- /* ignore locking because we have paniced anyways */
+ /* ignore locking because we have panicked anyways */
nolock = 1;
}
__smp_call_function(smp_really_stop_cpu, NULL, 0, 0);
@@ -518,26 +522,3 @@ asmlinkage void smp_call_function_interrupt(void)
}
}
-int safe_smp_processor_id(void)
-{
- int apicid, i;
-
- if (disable_apic)
- return 0;
-
- apicid = hard_smp_processor_id();
- if (x86_cpu_to_apicid[apicid] == apicid)
- return apicid;
-
- for (i = 0; i < NR_CPUS; ++i) {
- if (x86_cpu_to_apicid[i] == apicid)
- return i;
- }
-
- /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI,
- * or called too early. Either way, we must be CPU 0. */
- if (x86_cpu_to_apicid[0] == BAD_APICID)
- return 0;
-
- return 0; /* Should not happen */
-}
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 71a7222cf9ce..7b7a6870288a 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -38,7 +38,6 @@
*/
-#include <linux/config.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -47,9 +46,10 @@
#include <linux/bootmem.h>
#include <linux/thread_info.h>
#include <linux/module.h>
-
#include <linux/delay.h>
#include <linux/mc146818rtc.h>
+#include <linux/smp.h>
+
#include <asm/mtrr.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
@@ -63,13 +63,11 @@
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
-/* Package ID of each logical CPU */
-u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
-/* core ID of each logical CPU */
-u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+EXPORT_SYMBOL(smp_num_siblings);
/* Last level cache ID of each logical CPU */
u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+EXPORT_SYMBOL(cpu_llc_id);
/* Bitmask of currently online CPUs */
cpumask_t cpu_online_map __read_mostly;
@@ -82,18 +80,21 @@ EXPORT_SYMBOL(cpu_online_map);
*/
cpumask_t cpu_callin_map;
cpumask_t cpu_callout_map;
+EXPORT_SYMBOL(cpu_callout_map);
cpumask_t cpu_possible_map;
EXPORT_SYMBOL(cpu_possible_map);
/* Per CPU bogomips and other parameters */
struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
+EXPORT_SYMBOL(cpu_data);
/* Set when the idlers are all forked */
int smp_threads_ready;
/* representing HT siblings of each logical CPU */
cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(cpu_sibling_map);
/* representing HT and core siblings of each logical CPU */
cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
@@ -454,10 +455,12 @@ cpumask_t cpu_coregroup_map(int cpu)
struct cpuinfo_x86 *c = cpu_data + cpu;
/*
* For perf, we return last level cache shared map.
- * TBD: when power saving sched policy is added, we will return
- * cpu_core_map when power saving policy is enabled
+ * And for power savings, we return cpu_core_map
*/
- return c->llc_shared_map;
+ if (sched_mc_power_savings || sched_smt_power_savings)
+ return cpu_core_map[cpu];
+ else
+ return c->llc_shared_map;
}
/* representing cpus for which sibling maps can be computed */
@@ -472,8 +475,8 @@ static inline void set_cpu_sibling_map(int cpu)
if (smp_num_siblings > 1) {
for_each_cpu_mask(i, cpu_sibling_setup_map) {
- if (phys_proc_id[cpu] == phys_proc_id[i] &&
- cpu_core_id[cpu] == cpu_core_id[i]) {
+ if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+ c[cpu].cpu_core_id == c[i].cpu_core_id) {
cpu_set(i, cpu_sibling_map[cpu]);
cpu_set(cpu, cpu_sibling_map[i]);
cpu_set(i, cpu_core_map[cpu]);
@@ -500,7 +503,7 @@ static inline void set_cpu_sibling_map(int cpu)
cpu_set(i, c[cpu].llc_shared_map);
cpu_set(cpu, c[i].llc_shared_map);
}
- if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
/*
@@ -769,7 +772,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
unsigned long start_rip;
struct create_idle c_idle = {
.cpu = cpu,
- .done = COMPLETION_INITIALIZER(c_idle.done),
+ .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
};
DECLARE_WORK(work, do_fork_idle, &c_idle);
@@ -797,6 +800,8 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)
}
+ alternatives_smp_switch(1);
+
c_idle.idle = get_idle_for_cpu(cpu);
if (c_idle.idle) {
@@ -1086,7 +1091,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
/*
* Switch from PIC to APIC mode.
*/
- connect_bsp_APIC();
setup_local_APIC();
if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
@@ -1171,12 +1175,9 @@ int __cpuinit __cpu_up(unsigned int cpu)
void __init smp_cpus_done(unsigned int max_cpus)
{
smp_cleanup_boot();
-
-#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
-#endif
-
check_nmi_watchdog();
+ time_init_gtod();
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1199,8 +1200,8 @@ static void remove_siblinginfo(int cpu)
cpu_clear(cpu, cpu_sibling_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
- phys_proc_id[cpu] = BAD_APICID;
- cpu_core_id[cpu] = BAD_APICID;
+ c[cpu].phys_proc_id = 0;
+ c[cpu].cpu_core_id = 0;
cpu_clear(cpu, cpu_sibling_setup_map);
}
@@ -1229,6 +1230,8 @@ int __cpu_disable(void)
if (cpu == 0)
return -EBUSY;
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ stop_apic_nmi_watchdog(NULL);
clear_local_APIC();
/*
@@ -1259,6 +1262,8 @@ void __cpu_die(unsigned int cpu)
/* They ack this in play_dead by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
printk ("CPU %d is now offline\n", cpu);
+ if (1 == num_online_cpus())
+ alternatives_smp_switch(0);
return;
}
msleep(100);
@@ -1266,11 +1271,11 @@ void __cpu_die(unsigned int cpu)
printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
-__init int setup_additional_cpus(char *s)
+static __init int setup_additional_cpus(char *s)
{
- return get_option(&s, &additional_cpus);
+ return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
}
-__setup("additional_cpus=", setup_additional_cpus);
+early_param("additional_cpus", setup_additional_cpus);
#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c
new file mode 100644
index 000000000000..6026b31d037e
--- /dev/null
+++ b/arch/x86_64/kernel/stacktrace.c
@@ -0,0 +1,55 @@
+/*
+ * arch/x86_64/kernel/stacktrace.c
+ *
+ * Stack trace management functions
+ *
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ */
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+#include <linux/module.h>
+#include <asm/stacktrace.h>
+
+static void save_stack_warning(void *data, char *msg)
+{
+}
+
+static void
+save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+}
+
+static int save_stack_stack(void *data, char *name)
+{
+ struct stack_trace *trace = (struct stack_trace *)data;
+ return trace->all_contexts ? 0 : -1;
+}
+
+static void save_stack_address(void *data, unsigned long addr)
+{
+ struct stack_trace *trace = (struct stack_trace *)data;
+ if (trace->skip > 0) {
+ trace->skip--;
+ return;
+ }
+ if (trace->nr_entries < trace->max_entries - 1)
+ trace->entries[trace->nr_entries++] = addr;
+}
+
+static struct stacktrace_ops save_stack_ops = {
+ .warning = save_stack_warning,
+ .warning_symbol = save_stack_warning_symbol,
+ .stack = save_stack_stack,
+ .address = save_stack_address,
+};
+
+/*
+ * Save stack-backtrace addresses into a stack_trace buffer.
+ */
+void save_stack_trace(struct stack_trace *trace, struct task_struct *task)
+{
+ dump_trace(task, NULL, NULL, &save_stack_ops, trace);
+ trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+EXPORT_SYMBOL(save_stack_trace);
+
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ecbd34c1093d..91f7e678bae7 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -7,7 +7,6 @@
* Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
*/
-#include <linux/config.h>
#include <linux/smp.h>
#include <linux/suspend.h>
#include <asm/proto.h>
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index 320b6fb00cca..bfbe00763c68 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -54,7 +54,7 @@ ENTRY(restore_image)
movq %rcx, %cr3;
movq %rax, %cr4; # turn PGE back on
- movq pagedir_nosave(%rip), %rdx
+ movq restore_pblist(%rip), %rdx
loop:
testq %rdx, %rdx
jz done
diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c
index 7c176b3edde0..213fd6ab789d 100644
--- a/arch/x86_64/kernel/syscall.c
+++ b/arch/x86_64/kernel/syscall.c
@@ -3,7 +3,6 @@
#include <linux/linkage.h>
#include <linux/sys.h>
#include <linux/cache.h>
-#include <linux/config.h>
#define __NO_STUBS
diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
new file mode 100644
index 000000000000..cbabfdf78e06
--- /dev/null
+++ b/arch/x86_64/kernel/tce.c
@@ -0,0 +1,194 @@
+/*
+ * This file manages the translation entries for the IBM Calgary IOMMU.
+ *
+ * Derived from arch/powerpc/platforms/pseries/iommu.c
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author: Jon Mason <jdmason@us.ibm.com>
+ * Author: Muli Ben-Yehuda <muli@il.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/bootmem.h>
+#include <asm/tce.h>
+#include <asm/calgary.h>
+#include <asm/proto.h>
+
+/* flush a tce at 'tceaddr' to main memory */
+static inline void flush_tce(void* tceaddr)
+{
+ /* a single tce can't cross a cache line */
+ if (cpu_has_clflush)
+ asm volatile("clflush (%0)" :: "r" (tceaddr));
+ else
+ asm volatile("wbinvd":::"memory");
+}
+
+void tce_build(struct iommu_table *tbl, unsigned long index,
+ unsigned int npages, unsigned long uaddr, int direction)
+{
+ u64* tp;
+ u64 t;
+ u64 rpn;
+
+ t = (1 << TCE_READ_SHIFT);
+ if (direction != DMA_TO_DEVICE)
+ t |= (1 << TCE_WRITE_SHIFT);
+
+ tp = ((u64*)tbl->it_base) + index;
+
+ while (npages--) {
+ rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
+ t &= ~TCE_RPN_MASK;
+ t |= (rpn << TCE_RPN_SHIFT);
+
+ *tp = cpu_to_be64(t);
+ flush_tce(tp);
+
+ uaddr += PAGE_SIZE;
+ tp++;
+ }
+}
+
+void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
+{
+ u64* tp;
+
+ tp = ((u64*)tbl->it_base) + index;
+
+ while (npages--) {
+ *tp = cpu_to_be64(0);
+ flush_tce(tp);
+ tp++;
+ }
+}
+
+static inline unsigned int table_size_to_number_of_entries(unsigned char size)
+{
+ /*
+ * size is the order of the table, 0-7
+ * smallest table is 8K entries, so shift result by 13 to
+ * multiply by 8K
+ */
+ return (1 << size) << 13;
+}
+
+static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
+{
+ unsigned int bitmapsz;
+ unsigned long bmppages;
+ int ret;
+
+ tbl->it_busno = dev->bus->number;
+
+ /* set the tce table size - measured in entries */
+ tbl->it_size = table_size_to_number_of_entries(specified_table_size);
+
+ /*
+ * number of bytes needed for the bitmap size in number of
+ * entries; we need one bit per entry
+ */
+ bitmapsz = tbl->it_size / BITS_PER_BYTE;
+ bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
+ if (!bmppages) {
+ printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ tbl->it_map = (unsigned long*)bmppages;
+
+ memset(tbl->it_map, 0, bitmapsz);
+
+ tbl->it_hint = 0;
+
+ spin_lock_init(&tbl->it_lock);
+
+ return 0;
+
+done:
+ return ret;
+}
+
+int build_tce_table(struct pci_dev *dev, void __iomem *bbar)
+{
+ struct iommu_table *tbl;
+ int ret;
+
+ if (dev->sysdata) {
+ printk(KERN_ERR "Calgary: dev %p has sysdata %p\n",
+ dev, dev->sysdata);
+ BUG();
+ }
+
+ tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
+ if (!tbl) {
+ printk(KERN_ERR "Calgary: error allocating iommu_table\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ ret = tce_table_setparms(dev, tbl);
+ if (ret)
+ goto free_tbl;
+
+ tbl->bbar = bbar;
+
+ /*
+ * NUMA is already using the bus's sysdata pointer, so we use
+ * the bus's pci_dev's sysdata instead.
+ */
+ dev->sysdata = tbl;
+
+ return 0;
+
+free_tbl:
+ kfree(tbl);
+done:
+ return ret;
+}
+
+void* alloc_tce_table(void)
+{
+ unsigned int size;
+
+ size = table_size_to_number_of_entries(specified_table_size);
+ size *= TCE_ENTRY_SIZE;
+
+ return __alloc_bootmem_low(size, size, 0);
+}
+
+void free_tce_table(void *tbl)
+{
+ unsigned int size;
+
+ if (!tbl)
+ return;
+
+ size = table_size_to_number_of_entries(specified_table_size);
+ size *= TCE_ENTRY_SIZE;
+
+ free_bootmem(__pa(tbl), size);
+}
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 7392570f975d..7ea3bf2a858c 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -8,7 +8,7 @@
* Copyright (c) 1995 Markus Kuhn
* Copyright (c) 1996 Ingo Molnar
* Copyright (c) 1998 Andrea Arcangeli
- * Copyright (c) 2002 Vojtech Pavlik
+ * Copyright (c) 2002,2006 Vojtech Pavlik
* Copyright (c) 2003 Andi Kleen
* RTC support code taken from arch/i386/kernel/timers/time_hpet.c
*/
@@ -24,10 +24,13 @@
#include <linux/device.h>
#include <linux/sysdev.h>
#include <linux/bcd.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
#include <linux/kallsyms.h>
#include <linux/acpi.h>
#ifdef CONFIG_ACPI
#include <acpi/achware.h> /* for PM timer frequency */
+#include <acpi/acpi_bus.h>
#endif
#include <asm/8253pit.h>
#include <asm/pgtable.h>
@@ -38,9 +41,7 @@
#include <asm/sections.h>
#include <linux/cpufreq.h>
#include <linux/hpet.h>
-#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/apic.h>
-#endif
#ifdef CONFIG_CPU_FREQ
static void cpufreq_delayed_get(void);
@@ -48,17 +49,24 @@ static void cpufreq_delayed_get(void);
extern void i8254_timer_resume(void);
extern int using_apic_timer;
-static char *time_init_gtod(void);
+static char *timename = NULL;
DEFINE_SPINLOCK(rtc_lock);
+EXPORT_SYMBOL(rtc_lock);
DEFINE_SPINLOCK(i8253_lock);
int nohpet __initdata = 0;
static int notsc __initdata = 0;
-#undef HPET_HACK_ENABLE_DANGEROUS
+#define USEC_PER_TICK (USEC_PER_SEC / HZ)
+#define NSEC_PER_TICK (NSEC_PER_SEC / HZ)
+#define FSEC_PER_TICK (FSEC_PER_SEC / HZ)
+
+#define NS_SCALE 10 /* 2^10, carefully chosen */
+#define US_SCALE 32 /* 2^32, arbitrarily chosen */
unsigned int cpu_khz; /* TSC clocks / usec, not used here */
+EXPORT_SYMBOL(cpu_khz);
static unsigned long hpet_period; /* fsecs / HPET clock */
unsigned long hpet_tick; /* HPET clocks / interrupt */
int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */
@@ -90,7 +98,7 @@ static inline unsigned int do_gettimeoffset_tsc(void)
t = get_cycles_sync();
if (t < vxtime.last_tsc)
t = vxtime.last_tsc; /* hack */
- x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32;
+ x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE;
return x;
}
@@ -98,7 +106,7 @@ static inline unsigned int do_gettimeoffset_hpet(void)
{
/* cap counter read to one tick to avoid inconsistencies */
unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last;
- return (min(counter,hpet_tick) * vxtime.quot) >> 32;
+ return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE;
}
unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
@@ -118,7 +126,7 @@ void do_gettimeofday(struct timeval *tv)
seq = read_seqbegin(&xtime_lock);
sec = xtime.tv_sec;
- usec = xtime.tv_nsec / 1000;
+ usec = xtime.tv_nsec / NSEC_PER_USEC;
/* i386 does some correction here to keep the clock
monotonous even when ntpd is fixing drift.
@@ -129,14 +137,14 @@ void do_gettimeofday(struct timeval *tv)
in arch/x86_64/kernel/vsyscall.c and export all needed
variables in vmlinux.lds. -AK */
- t = (jiffies - wall_jiffies) * (1000000L / HZ) +
+ t = (jiffies - wall_jiffies) * USEC_PER_TICK +
do_gettimeoffset();
usec += t;
} while (read_seqretry(&xtime_lock, seq));
- tv->tv_sec = sec + usec / 1000000;
- tv->tv_usec = usec % 1000000;
+ tv->tv_sec = sec + usec / USEC_PER_SEC;
+ tv->tv_usec = usec % USEC_PER_SEC;
}
EXPORT_SYMBOL(do_gettimeofday);
@@ -157,8 +165,8 @@ int do_settimeofday(struct timespec *tv)
write_seqlock_irq(&xtime_lock);
- nsec -= do_gettimeoffset() * 1000 +
- (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ);
+ nsec -= do_gettimeoffset() * NSEC_PER_USEC +
+ (jiffies - wall_jiffies) * NSEC_PER_TICK;
wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
@@ -179,20 +187,15 @@ unsigned long profile_pc(struct pt_regs *regs)
{
unsigned long pc = instruction_pointer(regs);
- /* Assume the lock function has either no stack frame or only a single
- word. This checks if the address on the stack looks like a kernel
- text address.
- There is a small window for false hits, but in that case the tick
- is just accounted to the spinlock function.
- Better would be to write these functions in assembler again
- and check exactly. */
- if (in_lock_functions(pc)) {
- char *v = *(char **)regs->rsp;
- if ((v >= _stext && v <= _etext) ||
- (v >= _sinittext && v <= _einittext) ||
- (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END))
- return (unsigned long)v;
- return ((unsigned long *)regs->rsp)[1];
+ /* Assume the lock function has either no stack frame or a copy
+ of eflags from PUSHF.
+ Eflags always has bits 22 and up cleared, unlike kernel addresses. */
+ if (!user_mode(regs) && in_lock_functions(pc)) {
+ unsigned long *sp = (unsigned long *)regs->rsp;
+ if (sp[0] >> 22)
+ return sp[0];
+ if (sp[1] >> 22)
+ return sp[1];
}
return pc;
}
@@ -273,6 +276,7 @@ static void set_rtc_mmss(unsigned long nowtime)
* Note: This function is required to return accurate
* time even in the absence of multiple timer ticks.
*/
+static inline unsigned long long cycles_2_ns(unsigned long long cyc);
unsigned long long monotonic_clock(void)
{
unsigned long seq;
@@ -288,7 +292,7 @@ unsigned long long monotonic_clock(void)
this_offset = hpet_readl(HPET_COUNTER);
} while (read_seqretry(&xtime_lock, seq));
offset = (this_offset - last_offset);
- offset *= (NSEC_PER_SEC/HZ) / hpet_tick;
+ offset *= NSEC_PER_TICK / hpet_tick;
} else {
do {
seq = read_seqbegin(&xtime_lock);
@@ -297,7 +301,7 @@ unsigned long long monotonic_clock(void)
base = monotonic_base;
} while (read_seqretry(&xtime_lock, seq));
this_offset = get_cycles_sync();
- offset = (this_offset - last_offset)*1000 / cpu_khz;
+ offset = cycles_2_ns(this_offset - last_offset);
}
return base + offset;
}
@@ -382,7 +386,7 @@ void main_timer_handler(struct pt_regs *regs)
}
monotonic_base +=
- (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
+ (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick;
vxtime.last = offset;
#ifdef CONFIG_X86_PM_TIMER
@@ -391,36 +395,36 @@ void main_timer_handler(struct pt_regs *regs)
#endif
} else {
offset = (((tsc - vxtime.last_tsc) *
- vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
+ vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK;
if (offset < 0)
offset = 0;
- if (offset > (USEC_PER_SEC / HZ)) {
- lost = offset / (USEC_PER_SEC / HZ);
- offset %= (USEC_PER_SEC / HZ);
+ if (offset > USEC_PER_TICK) {
+ lost = offset / USEC_PER_TICK;
+ offset %= USEC_PER_TICK;
}
- monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ;
+ monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc);
vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
if ((((tsc - vxtime.last_tsc) *
- vxtime.tsc_quot) >> 32) < offset)
+ vxtime.tsc_quot) >> US_SCALE) < offset)
vxtime.last_tsc = tsc -
- (((long) offset << 32) / vxtime.tsc_quot) - 1;
+ (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1;
}
- if (lost > 0) {
+ if (lost > 0)
handle_lost_ticks(lost, regs);
- jiffies += lost;
- }
+ else
+ lost = 0;
/*
* Do the timer stuff.
*/
- do_timer(regs);
+ do_timer(lost + 1);
#ifndef CONFIG_SMP
update_process_times(user_mode(regs));
#endif
@@ -431,12 +435,8 @@ void main_timer_handler(struct pt_regs *regs)
* have to call the local interrupt handler.
*/
-#ifndef CONFIG_X86_LOCAL_APIC
- profile_tick(CPU_PROFILING, regs);
-#else
if (!using_apic_timer)
smp_local_timer_interrupt(regs);
-#endif
/*
* If we have an externally synchronized Linux clock, then update CMOS clock
@@ -460,24 +460,21 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
if (apic_runs_main_timer > 1)
return IRQ_HANDLED;
main_timer_handler(regs);
-#ifdef CONFIG_X86_LOCAL_APIC
if (using_apic_timer)
smp_send_timer_broadcast_ipi();
-#endif
return IRQ_HANDLED;
}
static unsigned int cyc2ns_scale __read_mostly;
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
static inline void set_cyc2ns_scale(unsigned long cpu_khz)
{
- cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
+ cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz;
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+ return (cyc * cyc2ns_scale) >> NS_SCALE;
}
unsigned long long sched_clock(void)
@@ -490,7 +487,7 @@ unsigned long long sched_clock(void)
Disadvantage is a small drift between CPUs in some configurations,
but that should be tolerable. */
if (__vxtime.mode == VXTIME_HPET)
- return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32;
+ return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE;
#endif
/* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
@@ -633,7 +630,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
- vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+ vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
}
set_cyc2ns_scale(cpu_khz_ref);
@@ -789,8 +786,8 @@ static int hpet_timer_stop_set_go(unsigned long tick)
if (hpet_use_timer) {
hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
HPET_TN_32BIT, HPET_T0_CFG);
- hpet_writel(hpet_tick, HPET_T0_CMP);
- hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */
+ hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */
+ hpet_writel(hpet_tick, HPET_T0_CMP); /* period */
cfg |= HPET_CFG_LEGACY;
}
/*
@@ -825,8 +822,7 @@ static int hpet_init(void)
if (hpet_period < 100000 || hpet_period > 100000000)
return -1;
- hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) /
- hpet_period;
+ hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;
hpet_use_timer = (id & HPET_ID_LEGSUP);
@@ -882,26 +878,20 @@ int __init time_setup(char *str)
}
static struct irqaction irq0 = {
- timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL
+ timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL
};
+static int __cpuinit
+time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+ unsigned cpu = (unsigned long) hcpu;
+ if (action == CPU_ONLINE)
+ vsyscall_set_cpu(cpu);
+ return NOTIFY_DONE;
+}
+
void __init time_init(void)
{
- char *timename;
- char *gtod;
-
-#ifdef HPET_HACK_ENABLE_DANGEROUS
- if (!vxtime.hpet_address) {
- printk(KERN_WARNING "time.c: WARNING: Enabling HPET base "
- "manually!\n");
- outl(0x800038a0, 0xcf8);
- outl(0xff000001, 0xcfc);
- outl(0x800038a0, 0xcf8);
- vxtime.hpet_address = inl(0xcfc) & 0xfffffffe;
- printk(KERN_WARNING "time.c: WARNING: Enabled HPET "
- "at %#lx.\n", vxtime.hpet_address);
- }
-#endif
if (nohpet)
vxtime.hpet_address = 0;
@@ -912,7 +902,7 @@ void __init time_init(void)
-xtime.tv_sec, -xtime.tv_nsec);
if (!hpet_init())
- vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period;
+ vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period;
else
vxtime.hpet_address = 0;
@@ -935,18 +925,17 @@ void __init time_init(void)
}
vxtime.mode = VXTIME_TSC;
- gtod = time_init_gtod();
-
- printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
- vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod);
- printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
- cpu_khz / 1000, cpu_khz % 1000);
- vxtime.quot = (1000000L << 32) / vxtime_hz;
- vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+ vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
+ vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
vxtime.last_tsc = get_cycles_sync();
+ set_cyc2ns_scale(cpu_khz);
setup_irq(0, &irq0);
+ hotcpu_notifier(time_cpu_notifier, 0);
+ time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id());
- set_cyc2ns_scale(cpu_khz);
+#ifndef CONFIG_SMP
+ time_init_gtod();
+#endif
}
/*
@@ -956,13 +945,20 @@ void __init time_init(void)
__cpuinit int unsynchronized_tsc(void)
{
#ifdef CONFIG_SMP
- if (oem_force_hpet_timer())
+ if (apic_is_clustered_box())
return 1;
- /* Intel systems are normally all synchronized. Exceptions
- are handled in the OEM check above. */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- return 0;
#endif
+ /* Most Intel systems have synchronized TSCs except for
+ multi-node systems */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+#ifdef CONFIG_ACPI
+ /* But TSC doesn't tick in C3 so don't use it there */
+ if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100)
+ return 1;
+#endif
+ return 0;
+ }
+
/* Assume multi socket systems are not synchronized */
return num_present_cpus() > 1;
}
@@ -970,12 +966,18 @@ __cpuinit int unsynchronized_tsc(void)
/*
* Decide what mode gettimeofday should use.
*/
-__init static char *time_init_gtod(void)
+void time_init_gtod(void)
{
char *timetype;
if (unsynchronized_tsc())
notsc = 1;
+
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
+ vgetcpu_mode = VGETCPU_RDTSCP;
+ else
+ vgetcpu_mode = VGETCPU_LSL;
+
if (vxtime.hpet_address && notsc) {
timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
if (hpet_use_timer)
@@ -998,7 +1000,16 @@ __init static char *time_init_gtod(void)
timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
vxtime.mode = VXTIME_TSC;
}
- return timetype;
+
+ printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n",
+ vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype);
+ printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",
+ cpu_khz / 1000, cpu_khz % 1000);
+ vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz;
+ vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
+ vxtime.last_tsc = get_cycles_sync();
+
+ set_cyc2ns_scale(cpu_khz);
}
__setup("report_lost_ticks", time_setup);
@@ -1028,8 +1039,16 @@ static int timer_resume(struct sys_device *dev)
unsigned long flags;
unsigned long sec;
unsigned long ctime = get_cmos_time();
- unsigned long sleep_length = (ctime - sleep_start) * HZ;
+ long sleep_length = (ctime - sleep_start) * HZ;
+ if (sleep_length < 0) {
+ printk(KERN_WARNING "Time skew detected in timer resume!\n");
+ /* The time after the resume must not be earlier than the time
+ * before the suspend or some nasty things will happen
+ */
+ sleep_length = 0;
+ ctime = sleep_start;
+ }
if (vxtime.hpet_address)
hpet_reenable();
else
@@ -1145,23 +1164,25 @@ int hpet_rtc_timer_init(void)
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
local_irq_save(flags);
+
cnt = hpet_readl(HPET_COUNTER);
cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
hpet_writel(cnt, HPET_T1_CMP);
hpet_t1_cmp = cnt;
- local_irq_restore(flags);
cfg = hpet_readl(HPET_T1_CFG);
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
hpet_writel(cfg, HPET_T1_CFG);
+ local_irq_restore(flags);
+
return 1;
}
static void hpet_rtc_timer_reinit(void)
{
- unsigned int cfg, cnt;
+ unsigned int cfg, cnt, ticks_per_int, lost_ints;
if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
cfg = hpet_readl(HPET_T1_CFG);
@@ -1176,10 +1197,33 @@ static void hpet_rtc_timer_reinit(void)
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
/* It is more accurate to use the comparator value than current count.*/
- cnt = hpet_t1_cmp;
- cnt += hpet_tick*HZ/hpet_rtc_int_freq;
- hpet_writel(cnt, HPET_T1_CMP);
- hpet_t1_cmp = cnt;
+ ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
+ hpet_t1_cmp += ticks_per_int;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+ /*
+ * If the interrupt handler was delayed too long, the write above tries
+ * to schedule the next interrupt in the past and the hardware would
+ * not interrupt until the counter had wrapped around.
+ * So we have to check that the comparator wasn't set to a past time.
+ */
+ cnt = hpet_readl(HPET_COUNTER);
+ if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
+ lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
+ /* Make sure that, even with the time needed to execute
+ * this code, the next scheduled interrupt has been moved
+ * back to the future: */
+ lost_ints++;
+
+ hpet_t1_cmp += lost_ints * ticks_per_int;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+ if (PIE_on)
+ PIE_count += lost_ints;
+
+ printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+ hpet_rtc_int_freq);
+ }
}
/*
diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S
index 23a03eb91fc7..c79b99a9e2f6 100644
--- a/arch/x86_64/kernel/trampoline.S
+++ b/arch/x86_64/kernel/trampoline.S
@@ -64,7 +64,7 @@ idt_48:
.word 0, 0 # idt base = 0L
gdt_48:
- .short __KERNEL32_CS + 7 # gdt limit
+ .short GDT_ENTRIES*8 - 1 # gdt limit
.long cpu_gdt_table-__START_KERNEL_map
.globl trampoline_end
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index cea335e8746c..01f2a8d254c2 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -6,15 +6,12 @@
*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
- *
- * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $
*/
/*
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'entry.S'.
*/
-#include <linux/config.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/string.h>
@@ -31,6 +28,7 @@
#include <linux/nmi.h>
#include <linux/kprobes.h>
#include <linux/kexec.h>
+#include <linux/unwind.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -41,12 +39,13 @@
#include <asm/i387.h>
#include <asm/kdebug.h>
#include <asm/processor.h>
-
+#include <asm/unwind.h>
#include <asm/smp.h>
#include <asm/pgalloc.h>
#include <asm/pda.h>
#include <asm/proto.h>
#include <asm/nmi.h>
+#include <asm/stacktrace.h>
asmlinkage void divide_error(void);
asmlinkage void debug(void);
@@ -71,19 +70,20 @@ asmlinkage void machine_check(void);
asmlinkage void spurious_interrupt_bug(void);
ATOMIC_NOTIFIER_HEAD(die_chain);
+EXPORT_SYMBOL(die_chain);
int register_die_notifier(struct notifier_block *nb)
{
vmalloc_sync_all();
return atomic_notifier_chain_register(&die_chain, nb);
}
-EXPORT_SYMBOL(register_die_notifier);
+EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
int unregister_die_notifier(struct notifier_block *nb)
{
return atomic_notifier_chain_unregister(&die_chain, nb);
}
-EXPORT_SYMBOL(unregister_die_notifier);
+EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
static inline void conditional_sti(struct pt_regs *regs)
{
@@ -107,35 +107,43 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
preempt_enable_no_resched();
}
-static int kstack_depth_to_print = 10;
+static int kstack_depth_to_print = 12;
+#ifdef CONFIG_STACK_UNWIND
+static int call_trace = 1;
+#else
+#define call_trace (-1)
+#endif
#ifdef CONFIG_KALLSYMS
-#include <linux/kallsyms.h>
-int printk_address(unsigned long address)
-{
+# include <linux/kallsyms.h>
+void printk_address(unsigned long address)
+{
unsigned long offset = 0, symsize;
const char *symname;
char *modname;
- char *delim = ":";
+ char *delim = ":";
char namebuf[128];
- symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
- if (!symname)
- return printk("[<%016lx>]", address);
- if (!modname)
+ symname = kallsyms_lookup(address, &symsize, &offset,
+ &modname, namebuf);
+ if (!symname) {
+ printk(" [<%016lx>]\n", address);
+ return;
+ }
+ if (!modname)
modname = delim = "";
- return printk("<%016lx>{%s%s%s%s%+ld}",
- address, delim, modname, delim, symname, offset);
-}
+ printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n",
+ address, delim, modname, delim, symname, offset, symsize);
+}
#else
-int printk_address(unsigned long address)
-{
- return printk("[<%016lx>]", address);
-}
+void printk_address(unsigned long address)
+{
+ printk(" [<%016lx>]\n", address);
+}
#endif
static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
- unsigned *usedp, const char **idp)
+ unsigned *usedp, char **idp)
{
static char ids[][8] = {
[DEBUG_STACK - 1] = "#DB",
@@ -149,32 +157,49 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
};
unsigned k;
+ /*
+ * Iterate over all exception stacks, and figure out whether
+ * 'stack' is in one of them:
+ */
for (k = 0; k < N_EXCEPTION_STACKS; k++) {
- unsigned long end;
-
- switch (k + 1) {
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
- case DEBUG_STACK:
- end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ;
- break;
-#endif
- default:
- end = per_cpu(init_tss, cpu).ist[k];
- break;
- }
+ unsigned long end = per_cpu(orig_ist, cpu).ist[k];
+ /*
+ * Is 'stack' above this exception frame's end?
+ * If yes then skip to the next frame.
+ */
if (stack >= end)
continue;
+ /*
+ * Is 'stack' above this exception frame's start address?
+ * If yes then we found the right frame.
+ */
if (stack >= end - EXCEPTION_STKSZ) {
+ /*
+ * Make sure we only iterate through an exception
+ * stack once. If it comes up for the second time
+ * then there's something wrong going on - just
+ * break out and return NULL:
+ */
if (*usedp & (1U << k))
break;
*usedp |= 1U << k;
*idp = ids[k];
return (unsigned long *)end;
}
+ /*
+ * If this is a debug stack, and if it has a larger size than
+ * the usual exception stacks, then 'stack' might still
+ * be within the lower portion of the debug stack:
+ */
#if DEBUG_STKSZ > EXCEPTION_STKSZ
if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
unsigned j = N_EXCEPTION_STACKS - 1;
+ /*
+ * Black magic. A large debug stack is composed of
+ * multiple exception stack entries, which we
+ * iterate through now. Dont look:
+ */
do {
++j;
end -= EXCEPTION_STKSZ;
@@ -191,6 +216,25 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
return NULL;
}
+/*
+ * Pairs a set of stacktrace callbacks with the opaque argument they expect,
+ * so both can travel through the single void * context parameter of
+ * dump_trace_unwind().
+ */
+struct ops_and_data {
+ struct stacktrace_ops *ops; /* callbacks to invoke */
+ void *data; /* handed back as the first argument of each callback */
+};
+
+/*
+ * Step through frames with unwind() and report every recovered PC through
+ * the ->address() callback found in @context (a struct ops_and_data).
+ *
+ * The walk ends when unwind() returns non-zero, when the recovered PC is
+ * zero, or immediately after a user-mode frame has been reported.
+ * Returns the number of addresses reported.
+ */
+static int dump_trace_unwind(struct unwind_frame_info *info, void *context)
+{
+	struct ops_and_data *cb = context;
+	int reported = 0;
+
+	for (;;) {
+		if (unwind(info) != 0)
+			break;
+		if (!UNW_PC(info))
+			break;
+
+		reported++;
+		cb->ops->address(cb->data, UNW_PC(info));
+
+		/* A user-mode frame is reported, then ends the walk. */
+		if (arch_unw_user_mode(info))
+			break;
+	}
+
+	return reported;
+}
+
/*
* x86-64 can have up to three kernel stacks:
* process stack
@@ -198,25 +242,66 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
*/
-void show_trace(unsigned long *stack)
+void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack,
+ struct stacktrace_ops *ops, void *data)
{
- const unsigned cpu = safe_smp_processor_id();
+ const unsigned cpu = smp_processor_id();
unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
- int i;
unsigned used = 0;
- printk("\nCall Trace:");
+ if (!tsk)
+ tsk = current;
+
+ if (call_trace >= 0) {
+ int unw_ret = 0;
+ struct unwind_frame_info info;
+ struct ops_and_data oad = { .ops = ops, .data = data };
+
+ if (regs) {
+ if (unwind_init_frame_info(&info, tsk, regs) == 0)
+ unw_ret = dump_trace_unwind(&info, &oad);
+ } else if (tsk == current)
+ unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
+ else {
+ if (unwind_init_blocked(&info, tsk) == 0)
+ unw_ret = dump_trace_unwind(&info, &oad);
+ }
+ if (unw_ret > 0) {
+ if (call_trace == 1 && !arch_unw_user_mode(&info)) {
+ ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
+ UNW_PC(&info));
+ if ((long)UNW_SP(&info) < 0) {
+ ops->warning(data, "Leftover inexact backtrace:\n");
+ stack = (unsigned long *)UNW_SP(&info);
+ if (!stack)
+ return;
+ } else
+ ops->warning(data, "Full inexact backtrace again:\n");
+ } else if (call_trace >= 1)
+ return;
+ else
+ ops->warning(data, "Full inexact backtrace again:\n");
+ } else
+ ops->warning(data, "Inexact backtrace:\n");
+ }
+ if (!stack) {
+ unsigned long dummy;
+ stack = &dummy;
+ if (tsk && tsk != current)
+ stack = (unsigned long *)tsk->thread.rsp;
+ }
+ /*
+ * Print function call entries within a stack. 'cond' is the
+ * "end of stackframe" condition, that the 'stack++'
+ * iteration will eventually trigger.
+ */
#define HANDLE_STACK(cond) \
do while (cond) { \
unsigned long addr = *stack++; \
- if (kernel_text_address(addr)) { \
- if (i > 50) { \
- printk("\n "); \
- i = 0; \
- } \
- else \
- i += printk(" "); \
+ if (oops_in_progress ? \
+ __kernel_text_address(addr) : \
+ kernel_text_address(addr)) { \
/* \
* If the address is either in the text segment of the \
* kernel, or in the region which contains vmalloc'ed \
@@ -225,20 +310,31 @@ void show_trace(unsigned long *stack)
* down the cause of the crash will be able to figure \
* out the call path that was taken. \
*/ \
- i += printk_address(addr); \
+ ops->address(data, addr); \
} \
} while (0)
- for(i = 11; ; ) {
- const char *id;
+ /*
+ * Print function call entries in all stacks, starting at the
+ * current stack address. If the stacks consist of nested
+ * exceptions
+ */
+ for (;;) {
+ char *id;
unsigned long *estack_end;
estack_end = in_exception_stack(cpu, (unsigned long)stack,
&used, &id);
if (estack_end) {
- i += printk(" <%s>", id);
+ if (ops->stack(data, id) < 0)
+ break;
HANDLE_STACK (stack < estack_end);
- i += printk(" <EOE>");
+ ops->stack(data, "<EOE>");
+ /*
+ * We link to the next stack via the
+ * second-to-last pointer (index -2 to end) in the
+ * exception stack:
+ */
stack = (unsigned long *) estack_end[-2];
continue;
}
@@ -248,27 +344,75 @@ void show_trace(unsigned long *stack)
(IRQSTACKSIZE - 64) / sizeof(*irqstack);
if (stack >= irqstack && stack < irqstack_end) {
- i += printk(" <IRQ>");
+ if (ops->stack(data, "IRQ") < 0)
+ break;
HANDLE_STACK (stack < irqstack_end);
+ /*
+ * We link to the next stack (which would normally
+ * be the process stack) via the last pointer
+ * (index -1 to end) in the IRQ stack:
+ */
stack = (unsigned long *) (irqstack_end[-1]);
irqstack_end = NULL;
- i += printk(" <EOI>");
+ ops->stack(data, "EOI");
continue;
}
}
break;
}
+ /*
+ * This handles the process stack:
+ */
HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
#undef HANDLE_STACK
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ print_symbol(msg, symbol);
printk("\n");
}
-void show_stack(struct task_struct *tsk, unsigned long * rsp)
+/* ->warning callback: print @msg followed by a newline. @data is unused. */
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s\n", msg);
+}
+
+/*
+ * ->stack callback: print a stack marker as " <name> ". @data is unused.
+ * The angle brackets are added here, so @name should be the bare tag.
+ * NOTE(review): a name that already carries brackets (e.g. "<EOE>") is
+ * printed with doubled brackets.
+ * Always returns 0.
+ */
+static int print_trace_stack(void *data, char *name)
+{
+ printk(" <%s> ", name);
+ return 0;
+}
+
+/* ->address callback: print @addr via printk_address(). @data is unused. */
+static void print_trace_address(void *data, unsigned long addr)
+{
+ printk_address(addr);
+}
+
+/* Callback set that sends every part of a stack trace to printk(). */
+static struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
+
+/*
+ * Print a "Call Trace:" header, then the trace produced by dump_trace()
+ * for @tsk / @regs / @stack using the printk-based callbacks, then a
+ * terminating newline.
+ */
+void
+show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack)
+{
+ printk("\nCall Trace:\n");
+ dump_trace(tsk, regs, stack, &print_trace_ops, NULL);
+ printk("\n");
+}
+
+static void
+_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp)
{
unsigned long *stack;
int i;
- const int cpu = safe_smp_processor_id();
+ const int cpu = smp_processor_id();
unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
@@ -294,11 +438,16 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp)
break;
}
if (i && ((i % 4) == 0))
- printk("\n ");
- printk("%016lx ", *stack++);
+ printk("\n");
+ printk(" %016lx", *stack++);
touch_nmi_watchdog();
}
- show_trace((unsigned long *)rsp);
+ show_trace(tsk, regs, rsp);
+}
+
+void show_stack(struct task_struct *tsk, unsigned long * rsp)
+{
+ _show_stack(tsk, NULL, rsp);
}
/*
@@ -307,7 +456,7 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp)
void dump_stack(void)
{
unsigned long dummy;
- show_trace(&dummy);
+ show_trace(NULL, NULL, &dummy);
}
EXPORT_SYMBOL(dump_stack);
@@ -317,7 +466,7 @@ void show_registers(struct pt_regs *regs)
int i;
int in_kernel = !user_mode(regs);
unsigned long rsp;
- const int cpu = safe_smp_processor_id();
+ const int cpu = smp_processor_id();
struct task_struct *cur = cpu_pda(cpu)->pcurrent;
rsp = regs->rsp;
@@ -334,7 +483,7 @@ void show_registers(struct pt_regs *regs)
if (in_kernel) {
printk("Stack: ");
- show_stack(NULL, (unsigned long*)rsp);
+ _show_stack(NULL, regs, (unsigned long*)rsp);
printk("\nCode: ");
if (regs->rip < PAGE_OFFSET)
@@ -383,6 +532,7 @@ void out_of_line_bug(void)
{
BUG();
}
+EXPORT_SYMBOL(out_of_line_bug);
#endif
static DEFINE_SPINLOCK(die_lock);
@@ -391,9 +541,11 @@ static unsigned int die_nest_count;
unsigned __kprobes long oops_begin(void)
{
- int cpu = safe_smp_processor_id();
+ int cpu = smp_processor_id();
unsigned long flags;
+ oops_enter();
+
/* racy, but better than risking deadlock. */
local_irq_save(flags);
if (!spin_trylock(&die_lock)) {
@@ -421,7 +573,8 @@ void __kprobes oops_end(unsigned long flags)
/* Nest count reaches zero, release the lock. */
spin_unlock_irqrestore(&die_lock, flags);
if (panic_on_oops)
- panic("Oops");
+ panic("Fatal exception");
+ oops_exit();
}
void __kprobes __die(const char * str, struct pt_regs * regs, long err)
@@ -458,7 +611,7 @@ void die(const char * str, struct pt_regs * regs, long err)
do_exit(SIGSEGV);
}
-void __kprobes die_nmi(char *str, struct pt_regs *regs)
+void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
{
unsigned long flags = oops_begin();
@@ -466,13 +619,12 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs)
* We are in trouble anyway, lets at least try
* to get a message out.
*/
- printk(str, safe_smp_processor_id());
+ printk(str, smp_processor_id());
show_registers(regs);
if (kexec_should_crash(current))
crash_kexec(regs);
- if (panic_on_timeout || panic_on_oops)
- panic("nmi watchdog");
- printk("console shuts up ...\n");
+ if (do_panic || panic_on_oops)
+ panic("Non maskable interrupt");
oops_end(flags);
nmi_exit();
local_irq_enable();
@@ -618,8 +770,15 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
static __kprobes void
mem_parity_error(unsigned char reason, struct pt_regs * regs)
{
- printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
- printk("You probably have a hardware problem with your RAM chips\n");
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+ reason);
+ printk(KERN_EMERG "You probably have a hardware problem with your "
+ "RAM chips\n");
+
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
/* Clear and disable the memory parity error line. */
reason = (reason & 0xf) | 4;
@@ -642,9 +801,15 @@ io_check_error(unsigned char reason, struct pt_regs * regs)
static __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
- printk("Dazed and confused, but trying to continue\n");
- printk("Do you have a strange power saving mode enabled?\n");
+{
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
+ reason);
+ printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
}
/* Runs on IST stack. This code must keep interrupts off all the time.
@@ -664,17 +829,15 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
== NOTIFY_STOP)
return;
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
*/
- if (nmi_watchdog > 0) {
- nmi_watchdog_tick(regs,reason);
+ if (nmi_watchdog_tick(regs,reason))
return;
- }
-#endif
- unknown_nmi_error(reason, regs);
+ if (!do_nmi_callback(regs,cpu))
+ unknown_nmi_error(reason, regs);
+
return;
}
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -959,6 +1122,7 @@ asmlinkage void math_state_restore(void)
init_fpu(me);
restore_fpu_checking(&me->thread.i387.fxsave);
task_thread_info(me)->status |= TS_USEDFPU;
+ me->fpu_counter++;
}
void __init trap_init(void)
@@ -997,18 +1161,39 @@ void __init trap_init(void)
}
-/* Actual parsing is done early in setup.c. */
-static int __init oops_dummy(char *s)
+static int __init oops_setup(char *s)
{
- panic_on_oops = 1;
- return 1;
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
}
-__setup("oops=", oops_dummy);
+early_param("oops", oops_setup);
static int __init kstack_setup(char *s)
{
+ if (!s)
+ return -EINVAL;
kstack_depth_to_print = simple_strtoul(s,NULL,0);
- return 1;
+ return 0;
}
-__setup("kstack=", kstack_setup);
+early_param("kstack", kstack_setup);
+#ifdef CONFIG_STACK_UNWIND
+/*
+ * Parse the "call_trace" early parameter and store the matching mode in
+ * call_trace:
+ *	"old"		-> -1
+ *	"both"		->  0
+ *	"newfallback"	->  1
+ *	"new"		->  2
+ * Any other string leaves call_trace unchanged.
+ * Returns -EINVAL when no value was supplied, 0 otherwise.
+ */
+static int __init call_trace_setup(char *s)
+{
+	const struct {
+		const char *name;
+		int mode;
+	} modes[] = {
+		{ "old",	 -1 },
+		{ "both",	  0 },
+		{ "newfallback",  1 },
+		{ "new",	  2 },
+	};
+	unsigned int i;
+
+	if (!s)
+		return -EINVAL;
+
+	for (i = 0; i < sizeof(modes) / sizeof(modes[0]); i++) {
+		if (!strcmp(s, modes[i].name)) {
+			call_trace = modes[i].mode;
+			break;
+		}
+	}
+	return 0;
+}
+early_param("call_trace", call_trace_setup);
+#endif
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index b81f473c4a19..f8aeccf105fa 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -6,7 +6,6 @@
#include <asm-generic/vmlinux.lds.h>
#include <asm/page.h>
-#include <linux/config.h>
#undef i386 /* in case the preprocessor is a 32bit one */
@@ -14,6 +13,12 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
jiffies_64 = jiffies;
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(7); /* RWE */
+ user PT_LOAD FLAGS(7); /* RWE */
+ note PT_NOTE FLAGS(4); /* R__ */
+}
SECTIONS
{
. = __START_KERNEL;
@@ -32,7 +37,7 @@ SECTIONS
KPROBES_TEXT
*(.fixup)
*(.gnu.warning)
- } = 0x9090
+ } :text = 0x9090
/* out-of-line lock text */
.text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
@@ -45,21 +50,23 @@ SECTIONS
RODATA
+#ifdef CONFIG_STACK_UNWIND
+ . = ALIGN(8);
+ .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
+ __start_unwind = .;
+ *(.eh_frame)
+ __end_unwind = .;
+ }
+#endif
+
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
*(.data)
CONSTRUCTORS
- }
+ } :data
_edata = .; /* End of data section */
- __bss_start = .; /* BSS */
- .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
- *(.bss.page_aligned)
- *(.bss)
- }
- __bss_stop = .;
-
. = ALIGN(PAGE_SIZE);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
.data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
@@ -81,7 +88,7 @@ SECTIONS
#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
. = VSYSCALL_ADDR;
- .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) }
+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
__vsyscall_0 = VSYSCALL_VIRT_ADDR;
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
@@ -91,6 +98,9 @@ SECTIONS
.vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
vxtime = VVIRT(.vxtime);
+ .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
+ vgetcpu_mode = VVIRT(.vgetcpu_mode);
+
.wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
wall_jiffies = VVIRT(.wall_jiffies);
@@ -124,13 +134,33 @@ SECTIONS
. = ALIGN(8192); /* init_task */
.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
*(.data.init_task)
- }
+ } :data
. = ALIGN(4096);
.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
*(.data.page_aligned)
}
+ /* might get freed after init */
+ . = ALIGN(4096);
+ __smp_alt_begin = .;
+ __smp_alt_instructions = .;
+ .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
+ *(.smp_altinstructions)
+ }
+ __smp_alt_instructions_end = .;
+ . = ALIGN(8);
+ __smp_locks = .;
+ .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ *(.smp_locks)
+ }
+ __smp_locks_end = .;
+ .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
+ *(.smp_altinstr_replacement)
+ }
+ . = ALIGN(4096);
+ __smp_alt_end = .;
+
. = ALIGN(4096); /* Init code and data */
__init_begin = .;
.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
@@ -179,14 +209,12 @@ SECTIONS
__initramfs_start = .;
.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
__initramfs_end = .;
- /* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+
- complain */
- . = ALIGN(4096);
- __init_end = .;
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+ . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
__per_cpu_start = .;
.data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
__per_cpu_end = .;
+ . = ALIGN(4096);
+ __init_end = .;
. = ALIGN(4096);
__nosave_begin = .;
@@ -194,6 +222,13 @@ SECTIONS
. = ALIGN(4096);
__nosave_end = .;
+ __bss_start = .; /* BSS */
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+ *(.bss.page_aligned)
+ *(.bss)
+ }
+ __bss_stop = .;
+
_end = . ;
/* Sections to be discarded */
diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c
index 92f70c74965f..044e852bd25e 100644
--- a/arch/x86_64/kernel/vsmp.c
+++ b/arch/x86_64/kernel/vsmp.c
@@ -20,6 +20,9 @@ static int __init vsmp_init(void)
void *address;
unsigned int cap, ctl;
+ if (!early_pci_allowed())
+ return 0;
+
/* Check if we are running on a ScaleMP vSMP box */
if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) ||
(read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 9468fb20b0bc..07c086382059 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -26,6 +26,7 @@
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
+#include <linux/getcpu.h>
#include <asm/vsyscall.h>
#include <asm/pgtable.h>
@@ -33,11 +34,15 @@
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
+#include <asm/topology.h>
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+int __vgetcpu_mode __section_vgetcpu_mode;
#include <asm/unistd.h>
@@ -72,7 +77,8 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
__vxtime.tsc_quot) >> 32;
/* See comment in x86_64 do_gettimeofday. */
} else {
- usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
+ usec += ((readl((void __iomem *)
+ fix_to_virt(VSYSCALL_HPET) + 0xf0) -
__vxtime.last) * __vxtime.quot) >> 32;
}
} while (read_seqretry(&__xtime_lock, sequence));
@@ -107,7 +113,7 @@ static __always_inline long time_syscall(long *t)
int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
{
- if (unlikely(!__sysctl_vsyscall))
+ if (!__sysctl_vsyscall)
return gettimeofday(tv,tz);
if (tv)
do_vgettimeofday(tv);
@@ -120,16 +126,53 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
* unlikely */
time_t __vsyscall(1) vtime(time_t *t)
{
- if (unlikely(!__sysctl_vsyscall))
+ if (!__sysctl_vsyscall)
return time_syscall(t);
else if (t)
*t = __xtime.tv_sec;
return __xtime.tv_sec;
}
-long __vsyscall(2) venosys_0(void)
+/* Fast way to get current CPU and node.
+ This helps to do per node and per CPU caches in user space.
+ The result is not guaranteed without CPU affinity, but usually
+ works out because the scheduler tries to keep a thread on the same
+ CPU.
+
+ tcache must point to a two element sized long array.
+ All arguments can be NULL. */
+long __vsyscall(2)
+vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
- return -ENOSYS;
+ unsigned int dummy, p;
+ unsigned long j = 0;
+
+ /* Fast cache - only recompute value once per jiffies and avoid
+ relatively costly rdtscp/cpuid otherwise.
+ This works because the scheduler usually keeps the process
+ on the same CPU and this syscall doesn't guarantee its
+ results anyways.
+ We do this here because otherwise user space would do it on
+ its own in a likely inferior way (no access to jiffies).
+ If you don't like it pass NULL. */
+ if (tcache && tcache->blob[0] == (j = __jiffies)) {
+ p = tcache->blob[1];
+ } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
+ /* Load per CPU data from RDTSCP */
+ rdtscp(dummy, dummy, p);
+ } else {
+ /* Load per CPU data from GDT */
+ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ }
+ if (tcache) {
+ tcache->blob[0] = j;
+ tcache->blob[1] = p;
+ }
+ if (cpu)
+ *cpu = p & 0xfff;
+ if (node)
+ *node = p >> 12;
+ return 0;
}
long __vsyscall(3) venosys_1(void)
@@ -149,7 +192,8 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
extern u16 vsysc1, vsysc2;
- u16 *map1, *map2;
+ u16 __iomem *map1;
+ u16 __iomem *map2;
int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
if (!write)
return ret;
@@ -164,11 +208,11 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
goto out;
}
if (!sysctl_vsyscall) {
- *map1 = SYSCALL;
- *map2 = SYSCALL;
+ writew(SYSCALL, map1);
+ writew(SYSCALL, map2);
} else {
- *map1 = NOP2;
- *map2 = NOP2;
+ writew(NOP2, map1);
+ writew(NOP2, map2);
}
iounmap(map2);
out:
@@ -200,6 +244,43 @@ static ctl_table kernel_root_table2[] = {
#endif
+/*
+ * Callback form of write_rdtscp_aux(): @info carries the value to write,
+ * smuggled through the void * argument and cast back to unsigned long.
+ */
+static void __cpuinit write_rdtscp_cb(void *info)
+{
+ write_rdtscp_aux((unsigned long)info);
+}
+
+/*
+ * Record @cpu's number and node where vgetcpu() can fetch them:
+ *  - when the CPU has RDTSCP, as (node << 12) | cpu handed to
+ *    write_rdtscp_aux(), executed on @cpu itself;
+ *  - always, in the limit field of @cpu's GDT_ENTRY_PER_CPU descriptor.
+ */
+void __cpuinit vsyscall_set_cpu(int cpu)
+{
+	unsigned long *d;
+	unsigned long desc;
+	unsigned long node = 0;
+#ifdef CONFIG_NUMA
+	node = cpu_to_node[cpu];
+#endif
+	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) {
+		void *info = (void *)((node << 12) | cpu);
+		/* Can happen on preemptive kernel */
+		if (get_cpu() == cpu)
+			write_rdtscp_cb(info);
+#ifdef CONFIG_SMP
+		else {
+			/* the notifier is unfortunately not executed on the
+			   target CPU */
+			smp_call_function_single(cpu,write_rdtscp_cb,info,0,1);
+		}
+#endif
+		put_cpu();
+	}
+
+	/* Store cpu number in limit so that it can be loaded quickly
+	   in user space in vgetcpu.
+	   12 bits for the CPU and 8 bits for the node.
+	   Each field is masked to its width so that an oversized value
+	   cannot spill into the neighbouring descriptor bits, and the
+	   descriptor is assembled locally and stored once so the GDT
+	   entry never holds a half-built value. */
+	desc = 0x0f40000000000ULL;
+	desc |= (unsigned long)cpu & 0xfff;	/* limit bits 0-11: cpu */
+	desc |= (node & 0xf) << 12;		/* limit bits 12-15: node low nibble */
+	desc |= ((node >> 4) & 0xf) << 48;	/* limit bits 16-19: node high nibble */
+
+	d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
+	*d = desc;
+}
+
static void __init map_vsyscall(void)
{
extern char __vsyscall_0;
@@ -214,6 +295,7 @@ static int __init vsyscall_init(void)
VSYSCALL_ADDR(__NR_vgettimeofday)));
BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
+ BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
map_vsyscall();
#ifdef CONFIG_SYSCTL
register_sysctl_table(kernel_root_table2, 0);
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 1def21c9f7cd..c3454af5e3a2 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -1,66 +1,21 @@
+/* Exports for assembly files.
+ All C exports should go in the respective C files. */
+
#include <linux/config.h>
#include <linux/module.h>
#include <linux/smp.h>
-#include <linux/user.h>
-#include <linux/sched.h>
-#include <linux/in6.h>
-#include <linux/interrupt.h>
-#include <linux/smp_lock.h>
-#include <linux/pm.h>
-#include <linux/pci.h>
-#include <linux/apm_bios.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/syscalls.h>
-#include <linux/tty.h>
#include <asm/semaphore.h>
#include <asm/processor.h>
-#include <asm/i387.h>
#include <asm/uaccess.h>
-#include <asm/checksum.h>
-#include <asm/io.h>
-#include <asm/delay.h>
-#include <asm/irq.h>
-#include <asm/mmx.h>
-#include <asm/desc.h>
#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/nmi.h>
-#include <asm/kdebug.h>
-#include <asm/unistd.h>
-#include <asm/tlbflush.h>
-#include <asm/kdebug.h>
-
-extern spinlock_t rtc_lock;
-#ifdef CONFIG_SMP
-extern void __write_lock_failed(rwlock_t *rw);
-extern void __read_lock_failed(rwlock_t *rw);
-#endif
-
-/* platform dependent support */
-EXPORT_SYMBOL(boot_cpu_data);
-//EXPORT_SYMBOL(dump_fpu);
-EXPORT_SYMBOL(__ioremap);
-EXPORT_SYMBOL(ioremap_nocache);
-EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(kernel_thread);
-EXPORT_SYMBOL(pm_idle);
-EXPORT_SYMBOL(pm_power_off);
EXPORT_SYMBOL(__down_failed);
EXPORT_SYMBOL(__down_failed_interruptible);
EXPORT_SYMBOL(__down_failed_trylock);
EXPORT_SYMBOL(__up_wakeup);
-/* Networking helper routines. */
-EXPORT_SYMBOL(csum_partial_copy_nocheck);
-EXPORT_SYMBOL(ip_compute_csum);
-/* Delay loops */
-EXPORT_SYMBOL(__udelay);
-EXPORT_SYMBOL(__ndelay);
-EXPORT_SYMBOL(__delay);
-EXPORT_SYMBOL(__const_udelay);
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
@@ -71,42 +26,21 @@ EXPORT_SYMBOL(__put_user_2);
EXPORT_SYMBOL(__put_user_4);
EXPORT_SYMBOL(__put_user_8);
-EXPORT_SYMBOL(strncpy_from_user);
-EXPORT_SYMBOL(__strncpy_from_user);
-EXPORT_SYMBOL(clear_user);
-EXPORT_SYMBOL(__clear_user);
EXPORT_SYMBOL(copy_user_generic);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_to_user);
-EXPORT_SYMBOL(copy_in_user);
-EXPORT_SYMBOL(strnlen_user);
-
-#ifdef CONFIG_PCI
-EXPORT_SYMBOL(pci_mem_start);
-#endif
+EXPORT_SYMBOL(__copy_from_user_inatomic);
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
-EXPORT_SYMBOL(_cpu_pda);
#ifdef CONFIG_SMP
-EXPORT_SYMBOL(cpu_data);
+extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
+extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
EXPORT_SYMBOL(__write_lock_failed);
EXPORT_SYMBOL(__read_lock_failed);
-
-EXPORT_SYMBOL(smp_call_function);
-EXPORT_SYMBOL(cpu_callout_map);
-#endif
-
-#ifdef CONFIG_VT
-EXPORT_SYMBOL(screen_info);
#endif
-EXPORT_SYMBOL(rtc_lock);
-
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-EXPORT_SYMBOL_GPL(unset_nmi_callback);
-
/* Export string functions. We normally rely on gcc builtin for most of these,
but gcc sometimes decides not to inline them. */
#undef memcpy
@@ -114,51 +48,14 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback);
#undef memmove
extern void * memset(void *,int,__kernel_size_t);
-extern size_t strlen(const char *);
-extern void * memmove(void * dest,const void *src,size_t count);
extern void * memcpy(void *,const void *,__kernel_size_t);
extern void * __memcpy(void *,const void *,__kernel_size_t);
EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memmove);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-/* prototypes are wrong, these are assembly with custom calling functions */
-extern void rwsem_down_read_failed_thunk(void);
-extern void rwsem_wake_thunk(void);
-extern void rwsem_downgrade_thunk(void);
-extern void rwsem_down_write_failed_thunk(void);
-EXPORT_SYMBOL(rwsem_down_read_failed_thunk);
-EXPORT_SYMBOL(rwsem_wake_thunk);
-EXPORT_SYMBOL(rwsem_downgrade_thunk);
-EXPORT_SYMBOL(rwsem_down_write_failed_thunk);
-#endif
-
EXPORT_SYMBOL(empty_zero_page);
-
-EXPORT_SYMBOL(die_chain);
-
-#ifdef CONFIG_SMP
-EXPORT_SYMBOL(cpu_sibling_map);
-EXPORT_SYMBOL(smp_num_siblings);
-#endif
-
-#ifdef CONFIG_BUG
-EXPORT_SYMBOL(out_of_line_bug);
-#endif
-
EXPORT_SYMBOL(init_level4_pgt);
-
-extern unsigned long __supported_pte_mask;
-EXPORT_SYMBOL(__supported_pte_mask);
-
-#ifdef CONFIG_SMP
-EXPORT_SYMBOL(flush_tlb_page);
-#endif
-
-EXPORT_SYMBOL(cpu_khz);
-
EXPORT_SYMBOL(load_gs_index);
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index ccef6ae747a3..b78d4170fce2 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
usercopy.o getuser.o putuser.o \
thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 1f81b79b796c..9a10a78bb4a4 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -1,10 +1,22 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
/*
* Zero a page.
* rdi page
*/
- .globl clear_page
- .p2align 4
-clear_page:
+ ALIGN
+clear_page_c:
+ CFI_STARTPROC
+ movl $4096/8,%ecx
+ xorl %eax,%eax
+ rep stosq
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page)
+
+ENTRY(clear_page)
+ CFI_STARTPROC
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
@@ -23,28 +35,25 @@ clear_page:
jnz .Lloop
nop
ret
-clear_page_end:
+ CFI_ENDPROC
+.Lclear_page_end:
+ENDPROC(clear_page)
/* Some CPUs run faster using the string instructions.
It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
+2:
+ .previous
.section .altinstructions,"a"
.align 8
- .quad clear_page
- .quad clear_page_c
- .byte X86_FEATURE_REP_GOOD
- .byte clear_page_end-clear_page
- .byte clear_page_c_end-clear_page_c
- .previous
-
- .section .altinstr_replacement,"ax"
-clear_page_c:
- movl $4096/8,%ecx
- xorl %eax,%eax
- rep
- stosq
- ret
-clear_page_c_end:
+ .quad clear_page
+ .quad 1b
+ .byte X86_FEATURE_REP_GOOD
+ .byte .Lclear_page_end - clear_page
+ .byte 2b - 1b
.previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index 8fa19d96a7ee..0ebb03b60e79 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -1,17 +1,33 @@
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+ ALIGN
+copy_page_c:
+ CFI_STARTPROC
+ movl $4096/8,%ecx
+ rep movsq
+ ret
+ CFI_ENDPROC
+ENDPROC(copy_page_c)
+
/* Don't use streaming store because it's better when the target
ends up in cache. */
/* Could vary the prefetch distance based on SMP/UP */
- .globl copy_page
- .p2align 4
-copy_page:
+ENTRY(copy_page)
+ CFI_STARTPROC
subq $3*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 3*8
movq %rbx,(%rsp)
+ CFI_REL_OFFSET rbx, 0
movq %r12,1*8(%rsp)
+ CFI_REL_OFFSET r12, 1*8
movq %r13,2*8(%rsp)
+ CFI_REL_OFFSET r13, 2*8
movl $(4096/64)-5,%ecx
.p2align 4
@@ -72,30 +88,33 @@ copy_page:
jnz .Loop2
movq (%rsp),%rbx
+ CFI_RESTORE rbx
movq 1*8(%rsp),%r12
+ CFI_RESTORE r12
movq 2*8(%rsp),%r13
+ CFI_RESTORE r13
addq $3*8,%rsp
+ CFI_ADJUST_CFA_OFFSET -3*8
ret
+.Lcopy_page_end:
+ CFI_ENDPROC
+ENDPROC(copy_page)
/* Some CPUs run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
+2:
+ .previous
.section .altinstructions,"a"
.align 8
- .quad copy_page
- .quad copy_page_c
- .byte X86_FEATURE_REP_GOOD
- .byte copy_page_c_end-copy_page_c
- .byte copy_page_c_end-copy_page_c
- .previous
-
- .section .altinstr_replacement,"ax"
-copy_page_c:
- movl $4096/8,%ecx
- rep
- movsq
- ret
-copy_page_c_end:
+ .quad copy_page
+ .quad 1b
+ .byte X86_FEATURE_REP_GOOD
+ .byte .Lcopy_page_end - copy_page
+ .byte 2b - 1b
.previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index f64569b83b54..70bebd310408 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -4,56 +4,78 @@
* Functions to copy from and to user space.
*/
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
#define FIX_ALIGNMENT 1
- #include <asm/current.h>
- #include <asm/asm-offsets.h>
- #include <asm/thread_info.h>
- #include <asm/cpufeature.h>
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
-/* Standard copy_to_user with segment limit checking */
- .globl copy_to_user
- .p2align 4
-copy_to_user:
- GET_THREAD_INFO(%rax)
- movq %rdi,%rcx
- addq %rdx,%rcx
- jc bad_to_user
- cmpq threadinfo_addr_limit(%rax),%rcx
- jae bad_to_user
-2:
+ .macro ALTERNATIVE_JUMP feature,orig,alt
+0:
.byte 0xe9 /* 32bit jump */
- .long .Lcug-1f
+ .long \orig-1f /* by default jump to orig */
1:
-
.section .altinstr_replacement,"ax"
-3: .byte 0xe9 /* replacement jmp with 8 bit immediate */
- .long copy_user_generic_c-1b /* offset */
+2: .byte 0xe9 /* near jump with 32bit immediate */
+ .long \alt-1b /* offset */ /* or alternatively to alt */
.previous
.section .altinstructions,"a"
.align 8
+ .quad 0b
.quad 2b
- .quad 3b
- .byte X86_FEATURE_REP_GOOD
+ .byte \feature /* when feature is set */
.byte 5
.byte 5
.previous
+ .endm
+
+/* Standard copy_to_user with segment limit checking */
+ENTRY(copy_to_user)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%rax)
+ movq %rdi,%rcx /* rcx = destination start */
+ addq %rdx,%rcx /* rcx = destination start + count */
+ jc bad_to_user /* the addition wrapped around */
+ cmpq threadinfo_addr_limit(%rax),%rcx
+ jae bad_to_user /* end is at or above the address limit */
+ xorl %ecx,%ecx /* clear zero flag: the copy routines read it from ecx */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+
+ENTRY(copy_user_generic)
+ CFI_STARTPROC
+ movl $1,%ecx /* set zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+
+ENTRY(__copy_from_user_inatomic)
+ CFI_STARTPROC
+ xorl %ecx,%ecx /* clear zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
/* Standard copy_from_user with segment limit checking */
- .globl copy_from_user
- .p2align 4
-copy_from_user:
+ENTRY(copy_from_user)
+ CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rsi,%rcx
addq %rdx,%rcx
jc bad_from_user
cmpq threadinfo_addr_limit(%rax),%rcx
jae bad_from_user
- /* FALL THROUGH to copy_user_generic */
+ movl $1,%ecx /* set zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+ENDPROC(copy_from_user)
.section .fixup,"ax"
/* must zero dest */
bad_from_user:
+ CFI_STARTPROC
movl %edx,%ecx
xorl %eax,%eax
rep
@@ -61,40 +83,32 @@ bad_from_user:
bad_to_user:
movl %edx,%eax
ret
+ CFI_ENDPROC
+END(bad_from_user)
.previous
/*
- * copy_user_generic - memory copy with exception handling.
+ * copy_user_generic_unrolled - memory copy with exception handling.
+ * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
*
* Input:
* rdi destination
* rsi source
* rdx count
+ * ecx zero flag -- if true zero destination on error
*
* Output:
* eax uncopied bytes or 0 if successful.
*/
- .globl copy_user_generic
- .p2align 4
-copy_user_generic:
- .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */
- .byte 0x66,0x90
-1:
- .section .altinstr_replacement,"ax"
-2: .byte 0xe9 /* near jump with 32bit immediate */
- .long copy_user_generic_c-1b /* offset */
- .previous
- .section .altinstructions,"a"
- .align 8
- .quad copy_user_generic
- .quad 2b
- .byte X86_FEATURE_REP_GOOD
- .byte 5
- .byte 5
- .previous
-.Lcug:
+ENTRY(copy_user_generic_unrolled)
+ CFI_STARTPROC
pushq %rbx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbx, 0
+ pushq %rcx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rcx, 0
xorl %eax,%eax /*zero for the exception handler */
#ifdef FIX_ALIGNMENT
@@ -168,9 +182,16 @@ copy_user_generic:
decl %ecx
jnz .Lloop_1
+ CFI_REMEMBER_STATE
.Lende:
+ popq %rcx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rcx
popq %rbx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rbx
ret
+ CFI_RESTORE_STATE
#ifdef FIX_ALIGNMENT
/* align destination */
@@ -252,6 +273,8 @@ copy_user_generic:
addl %ecx,%edx
/* edx: bytes to zero, rdi: dest, eax:zero */
.Lzero_rest:
+ cmpl $0,(%rsp)
+ jz .Le_zero
movq %rdx,%rcx
.Le_byte:
xorl %eax,%eax
@@ -261,6 +284,9 @@ copy_user_generic:
.Le_zero:
movq %rdx,%rax
jmp .Lende
+ CFI_ENDPROC
+ENDPROC(copy_user_generic_unrolled) /* pairs with ENTRY(copy_user_generic_unrolled) */
+
/* Some CPUs run faster using the string copy instructions.
This is also a lot simpler. Use them when possible.
@@ -270,6 +296,7 @@ copy_user_generic:
/* rdi destination
* rsi source
* rdx count
+ * ecx zero flag
*
* Output:
* eax uncopied bytes or 0 if successfull.
@@ -280,22 +307,48 @@ copy_user_generic:
* And more would be dangerous because both Intel and AMD have
* errata with rep movsq > 4GB. If someone feels the need to fix
* this please consider this.
- */
-copy_user_generic_c:
+ */
+ENTRY(copy_user_generic_string)
+ CFI_STARTPROC
+ movl %ecx,%r8d /* save zero flag */
movl %edx,%ecx
shrl $3,%ecx
andl $7,%edx
+ jz 10f /* no tail bytes (count & 7 == 0): quadword-only path */
1: rep
movsq
movl %edx,%ecx
2: rep
movsb
-4: movl %ecx,%eax
+9: movl %ecx,%eax
ret
-3: lea (%rdx,%rcx,8),%rax
+
+ /* multiple of 8 byte */
+10: rep
+ movsq
+ xor %eax,%eax
ret
+ /* exception handling */
+3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */
+ jmp 6f
+5: movl %ecx,%eax /* exception on byte loop */
+ /* eax: left over bytes */
+6: testl %r8d,%r8d /* zero flag set? */
+ jz 7f
+ movl %eax,%ecx /* initialize x86 loop counter */
+ push %rax /* keep the left-over count across the zeroing */
+ xorl %eax,%eax
+8: rep
+ stosb /* zero the rest */
+11: pop %rax
+7: ret
+ CFI_ENDPROC
+END(copy_user_generic_string)
+
.section __ex_table,"a"
.quad 1b,3b
- .quad 2b,4b
+ .quad 2b,5b
+ .quad 8b,11b
+ .quad 10b,3b
.previous
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
index 72fd55ee896e..f0dba36578ea 100644
--- a/arch/x86_64/lib/csum-copy.S
+++ b/arch/x86_64/lib/csum-copy.S
@@ -5,8 +5,9 @@
* License. See the file COPYING in the main directory of this archive
* for more details. No warranty for anything given at all.
*/
- #include <linux/linkage.h>
- #include <asm/errno.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/errno.h>
/*
* Checksum copy with exception handling.
@@ -53,19 +54,24 @@
.endm
- .globl csum_partial_copy_generic
- .p2align 4
-csum_partial_copy_generic:
+ENTRY(csum_partial_copy_generic)
+ CFI_STARTPROC
cmpl $3*64,%edx
jle .Lignore
.Lignore:
subq $7*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 7*8
movq %rbx,2*8(%rsp)
+ CFI_REL_OFFSET rbx, 2*8
movq %r12,3*8(%rsp)
+ CFI_REL_OFFSET r12, 3*8
movq %r14,4*8(%rsp)
+ CFI_REL_OFFSET r14, 4*8
movq %r13,5*8(%rsp)
+ CFI_REL_OFFSET r13, 5*8
movq %rbp,6*8(%rsp)
+ CFI_REL_OFFSET rbp, 6*8
movq %r8,(%rsp)
movq %r9,1*8(%rsp)
@@ -208,14 +214,22 @@ csum_partial_copy_generic:
addl %ebx,%eax
adcl %r9d,%eax /* carry */
+ CFI_REMEMBER_STATE
.Lende:
movq 2*8(%rsp),%rbx
+ CFI_RESTORE rbx
movq 3*8(%rsp),%r12
+ CFI_RESTORE r12
movq 4*8(%rsp),%r14
+ CFI_RESTORE r14
movq 5*8(%rsp),%r13
+ CFI_RESTORE r13
movq 6*8(%rsp),%rbp
+ CFI_RESTORE rbp
addq $7*8,%rsp
+ CFI_ADJUST_CFA_OFFSET -7*8
ret
+ CFI_RESTORE_STATE
/* Exception handlers. Very simple, zeroing is done in the wrappers */
.Lbad_source:
@@ -231,3 +245,5 @@ csum_partial_copy_generic:
jz .Lende
movl $-EFAULT,(%rax)
jmp .Lende
+ CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c
index 5384e227cdf6..c493735218da 100644
--- a/arch/x86_64/lib/csum-partial.c
+++ b/arch/x86_64/lib/csum-partial.c
@@ -147,4 +147,5 @@ unsigned short ip_compute_csum(unsigned char * buff, int len)
{
return csum_fold(csum_partial(buff,len,0));
}
+EXPORT_SYMBOL(ip_compute_csum);
diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c
index 94323f20816e..b1320ec58428 100644
--- a/arch/x86_64/lib/csum-wrappers.c
+++ b/arch/x86_64/lib/csum-wrappers.c
@@ -109,6 +109,7 @@ csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len,
{
return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL);
}
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr,
__u32 len, unsigned short proto, unsigned int sum)
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
index 03c460cbdd1c..b6cd3cca2f45 100644
--- a/arch/x86_64/lib/delay.c
+++ b/arch/x86_64/lib/delay.c
@@ -9,6 +9,7 @@
*/
#include <linux/config.h>
+#include <linux/module.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <asm/delay.h>
@@ -36,18 +37,22 @@ void __delay(unsigned long loops)
}
while((now-bclock) < loops);
}
+EXPORT_SYMBOL(__delay);
inline void __const_udelay(unsigned long xloops)
{
__delay((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32);
}
+EXPORT_SYMBOL(__const_udelay);
void __udelay(unsigned long usecs)
{
__const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */
}
+EXPORT_SYMBOL(__udelay);
void __ndelay(unsigned long nsecs)
{
__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
+EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S
index 3844d5e885a4..5448876261f8 100644
--- a/arch/x86_64/lib/getuser.S
+++ b/arch/x86_64/lib/getuser.S
@@ -27,25 +27,26 @@
*/
#include <linux/linkage.h>
+#include <asm/dwarf2.h>
#include <asm/page.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
.text
- .p2align 4
-.globl __get_user_1
-__get_user_1:
+ENTRY(__get_user_1)
+ CFI_STARTPROC
GET_THREAD_INFO(%r8)
cmpq threadinfo_addr_limit(%r8),%rcx
jae bad_get_user
1: movzb (%rcx),%edx
xorl %eax,%eax
ret
+ CFI_ENDPROC
+ENDPROC(__get_user_1)
- .p2align 4
-.globl __get_user_2
-__get_user_2:
+ENTRY(__get_user_2)
+ CFI_STARTPROC
GET_THREAD_INFO(%r8)
addq $1,%rcx
jc 20f
@@ -57,10 +58,11 @@ __get_user_2:
ret
20: decq %rcx
jmp bad_get_user
+ CFI_ENDPROC
+ENDPROC(__get_user_2)
- .p2align 4
-.globl __get_user_4
-__get_user_4:
+ENTRY(__get_user_4)
+ CFI_STARTPROC
GET_THREAD_INFO(%r8)
addq $3,%rcx
jc 30f
@@ -72,10 +74,11 @@ __get_user_4:
ret
30: subq $3,%rcx
jmp bad_get_user
+ CFI_ENDPROC
+ENDPROC(__get_user_4)
- .p2align 4
-.globl __get_user_8
-__get_user_8:
+ENTRY(__get_user_8)
+ CFI_STARTPROC
GET_THREAD_INFO(%r8)
addq $7,%rcx
jc 40f
@@ -87,11 +90,16 @@ __get_user_8:
ret
40: subq $7,%rcx
jmp bad_get_user
+ CFI_ENDPROC
+ENDPROC(__get_user_8)
bad_get_user:
+ CFI_STARTPROC
xorl %edx,%edx
movq $(-EFAULT),%rax
ret
+ CFI_ENDPROC
+END(bad_get_user)
.section __ex_table,"a"
.quad 1b,bad_get_user
diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S
index 8bbade5fea05..05a95e713da8 100644
--- a/arch/x86_64/lib/iomap_copy.S
+++ b/arch/x86_64/lib/iomap_copy.S
@@ -15,12 +15,16 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
/*
* override generic version in lib/iomap_copy.c
*/
- .globl __iowrite32_copy
- .p2align 4
-__iowrite32_copy:
+ENTRY(__iowrite32_copy)
+ CFI_STARTPROC
movl %edx,%ecx
rep movsd
ret
+ CFI_ENDPROC
+ENDPROC(__iowrite32_copy)
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 5554948b5554..967b22fa7d07 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -1,6 +1,10 @@
/* Copyright 2002 Andi Kleen */
- #include <asm/cpufeature.h>
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+
/*
* memcpy - Copy a memory block.
*
@@ -13,12 +17,26 @@
* rax original destination
*/
- .globl __memcpy
- .globl memcpy
- .p2align 4
-__memcpy:
-memcpy:
+ ALIGN
+memcpy_c:
+ CFI_STARTPROC
+ movq %rdi,%rax
+ movl %edx,%ecx
+ shrl $3,%ecx
+ andl $7,%edx
+ rep movsq
+ movl %edx,%ecx
+ rep movsb
+ ret
+ CFI_ENDPROC
+ENDPROC(memcpy_c)
+
+ENTRY(__memcpy)
+ENTRY(memcpy)
+ CFI_STARTPROC
pushq %rbx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbx, 0
movq %rdi,%rax
movl %edx,%ecx
@@ -86,36 +104,27 @@ memcpy:
.Lende:
popq %rbx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rbx
ret
.Lfinal:
+ CFI_ENDPROC
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
/* Some CPUs run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
+2:
+ .previous
.section .altinstructions,"a"
.align 8
- .quad memcpy
- .quad memcpy_c
- .byte X86_FEATURE_REP_GOOD
- .byte .Lfinal-memcpy
- .byte memcpy_c_end-memcpy_c
- .previous
-
- .section .altinstr_replacement,"ax"
- /* rdi destination
- * rsi source
- * rdx count
- */
-memcpy_c:
- movq %rdi,%rax
- movl %edx,%ecx
- shrl $3,%ecx
- andl $7,%edx
- rep
- movsq
- movl %edx,%ecx
- rep
- movsb
- ret
-memcpy_c_end:
+ .quad memcpy
+ .quad 1b
+ .byte X86_FEATURE_REP_GOOD
+ .byte .Lfinal - memcpy
+ .byte 2b - 1b
.previous
diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c
index e93d5255fdc9..751ebae8ec42 100644
--- a/arch/x86_64/lib/memmove.c
+++ b/arch/x86_64/lib/memmove.c
@@ -3,12 +3,13 @@
*/
#define _STRING_C
#include <linux/string.h>
+#include <linux/module.h>
#undef memmove
void *memmove(void * dest,const void *src,size_t count)
{
if (dest < src) {
- __inline_memcpy(dest,src,count);
+ return memcpy(dest,src,count);
} else {
char *p = (char *) dest + count;
char *s = (char *) src + count;
@@ -17,3 +18,4 @@ void *memmove(void * dest,const void *src,size_t count)
}
return dest;
}
+EXPORT_SYMBOL(memmove);
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index ad397f2c7de8..09ed1f6b0eaa 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -1,4 +1,9 @@
/* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
/*
* ISO C memset - set a memory block to a byte value.
*
@@ -8,11 +13,29 @@
*
* rax original destination
*/
- .globl __memset
- .globl memset
- .p2align 4
-memset:
-__memset:
+ ALIGN
+memset_c:
+ CFI_STARTPROC
+ movq %rdi,%r9
+ movl %edx,%r8d
+ andl $7,%r8d
+ movl %edx,%ecx
+ shrl $3,%ecx
+ /* expand byte value */
+ movzbl %sil,%esi
+ movabs $0x0101010101010101,%rax
+ mulq %rsi /* with rax, clobbers rdx */
+ rep stosq
+ movl %r8d,%ecx
+ rep stosb
+ movq %r9,%rax
+ ret
+ CFI_ENDPROC
+ENDPROC(memset_c)
+
+ENTRY(memset)
+ENTRY(__memset)
+ CFI_STARTPROC
movq %rdi,%r10
movq %rdx,%r11
@@ -25,6 +48,7 @@ __memset:
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
+ CFI_REMEMBER_STATE
.Lafter_bad_alignment:
movl %r11d,%ecx
@@ -75,6 +99,7 @@ __memset:
movq %r10,%rax
ret
+ CFI_RESTORE_STATE
.Lbad_alignment:
cmpq $7,%r11
jbe .Lhandle_7
@@ -84,42 +109,26 @@ __memset:
addq %r8,%rdi
subq %r8,%r11
jmp .Lafter_bad_alignment
+.Lfinal:
+ CFI_ENDPROC
+ENDPROC(memset)
+ENDPROC(__memset)
/* Some CPUs run faster using the string instructions.
It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (memset_c - memset) - (2f - 1b) /* offset */
+2:
+ .previous
.section .altinstructions,"a"
.align 8
- .quad memset
- .quad memset_c
- .byte X86_FEATURE_REP_GOOD
- .byte memset_c_end-memset_c
- .byte memset_c_end-memset_c
- .previous
-
- .section .altinstr_replacement,"ax"
- /* rdi destination
- * rsi value
- * rdx count
- */
-memset_c:
- movq %rdi,%r9
- movl %edx,%r8d
- andl $7,%r8d
- movl %edx,%ecx
- shrl $3,%ecx
- /* expand byte value */
- movzbl %sil,%esi
- movabs $0x0101010101010101,%rax
- mulq %rsi /* with rax, clobbers rdx */
- rep
- stosq
- movl %r8d,%ecx
- rep
- stosb
- movq %r9,%rax
- ret
-memset_c_end:
+ .quad memset
+ .quad 1b
+ .byte X86_FEATURE_REP_GOOD
+ .byte .Lfinal - memset
+ .byte 2b - 1b
.previous
diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S
index 7f5593974e2d..4989f5a8fa9b 100644
--- a/arch/x86_64/lib/putuser.S
+++ b/arch/x86_64/lib/putuser.S
@@ -25,25 +25,26 @@
*/
#include <linux/linkage.h>
+#include <asm/dwarf2.h>
#include <asm/page.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
.text
- .p2align 4
-.globl __put_user_1
-__put_user_1:
+ENTRY(__put_user_1)
+ CFI_STARTPROC
GET_THREAD_INFO(%r8)
cmpq threadinfo_addr_limit(%r8),%rcx
jae bad_put_user
1: movb %dl,(%rcx)
xorl %eax,%eax
ret
+ CFI_ENDPROC
+ENDPROC(__put_user_1)
- .p2align 4
-.globl __put_user_2
-__put_user_2:
+ENTRY(__put_user_2)
+ CFI_STARTPROC
GET_THREAD_INFO(%r8)
addq $1,%rcx
jc 20f
@@ -55,10 +56,11 @@ __put_user_2:
ret
20: decq %rcx
jmp bad_put_user
+ CFI_ENDPROC
+ENDPROC(__put_user_2)
- .p2align 4
-.globl __put_user_4
-__put_user_4:
+ENTRY(__put_user_4)
+ CFI_STARTPROC
GET_THREAD_INFO(%r8)
addq $3,%rcx
jc 30f
@@ -70,10 +72,11 @@ __put_user_4:
ret
30: subq $3,%rcx
jmp bad_put_user
+ CFI_ENDPROC
+ENDPROC(__put_user_4)
- .p2align 4
-.globl __put_user_8
-__put_user_8:
+ENTRY(__put_user_8)
+ CFI_STARTPROC
GET_THREAD_INFO(%r8)
addq $7,%rcx
jc 40f
@@ -85,10 +88,15 @@ __put_user_8:
ret
40: subq $7,%rcx
jmp bad_put_user
+ CFI_ENDPROC
+ENDPROC(__put_user_8)
bad_put_user:
+ CFI_STARTPROC
movq $(-EFAULT),%rax
ret
+ CFI_ENDPROC
+END(bad_put_user)
.section __ex_table,"a"
.quad 1b,bad_put_user
diff --git a/arch/x86_64/lib/rwlock.S b/arch/x86_64/lib/rwlock.S
new file mode 100644
index 000000000000..0cde1f807314
--- /dev/null
+++ b/arch/x86_64/lib/rwlock.S
@@ -0,0 +1,38 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.i>
+#include <asm/dwarf2.h>
+
+/* Write-lock slow path. rdi: pointer to rwlock_t */
+ENTRY(__write_lock_failed)
+ CFI_STARTPROC
+ LOCK_PREFIX
+ addl $RW_LOCK_BIAS,(%rdi) /* add the bias to the lock word; presumably undoes the caller's failed subtract - confirm against the inline fast path */
+1: rep
+ nop /* rep; nop is the PAUSE encoding */
+ cmpl $RW_LOCK_BIAS,(%rdi)
+ jne 1b /* spin until the lock word equals RW_LOCK_BIAS */
+ LOCK_PREFIX
+ subl $RW_LOCK_BIAS,(%rdi) /* try to take the lock again */
+ jnz __write_lock_failed /* result not zero: start over */
+ ret
+ CFI_ENDPROC
+END(__write_lock_failed)
+
+/* Read-lock slow path. rdi: pointer to rwlock_t */
+ENTRY(__read_lock_failed)
+ CFI_STARTPROC
+ LOCK_PREFIX
+ incl (%rdi) /* increment the lock word; presumably undoes the caller's failed decrement - confirm against the inline fast path */
+1: rep
+ nop /* rep; nop is the PAUSE encoding */
+ cmpl $1,(%rdi)
+ js 1b /* spin while (lock word - 1) is negative */
+ LOCK_PREFIX
+ decl (%rdi) /* try to take the lock again */
+ js __read_lock_failed /* result negative: start over */
+ ret
+ CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
index e49af0032e94..0025535cac8d 100644
--- a/arch/x86_64/lib/thunk.S
+++ b/arch/x86_64/lib/thunk.S
@@ -1,10 +1,9 @@
- /*
- * Save registers before calling assembly functions. This avoids
- * disturbance of register allocation in some inline assembly constructs.
- * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
- * Subject to the GNU public license, v.2. No warranty of any kind.
- * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $
- */
+/*
+ * Save registers before calling assembly functions. This avoids
+ * disturbance of register allocation in some inline assembly constructs.
+ * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
#include <linux/config.h>
#include <linux/linkage.h>
@@ -47,6 +46,11 @@
thunk_retrax __down_failed_interruptible,__down_interruptible
thunk_retrax __down_failed_trylock,__down_trylock
thunk __up_wakeup,__up
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ thunk trace_hardirqs_on_thunk,trace_hardirqs_on
+ thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+#endif
/* SAVE_ARGS below is used only for the .cfi directives it contains. */
CFI_STARTPROC
@@ -62,33 +66,3 @@ restore_norax:
RESTORE_ARGS 1
ret
CFI_ENDPROC
-
-#ifdef CONFIG_SMP
-/* Support for read/write spinlocks. */
- .text
-/* rax: pointer to rwlock_t */
-ENTRY(__write_lock_failed)
- lock
- addl $RW_LOCK_BIAS,(%rax)
-1: rep
- nop
- cmpl $RW_LOCK_BIAS,(%rax)
- jne 1b
- lock
- subl $RW_LOCK_BIAS,(%rax)
- jnz __write_lock_failed
- ret
-
-/* rax: pointer to rwlock_t */
-ENTRY(__read_lock_failed)
- lock
- incl (%rax)
-1: rep
- nop
- cmpl $1,(%rax)
- js 1b
- lock
- decl (%rax)
- js __read_lock_failed
- ret
-#endif
diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c
index 9bc2c295818e..893d43f838cc 100644
--- a/arch/x86_64/lib/usercopy.c
+++ b/arch/x86_64/lib/usercopy.c
@@ -5,6 +5,7 @@
* Copyright 1997 Linus Torvalds
* Copyright 2002 Andi Kleen <ak@suse.de>
*/
+#include <linux/module.h>
#include <asm/uaccess.h>
/*
@@ -47,15 +48,17 @@ __strncpy_from_user(char *dst, const char __user *src, long count)
__do_strncpy_from_user(dst, src, count, res);
return res;
}
+EXPORT_SYMBOL(__strncpy_from_user);
long
strncpy_from_user(char *dst, const char __user *src, long count)
{
long res = -EFAULT;
if (access_ok(VERIFY_READ, src, 1))
- __do_strncpy_from_user(dst, src, count, res);
+ return __strncpy_from_user(dst, src, count);
return res;
}
+EXPORT_SYMBOL(strncpy_from_user);
/*
* Zero Userspace
@@ -94,7 +97,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)
[zero] "r" (0UL), [eight] "r" (8UL));
return size;
}
-
+EXPORT_SYMBOL(__clear_user);
unsigned long clear_user(void __user *to, unsigned long n)
{
@@ -102,6 +105,7 @@ unsigned long clear_user(void __user *to, unsigned long n)
return __clear_user(to, n);
return n;
}
+EXPORT_SYMBOL(clear_user);
/*
* Return the size of a string (including the ending 0)
@@ -125,6 +129,7 @@ long __strnlen_user(const char __user *s, long n)
s++;
}
}
+EXPORT_SYMBOL(__strnlen_user);
long strnlen_user(const char __user *s, long n)
{
@@ -132,6 +137,7 @@ long strnlen_user(const char __user *s, long n)
return 0;
return __strnlen_user(s, n);
}
+EXPORT_SYMBOL(strnlen_user);
long strlen_user(const char __user *s)
{
@@ -147,6 +153,7 @@ long strlen_user(const char __user *s)
s++;
}
}
+EXPORT_SYMBOL(strlen_user);
unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
{
@@ -155,3 +162,5 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le
}
return len;
}
+EXPORT_SYMBOL(copy_in_user);
+
diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c
index 2d78f9fb4035..79ac6e7100af 100644
--- a/arch/x86_64/mm/extable.c
+++ b/arch/x86_64/mm/extable.c
@@ -2,7 +2,6 @@
* linux/arch/x86_64/mm/extable.c
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/init.h>
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 55250593d8c9..3751b4788e28 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -5,7 +5,6 @@
* Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
*/
-#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -41,6 +40,35 @@
#define PF_RSVD (1<<3)
#define PF_INSTR (1<<4)
+static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
+
+/* Hook to register for page fault notifications */
+int register_page_fault_notifier(struct notifier_block *nb)
+{
+ vmalloc_sync_all();
+ return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_page_fault_notifier);
+
+int unregister_page_fault_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_page_fault_notifier);
+
+static inline int notify_page_fault(enum die_val val, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig)
+{
+ struct die_args args = {
+ .regs = regs,
+ .str = str,
+ .err = err,
+ .trapnr = trap,
+ .signr = sig
+ };
+ return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
+}
+
void bust_spinlocks(int yes)
{
int loglevel_save = console_loglevel;
@@ -68,7 +96,7 @@ void bust_spinlocks(int yes)
static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
unsigned long error_code)
{
- unsigned char *instr;
+ unsigned char __user *instr;
int scan_more = 1;
int prefetch = 0;
unsigned char *max_instr;
@@ -77,7 +105,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
if (error_code & PF_INSTR)
return 0;
- instr = (unsigned char *)convert_rip_to_linear(current, regs);
+ instr = (unsigned char __user *)convert_rip_to_linear(current, regs);
max_instr = instr + 15;
if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
@@ -88,7 +116,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
unsigned char instr_hi;
unsigned char instr_lo;
- if (__get_user(opcode, instr))
+ if (__get_user(opcode, (char __user *)instr))
break;
instr_hi = opcode & 0xf0;
@@ -126,7 +154,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
case 0x00:
/* Prefetch instruction is 0x0F0D or 0x0F18 */
scan_more = 0;
- if (__get_user(opcode, instr))
+ if (__get_user(opcode, (char __user *)instr))
break;
prefetch = (instr_lo == 0xF) &&
(opcode == 0x0D || opcode == 0x18);
@@ -142,7 +170,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr,
static int bad_address(void *p)
{
unsigned long dummy;
- return __get_user(dummy, (unsigned long *)p);
+ return __get_user(dummy, (unsigned long __user *)p);
}
void dump_pagetable(unsigned long address)
@@ -160,7 +188,7 @@ void dump_pagetable(unsigned long address)
printk("PGD %lx ", pgd_val(*pgd));
if (!pgd_present(*pgd)) goto ret;
- pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address);
+ pud = pud_offset(pgd, address);
if (bad_address(pud)) goto bad;
printk("PUD %lx ", pud_val(*pud));
if (!pud_present(*pud)) goto ret;
@@ -216,7 +244,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
int unhandled_signal(struct task_struct *tsk, int sig)
{
- if (tsk->pid == 1)
+ if (is_init(tsk))
return 1;
if (tsk->ptrace & PT_PTRACED)
return 0;
@@ -265,7 +293,7 @@ static int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd))
set_pgd(pgd, *pgd_ref);
else
- BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
/* Below here mismatches are bugs because these lower tables
are shared */
@@ -274,7 +302,7 @@ static int vmalloc_fault(unsigned long address)
pud_ref = pud_offset(pgd_ref, address);
if (pud_none(*pud_ref))
return -1;
- if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref))
+ if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
BUG();
pmd = pmd_offset(pud, address);
pmd_ref = pmd_offset(pud_ref, address);
@@ -348,7 +376,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
if (vmalloc_fault(address) >= 0)
return;
}
- if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+ if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
SIGSEGV) == NOTIFY_STOP)
return;
/*
@@ -358,7 +386,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
goto bad_area_nosemaphore;
}
- if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+ if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
SIGSEGV) == NOTIFY_STOP)
return;
@@ -383,7 +411,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
* kernel and should generate an OOPS. Unfortunatly, in the case of an
- * erroneous fault occuring in a code path which already holds mmap_sem
+ * erroneous fault occurring in a code path which already holds mmap_sem
* we will deadlock attempting to validate the fault against the
* address space. Luckily the kernel only validly references user
* space from well defined areas of code, which are listed in the
@@ -410,8 +438,10 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
if (!(vma->vm_flags & VM_GROWSDOWN))
goto bad_area;
if (error_code & 4) {
- // XXX: align red zone size with ABI
- if (address + 128 < regs->rsp)
+ /* Allow userspace just enough access below the stack pointer
+ * to let the 'enter' instruction work.
+ */
+ if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)
goto bad_area;
}
if (expand_stack(vma, address))
@@ -434,7 +464,7 @@ good_area:
case PF_PROT: /* read, present */
goto bad_area;
case 0: /* read, not present */
- if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
goto bad_area;
}
@@ -534,7 +564,6 @@ no_context:
printk(KERN_ALERT "Unable to handle kernel paging request");
printk(" at %016lx RIP: \n" KERN_ALERT,address);
printk_address(regs->rip);
- printk("\n");
dump_pagetable(address);
tsk->thread.cr2 = address;
tsk->thread.trap_no = 14;
@@ -551,7 +580,7 @@ no_context:
*/
out_of_memory:
up_read(&mm->mmap_sem);
- if (current->pid == 1) {
+ if (is_init(current)) {
yield();
goto again;
}
@@ -606,7 +635,7 @@ void vmalloc_sync_all(void)
if (pgd_none(*pgd))
set_pgd(pgd, *pgd_ref);
else
- BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref));
+ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
}
spin_unlock(&pgd_lock);
set_bit(pgd_index(address), insync);
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 4ba34e95d835..3e16fe08150e 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -6,7 +6,6 @@
* Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
*/
-#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -23,6 +22,7 @@
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
+#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
@@ -41,8 +41,6 @@
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
-#include <asm/dma-mapping.h>
-#include <asm/swiotlb.h>
#ifndef Dprintk
#define Dprintk(x...)
@@ -90,8 +88,6 @@ void show_mem(void)
printk(KERN_INFO "%lu pages swap cached\n",cached);
}
-/* References to section boundaries */
-
int after_bootmem;
static __init void *spp_getpage(void)
@@ -233,7 +229,6 @@ __init void *early_ioremap(unsigned long addr, unsigned long size)
/* actually usually some more */
if (size >= LARGE_PAGE_SIZE) {
- printk("SMBIOS area too long %lu\n", size);
return NULL;
}
set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
@@ -254,18 +249,24 @@ __init void early_iounmap(void *addr, unsigned long size)
}
static void __meminit
-phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
+phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
- int i;
+ int i = pmd_index(address);
- for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
+ for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
unsigned long entry;
+ pmd_t *pmd = pmd_page + pmd_index(address);
- if (address > end) {
- for (; i < PTRS_PER_PMD; i++, pmd++)
- set_pmd(pmd, __pmd(0));
+ if (address >= end) {
+ if (!after_bootmem)
+ for (; i < PTRS_PER_PMD; i++, pmd++)
+ set_pmd(pmd, __pmd(0));
break;
}
+
+ if (pmd_val(*pmd))
+ continue;
+
entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
entry &= __supported_pte_mask;
set_pmd(pmd, __pmd(entry));
@@ -275,45 +276,41 @@ phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
- pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
-
- if (pmd_none(*pmd)) {
- spin_lock(&init_mm.page_table_lock);
- phys_pmd_init(pmd, address, end);
- spin_unlock(&init_mm.page_table_lock);
- __flush_tlb_all();
- }
+ pmd_t *pmd = pmd_offset(pud,0);
+ spin_lock(&init_mm.page_table_lock);
+ phys_pmd_init(pmd, address, end);
+ spin_unlock(&init_mm.page_table_lock);
+ __flush_tlb_all();
}
-static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
- long i = pud_index(address);
+ int i = pud_index(addr);
- pud = pud + i;
- if (after_bootmem && pud_val(*pud)) {
- phys_pmd_update(pud, address, end);
- return;
- }
-
- for (; i < PTRS_PER_PUD; pud++, i++) {
+ for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
int map;
- unsigned long paddr, pmd_phys;
+ unsigned long pmd_phys;
+ pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
- paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
- if (paddr >= end)
+ if (addr >= end)
break;
- if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) {
+ if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
set_pud(pud, __pud(0));
continue;
}
+ if (pud_val(*pud)) {
+ phys_pmd_update(pud, addr, end);
+ continue;
+ }
+
pmd = alloc_low_page(&map, &pmd_phys);
spin_lock(&init_mm.page_table_lock);
set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
- phys_pmd_init(pmd, paddr, end);
+ phys_pmd_init(pmd, addr, end);
spin_unlock(&init_mm.page_table_lock);
unmap_low_page(map);
}
@@ -341,7 +338,8 @@ static void __init find_early_table_space(unsigned long end)
table_end = table_start;
early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
- end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
+ end, table_start << PAGE_SHIFT,
+ (table_start << PAGE_SHIFT) + tables);
}
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
@@ -372,7 +370,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
pud_t *pud;
if (after_bootmem)
- pud = pud_offset_k(pgd, start & PGDIR_MASK);
+ pud = pud_offset(pgd, start & PGDIR_MASK);
else
pud = alloc_low_page(&map, &pud_phys);
@@ -405,69 +403,15 @@ void __cpuinit zap_low_mappings(int cpu)
__flush_tlb_all();
}
-/* Compute zone sizes for the DMA and DMA32 zones in a node. */
-__init void
-size_zones(unsigned long *z, unsigned long *h,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- int i;
- unsigned long w;
-
- for (i = 0; i < MAX_NR_ZONES; i++)
- z[i] = 0;
-
- if (start_pfn < MAX_DMA_PFN)
- z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
- if (start_pfn < MAX_DMA32_PFN) {
- unsigned long dma32_pfn = MAX_DMA32_PFN;
- if (dma32_pfn > end_pfn)
- dma32_pfn = end_pfn;
- z[ZONE_DMA32] = dma32_pfn - start_pfn;
- }
- z[ZONE_NORMAL] = end_pfn - start_pfn;
-
- /* Remove lower zones from higher ones. */
- w = 0;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (z[i])
- z[i] -= w;
- w += z[i];
- }
-
- /* Compute holes */
- w = start_pfn;
- for (i = 0; i < MAX_NR_ZONES; i++) {
- unsigned long s = w;
- w += z[i];
- h[i] = e820_hole_size(s, w);
- }
-
- /* Add the space pace needed for mem_map to the holes too. */
- for (i = 0; i < MAX_NR_ZONES; i++)
- h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;
-
- /* The 16MB DMA zone has the kernel and other misc mappings.
- Account them too */
- if (h[ZONE_DMA]) {
- h[ZONE_DMA] += dma_reserve;
- if (h[ZONE_DMA] >= z[ZONE_DMA]) {
- printk(KERN_WARNING
- "Kernel too large and filling up ZONE_DMA?\n");
- h[ZONE_DMA] = z[ZONE_DMA];
- }
- }
-}
-
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
- unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
-
+ unsigned long max_zone_pfns[MAX_NR_ZONES] = {MAX_DMA_PFN,
+ MAX_DMA32_PFN,
+ end_pfn};
memory_present(0, 0, end_pfn);
sparse_init();
- size_zones(zones, holes, 0, end_pfn);
- free_area_init_node(0, NODE_DATA(0), zones,
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
+ free_area_init_nodes(max_zone_pfns);
}
#endif
@@ -508,8 +452,6 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
/*
* Memory hotplug specific functions
*/
-#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)
-
void online_page(struct page *page)
{
ClearPageReserved(page);
@@ -519,31 +461,17 @@ void online_page(struct page *page)
num_physpages++;
}
-#ifndef CONFIG_MEMORY_HOTPLUG
+#ifdef CONFIG_MEMORY_HOTPLUG
/*
- * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
- * just online the pages.
+ * XXX: memory_add_physaddr_to_nid() looks up the node id for a physical
+ * address on behalf of the sysfs probe interface. When ACPI signals a
+ * hot-add event the node id can be found by searching the DSDT, but the
+ * probe interface supplies no node id, so return 0 as the node id for now.
*/
-int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+#ifdef CONFIG_NUMA
+int memory_add_physaddr_to_nid(u64 start)
{
- int err = -EIO;
- unsigned long pfn;
- unsigned long total = 0, mem = 0;
- for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
- if (pfn_valid(pfn)) {
- online_page(pfn_to_page(pfn));
- err = 0;
- mem++;
- }
- total++;
- }
- if (!err) {
- z->spanned_pages += total;
- z->present_pages += mem;
- z->zone_pgdat->node_spanned_pages += total;
- z->zone_pgdat->node_present_pages += mem;
- }
- return err;
+ return 0;
}
#endif
@@ -551,10 +479,10 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
* Memory is added always to NORMAL zone. This means you will never get
* additional DMA/DMA32 memory.
*/
-int add_memory(u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size)
{
- struct pglist_data *pgdat = NODE_DATA(0);
- struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
@@ -570,7 +498,7 @@ error:
printk("%s: Problem encountered in __add_pages!\n", __func__);
return ret;
}
-EXPORT_SYMBOL_GPL(add_memory);
+EXPORT_SYMBOL_GPL(arch_add_memory);
int remove_memory(u64 start, u64 size)
{
@@ -578,7 +506,33 @@ int remove_memory(u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(remove_memory);
-#endif
+#else /* CONFIG_MEMORY_HOTPLUG */
+/*
+ * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance,
+ * just online the pages.
+ */
+int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages)
+{
+ int err = -EIO;
+ unsigned long pfn;
+ unsigned long total = 0, mem = 0;
+ for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+ if (pfn_valid(pfn)) {
+ online_page(pfn_to_page(pfn));
+ err = 0;
+ mem++;
+ }
+ total++;
+ }
+ if (!err) {
+ z->spanned_pages += total;
+ z->present_pages += mem;
+ z->zone_pgdat->node_spanned_pages += total;
+ z->zone_pgdat->node_present_pages += mem;
+ }
+ return err;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
kcore_vsyscall;
@@ -587,16 +541,7 @@ void __init mem_init(void)
{
long codesize, reservedpages, datasize, initsize;
-#ifdef CONFIG_SWIOTLB
- pci_swiotlb_init();
-#endif
- no_iommu_init();
-
- /* How many end-of-memory variables you have, grandma! */
- max_low_pfn = end_pfn;
- max_pfn = end_pfn;
- num_physpages = end_pfn;
- high_memory = (void *) __va(end_pfn * PAGE_SIZE);
+ pci_iommu_alloc();
/* clear the zero-page */
memset(empty_zero_page, 0, PAGE_SIZE);
@@ -609,7 +554,8 @@ void __init mem_init(void)
#else
totalram_pages = free_all_bootmem();
#endif
- reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);
+ reservedpages = end_pfn - totalram_pages -
+ absent_pages_in_range(0, end_pfn);
after_bootmem = 1;
@@ -644,34 +590,44 @@ void __init mem_init(void)
#endif
}
-void free_initmem(void)
+void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
unsigned long addr;
- addr = (unsigned long)(&__init_begin);
- for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
+ if (begin >= end)
+ return;
+
+ printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10);
+ for (addr = begin; addr < end; addr += PAGE_SIZE) {
ClearPageReserved(virt_to_page(addr));
init_page_count(virt_to_page(addr));
- memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
+ memset((void *)(addr & ~(PAGE_SIZE-1)),
+ POISON_FREE_INITMEM, PAGE_SIZE);
free_page(addr);
totalram_pages++;
}
- memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
- printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
+}
+
+void free_initmem(void)
+{
+ memset(__initdata_begin, POISON_FREE_INITDATA,
+ __initdata_end - __initdata_begin);
+ free_init_pages("unused kernel memory",
+ (unsigned long)(&__init_begin),
+ (unsigned long)(&__init_end));
}
#ifdef CONFIG_DEBUG_RODATA
-extern char __start_rodata, __end_rodata;
void mark_rodata_ro(void)
{
- unsigned long addr = (unsigned long)&__start_rodata;
+ unsigned long addr = (unsigned long)__start_rodata;
- for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
+ for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE)
change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);
printk ("Write protecting the kernel read-only data: %luk\n",
- (&__end_rodata - &__start_rodata) >> 10);
+ (__end_rodata - __start_rodata) >> 10);
/*
* change_page_attr_addr() requires a global_flush_tlb() call after it.
@@ -686,15 +642,7 @@ void mark_rodata_ro(void)
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
- if (start >= end)
- return;
- printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
- for (; start < end; start += PAGE_SIZE) {
- ClearPageReserved(virt_to_page(start));
- init_page_count(virt_to_page(start));
- free_page(start);
- totalram_pages++;
- }
+ free_init_pages("initrd memory", start, end);
}
#endif
@@ -707,8 +655,10 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
#else
reserve_bootmem(phys, len);
#endif
- if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
+ if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
dma_reserve += len / PAGE_SIZE;
+ set_dma_reserve(dma_reserve);
+ }
}
int kern_addr_valid(unsigned long addr)
diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c
index ae207064201e..45d7d823c3b8 100644
--- a/arch/x86_64/mm/ioremap.c
+++ b/arch/x86_64/mm/ioremap.c
@@ -11,6 +11,7 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/module.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
@@ -219,6 +220,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l
}
return (__force void __iomem *) (offset + (char *)addr);
}
+EXPORT_SYMBOL(__ioremap);
/**
* ioremap_nocache - map bus memory into CPU space
@@ -246,6 +248,7 @@ void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
{
return __ioremap(phys_addr, size, _PAGE_PCD);
}
+EXPORT_SYMBOL(ioremap_nocache);
/**
* iounmap - Free a IO remapping
@@ -291,3 +294,5 @@ void iounmap(volatile void __iomem *addr)
BUG_ON(p != o || o == NULL);
kfree(p);
}
+EXPORT_SYMBOL(iounmap);
+
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index 7c45c2d2b8b2..b5b8dba28b4e 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -54,6 +54,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
nodes_clear(nodes_parsed);
+ if (!early_pci_allowed())
+ return -1;
+
nb = find_northbridge();
if (nb < 0)
return nb;
@@ -146,6 +149,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
nodes[nodeid].start = base;
nodes[nodeid].end = limit;
+ e820_register_active_regions(nodeid,
+ nodes[nodeid].start >> PAGE_SHIFT,
+ nodes[nodeid].end >> PAGE_SHIFT);
prevbase = base;
diff --git a/arch/x86_64/mm/mmap.c b/arch/x86_64/mm/mmap.c
index 43e9b99bdf25..80bba0dc000e 100644
--- a/arch/x86_64/mm/mmap.c
+++ b/arch/x86_64/mm/mmap.c
@@ -1,7 +1,6 @@
/* Copyright 2005 Andi Kleen, SuSE Labs.
* Licensed under GPL, v.2
*/
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/random.h>
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index b2fac14baac0..829a008bd39b 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -161,7 +161,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
bootmap_start >> PAGE_SHIFT,
start_pfn, end_pfn);
- e820_bootmem_free(NODE_DATA(nodeid), start, end);
+ free_bootmem_with_active_regions(nodeid, end);
reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
@@ -175,13 +175,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
void __init setup_node_zones(int nodeid)
{
unsigned long start_pfn, end_pfn, memmapsize, limit;
- unsigned long zones[MAX_NR_ZONES];
- unsigned long holes[MAX_NR_ZONES];
start_pfn = node_start_pfn(nodeid);
end_pfn = node_end_pfn(nodeid);
- Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
+ Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
nodeid, start_pfn, end_pfn);
/* Try to allocate mem_map at end to not fill up precious <4GB
@@ -195,10 +193,6 @@ void __init setup_node_zones(int nodeid)
round_down(limit - memmapsize, PAGE_SIZE),
limit);
#endif
-
- size_zones(zones, holes, start_pfn, end_pfn);
- free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
- start_pfn, holes);
}
void __init numa_init_array(void)
@@ -225,7 +219,7 @@ void __init numa_init_array(void)
int numa_fake __initdata = 0;
/* Numa emulation */
-static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
+static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
int i;
struct bootnode nodes[MAX_NUMNODES];
@@ -259,8 +253,11 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
return -1;
}
- for_each_online_node(i)
+ for_each_online_node(i) {
+ e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+ nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
+ }
numa_init_array();
return 0;
}
@@ -299,6 +296,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
for (i = 0; i < NR_CPUS; i++)
numa_set_node(i, 0);
node_to_cpumask[0] = cpumask_of_cpu(0);
+ e820_register_active_regions(0, start_pfn, end_pfn);
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
@@ -340,17 +338,23 @@ static void __init arch_sparse_init(void)
void __init paging_init(void)
{
int i;
+ unsigned long max_zone_pfns[MAX_NR_ZONES] = { MAX_DMA_PFN,
+ MAX_DMA32_PFN,
+ end_pfn};
arch_sparse_init();
for_each_online_node(i) {
setup_node_zones(i);
}
+
+ free_area_init_nodes(max_zone_pfns);
}
-/* [numa=off] */
-__init int numa_setup(char *opt)
+static __init int numa_setup(char *opt)
{
+ if (!opt)
+ return -EINVAL;
if (!strncmp(opt,"off",3))
numa_off = 1;
#ifdef CONFIG_NUMA_EMU
@@ -366,9 +370,11 @@ __init int numa_setup(char *opt)
if (!strncmp(opt,"hotadd=", 7))
hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
- return 1;
+ return 0;
}
+early_param("numa", numa_setup);
+
/*
* Setup early cpu_to_node.
*
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 531ad21447b1..3e231d762aaa 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -3,7 +3,6 @@
* Thanks to Ben LaHaise for precious feedback.
*/
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/highmem.h>
@@ -109,8 +108,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
BUG_ON(pud_none(*pud));
pmd = pmd_offset(pud, address);
BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
- pgprot_val(ref_prot) |= _PAGE_PSE;
large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot);
+ large_pte = pte_mkhuge(large_pte);
set_pte((pte_t *)pmd, large_pte);
}
@@ -120,32 +119,28 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
{
pte_t *kpte;
struct page *kpte_page;
- unsigned kpte_flags;
pgprot_t ref_prot2;
kpte = lookup_address(address);
if (!kpte) return 0;
kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
- kpte_flags = pte_val(*kpte);
if (pgprot_val(prot) != pgprot_val(ref_prot)) {
- if ((kpte_flags & _PAGE_PSE) == 0) {
+ if (!pte_huge(*kpte)) {
set_pte(kpte, pfn_pte(pfn, prot));
} else {
/*
* split_large_page will take the reference for this
* change_page_attr on the split page.
*/
-
struct page *split;
- ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE));
-
+ ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
split = split_large_page(address, prot, ref_prot2);
if (!split)
return -ENOMEM;
- set_pte(kpte,mk_pte(split, ref_prot2));
+ set_pte(kpte, mk_pte(split, ref_prot2));
kpte_page = split;
- }
+ }
page_private(kpte_page)++;
- } else if ((kpte_flags & _PAGE_PSE) == 0) {
+ } else if (!pte_huge(*kpte)) {
set_pte(kpte, pfn_pte(pfn, ref_prot));
BUG_ON(page_private(kpte_page) == 0);
page_private(kpte_page)--;
@@ -191,10 +186,12 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
* lowmem */
if (__pa(address) < KERNEL_TEXT_SIZE) {
unsigned long addr2;
- pgprot_t prot2 = prot;
+ pgprot_t prot2;
addr2 = __START_KERNEL_map + __pa(address);
- pgprot_val(prot2) &= ~_PAGE_NX;
- err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC);
+ /* Make sure the kernel mappings stay executable */
+ prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
+ err = __change_page_attr(addr2, pfn, prot2,
+ PAGE_KERNEL_EXEC);
}
}
up_write(&init_mm.mmap_sem);
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 502fce65e96a..f8c04d6935c9 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -21,6 +21,8 @@
#include <asm/numa.h>
#include <asm/e820.h>
+int acpi_numa __initdata;
+
#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
&& !defined(CONFIG_MEMORY_HOTPLUG)
@@ -91,6 +93,7 @@ static __init void bad_srat(void)
apicid_to_node[i] = NUMA_NO_NODE;
for (i = 0; i < MAX_NUMNODES; i++)
nodes_add[i].start = nodes[i].end = 0;
+ remove_all_active_ranges();
}
static __init inline int srat_disabled(void)
@@ -173,7 +176,7 @@ static int hotadd_enough_memory(struct bootnode *nd)
if (mem < 0)
return 0;
- allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
+ allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE;
allowed = (allowed / 100) * hotadd_percent;
if (allocated + mem > allowed) {
unsigned long range;
@@ -223,8 +226,10 @@ static int reserve_hotadd(int node, unsigned long start, unsigned long end)
}
/* This check might be a bit too strict, but I'm keeping it for now. */
- if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
- printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
+ if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
+ printk(KERN_ERR
+ "SRAT: Hotplug area %lu -> %lu has existing memory\n",
+ s_pfn, e_pfn);
return -1;
}
@@ -317,6 +322,10 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
nd->start, nd->end);
+ e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
+ nd->end >> PAGE_SHIFT);
+ push_node_boundaries(node, nd->start >> PAGE_SHIFT,
+ nd->end >> PAGE_SHIFT);
#ifdef RESERVE_HOTADD
if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
@@ -341,13 +350,13 @@ static int nodes_cover_memory(void)
unsigned long s = nodes[i].start >> PAGE_SHIFT;
unsigned long e = nodes[i].end >> PAGE_SHIFT;
pxmram += e - s;
- pxmram -= e820_hole_size(s, e);
+ pxmram -= absent_pages_in_range(s, e);
pxmram -= nodes_add[i].end - nodes_add[i].start;
if ((long)pxmram < 0)
pxmram = 0;
}
- e820ram = end_pfn - e820_hole_size(0, end_pfn);
+ e820ram = end_pfn - absent_pages_in_range(0, end_pfn);
/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
if ((long)(e820ram - pxmram) >= 1*1024*1024) {
printk(KERN_ERR
diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile
index a3f6ad570179..1eb18f421edf 100644
--- a/arch/x86_64/pci/Makefile
+++ b/arch/x86_64/pci/Makefile
@@ -9,7 +9,7 @@ obj-y := i386.o
obj-$(CONFIG_PCI_DIRECT)+= direct.o
obj-y += fixup.o init.o
obj-$(CONFIG_ACPI) += acpi.o
-obj-y += legacy.o irq.o common.o
+obj-y += legacy.o irq.o common.o early.o
# mmconfig has a 64bit special
obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o
@@ -23,3 +23,4 @@ common-y += ../../i386/pci/common.o
fixup-y += ../../i386/pci/fixup.o
i386-y += ../../i386/pci/i386.o
init-y += ../../i386/pci/init.o
+early-y += ../../i386/pci/early.o
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index 3c55c76c6fd5..7732f4254d21 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -156,15 +156,45 @@ static __init void unreachable_devices(void)
addr = pci_dev_base(0, k, PCI_DEVFN(i, 0));
if (addr == NULL|| readl(addr) != val1) {
set_bit(i + 32*k, fallback_slots);
- printk(KERN_NOTICE
- "PCI: No mmconfig possible on device %x:%x\n",
- k, i);
+ printk(KERN_NOTICE "PCI: No mmconfig possible"
+ " on device %02x:%02x\n", k, i);
}
}
}
}
-void __init pci_mmcfg_init(void)
+static __init void pci_mmcfg_insert_resources(void)
+{
+#define PCI_MMCFG_RESOURCE_NAME_LEN 19
+ int i;
+ struct resource *res;
+ char *names;
+ unsigned num_buses;
+
+ res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
+ pci_mmcfg_config_num, GFP_KERNEL);
+
+ if (!res) {
+ printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
+ return;
+ }
+
+ names = (void *)&res[pci_mmcfg_config_num];
+ for (i = 0; i < pci_mmcfg_config_num; i++, res++) {
+ num_buses = pci_mmcfg_config[i].end_bus_number -
+ pci_mmcfg_config[i].start_bus_number + 1;
+ res->name = names;
+ snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u",
+ pci_mmcfg_config[i].pci_segment_group_number);
+ res->start = pci_mmcfg_config[i].base_address;
+ res->end = res->start + (num_buses << 20) - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ insert_resource(&iomem_resource, res);
+ names += PCI_MMCFG_RESOURCE_NAME_LEN;
+ }
+}
+
+void __init pci_mmcfg_init(int type)
{
int i;
@@ -177,7 +207,9 @@ void __init pci_mmcfg_init(void)
(pci_mmcfg_config[0].base_address == 0))
return;
- if (!e820_all_mapped(pci_mmcfg_config[0].base_address,
+ /* Only do this check when type 1 works. If it doesn't work,
+ assume we are running on a Mac and always use MCFG. */
+ if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].base_address,
pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN,
E820_RESERVED)) {
printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n",
@@ -186,7 +218,6 @@ void __init pci_mmcfg_init(void)
return;
}
- /* RED-PEN i386 doesn't do _nocache right now */
pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL);
if (pci_mmcfg_virt == NULL) {
printk("PCI: Can not allocate memory for mmconfig structures\n");
@@ -205,6 +236,7 @@ void __init pci_mmcfg_init(void)
}
unreachable_devices();
+ pci_mmcfg_insert_resources();
raw_pci_ops = &pci_mmcfg;
pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
OpenPOWER on IntegriCloud