diff options
Diffstat (limited to 'arch/x86_64')
112 files changed, 6531 insertions, 3561 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index af44130f0d65..32ae1378f35c 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -24,6 +24,18 @@ config X86 bool default y +config ZONE_DMA32 + bool + default y + +config LOCKDEP_SUPPORT + bool + default y + +config STACKTRACE_SUPPORT + bool + default y + config SEMAPHORE_SLEEPERS bool default y @@ -73,10 +85,17 @@ config ARCH_MAY_HAVE_PC_FDC bool default y +config ARCH_POPULATES_NODE_MAP + def_bool y + config DMI bool default y +config AUDIT_ARCH + bool + default y + source "init/Kconfig" @@ -93,6 +112,7 @@ config X86_PC config X86_VSMP bool "Support for ScaleMP vSMP" + depends on PCI help Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is supposed to run on these EM64T-based machines. Only choose this option @@ -151,6 +171,7 @@ config X86_GOOD_APIC config MICROCODE tristate "/dev/cpu/microcode - Intel CPU microcode support" + select FW_LOADER ---help--- If you say Y here the 'File systems' section, you will be able to update the microcode on Intel processors. You will @@ -166,6 +187,11 @@ config MICROCODE If you use modprobe or kmod you may also want to add the line 'alias char-major-10-184 microcode' to your /etc/modules.conf file. +config MICROCODE_OLD_INTERFACE + bool + depends on MICROCODE + default y + config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" help @@ -279,7 +305,7 @@ config NUMA config K8_NUMA bool "Old style AMD Opteron NUMA detection" - depends on NUMA + depends on NUMA && PCI default y help Enable K8 NUMA node topology detection. You should say Y here if @@ -370,6 +396,8 @@ config HOTPLUG_CPU can be controlled through /sys/devices/system/cpu/cpu#. Say N if you want to disable CPU hotplug. +config ARCH_ENABLE_MEMORY_HOTPLUG + def_bool y config HPET_TIMER bool @@ -386,24 +414,44 @@ config HPET_EMULATE_RTC bool "Provide RTC interrupt" depends on HPET_TIMER && RTC=y -config GART_IOMMU - bool "K8 GART IOMMU support" +# Mark as embedded because too many people got it wrong. +# The code disables itself when not needed. +config IOMMU + bool "IOMMU support" if EMBEDDED default y select SWIOTLB select AGP depends on PCI help - Support for hardware IOMMU in AMD's Opteron/Athlon64 Processors - and for the bounce buffering software IOMMU. - Needed to run systems with more than 3GB of memory properly with - 32-bit PCI devices that do not support DAC (Double Address Cycle). - The IOMMU can be turned off at runtime with the iommu=off parameter. - Normally the kernel will take the right choice by itself. - This option includes a driver for the AMD Opteron/Athlon64 IOMMU - northbridge and a software emulation used on other systems without - hardware IOMMU. If unsure, say Y. - -# need this always selected by GART_IOMMU for the VIA workaround + Support for full DMA access of devices with 32bit memory access only + on systems with more than 3GB. This is usually needed for USB, + sound, many IDE/SATA chipsets and some other devices. + Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART + based IOMMU and a software bounce buffer based IOMMU used on Intel + systems and as fallback. + The code is only active when needed (enough memory and limited + device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified + too. + +config CALGARY_IOMMU + bool "IBM Calgary IOMMU support" + select SWIOTLB + depends on PCI && EXPERIMENTAL + help + Support for hardware IOMMUs in IBM's xSeries x366 and x460 + systems. Needed to run systems with more than 3GB of memory + properly with 32-bit PCI devices that do not support DAC + (Double Address Cycle). Calgary also supports bus level + isolation, where all DMAs pass through the IOMMU. This + prevents them from going anywhere except their intended + destination. This catches hard-to-find kernel bugs and + mis-behaving drivers and devices that do not use the DMA-API + properly to set up their DMA buffers. The IOMMU can be + turned off at boot time with the iommu=off parameter. + Normally the kernel will make the right choice by itself. + If unsure, say Y. + +# need this always selected by IOMMU for the VIA workaround config SWIOTLB bool @@ -433,15 +481,14 @@ config X86_MCE_AMD the DRAM Error Threshold. config KEXEC - bool "kexec system call (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "kexec system call" help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot - but it is indepedent of the system firmware. And like a reboot + but it is independent of the system firmware. And like a reboot you can start any kernel with it, not just Linux. - The name comes from the similiarity to the exec system call. + The name comes from the similarity to the exec system call. It is an ongoing process to be certain the hardware in a machine is properly shutdown, so do not be surprised if this code does not @@ -453,7 +500,14 @@ config CRASH_DUMP bool "kernel crash dumps (EXPERIMENTAL)" depends on EXPERIMENTAL help - Generate crash dump after being started by kexec. + Generate crash dump after being started by kexec. + This should be normally only set in special crash dump kernels + which are loaded in the main kernel with kexec-tools into + a specially reserved region and then later executed after + a crash by kdump/kexec. The crash dump kernel must be compiled + to a memory address not used by the main kernel or BIOS using + PHYSICAL_START. + For more details see Documentation/kdump/kdump.txt config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) @@ -491,6 +545,30 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +config CC_STACKPROTECTOR + bool "Enable -fstack-protector buffer overflow detection (EXPRIMENTAL)" + depends on EXPERIMENTAL + help + This option turns on the -fstack-protector GCC feature. This + feature puts, at the beginning of critical functions, a canary + value on the stack just before the return address, and validates + the value just before actually returning. Stack based buffer + overflows (that need to overwrite this return address) now also + overwrite the canary, which gets detected and the attack is then + neutralized via a kernel panic. + + This feature requires gcc version 4.2 or above, or a distribution + gcc with the feature backported. Older versions are automatically + detected and for those versions, this configuration option is ignored. + +config CC_STACKPROTECTOR_ALL + bool "Use stack-protector for all functions" + depends on CC_STACKPROTECTOR + help + Normally, GCC only inserts the canary value protection for + functions that use large-ish on-stack buffers. By enabling + this option, GCC will be asked to do this for ALL functions. + source kernel/Kconfig.hz config REORDER @@ -501,6 +579,10 @@ config REORDER optimal TLB usage. If you have pretty much any version of binutils, this can increase your kernel build time by roughly one minute. +config K8_NB + def_bool y + depends on AGP_AMD64 || IOMMU || (PCI && NUMA) + endmenu # diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug index ea31b4c62105..775d211a5cf9 100644 --- a/arch/x86_64/Kconfig.debug +++ b/arch/x86_64/Kconfig.debug @@ -1,5 +1,9 @@ menu "Kernel hacking" +config TRACE_IRQFLAGS_SUPPORT + bool + default y + source "lib/Kconfig.debug" config DEBUG_RODATA @@ -13,7 +17,7 @@ config DEBUG_RODATA If in doubt, say "N". config IOMMU_DEBUG - depends on GART_IOMMU && DEBUG_KERNEL + depends on IOMMU && DEBUG_KERNEL bool "Enable IOMMU debugging" help Force the IOMMU to on even when you have less than 4GB of @@ -35,6 +39,22 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +config DEBUG_STACKOVERFLOW + bool "Check for stack overflows" + depends on DEBUG_KERNEL + help + This option will cause messages to be printed if free stack space + drops below a certain limit. + +config DEBUG_STACK_USAGE + bool "Stack utilization instrumentation" + depends on DEBUG_KERNEL + help + Enables the display of the minimum amount of free stack which each + task has ever had available in the sysrq-T and sysrq-P debug output. + + This option will slow down process creation somewhat. + #config X86_REMOTE_DEBUG # bool "kgdb debugging stub" diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index e573e2ab5510..1c0f18d4f887 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -27,6 +27,7 @@ LDFLAGS_vmlinux := CHECKFLAGS += -D__x86_64__ -m64 cflags-y := +cflags-kernel-y := cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) @@ -35,7 +36,7 @@ cflags-y += -m64 cflags-y += -mno-red-zone cflags-y += -mcmodel=kernel cflags-y += -pipe -cflags-$(CONFIG_REORDER) += -ffunction-sections +cflags-kernel-$(CONFIG_REORDER) += -ffunction-sections # this makes reading assembly source easier, but produces worse code # actually it makes the kernel smaller too. cflags-y += -fno-reorder-blocks @@ -53,8 +54,19 @@ endif cflags-y += $(call cc-option,-funit-at-a-time) # prevent gcc from generating any FP code by mistake cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) +# do binutils support CFI? +cflags-y += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) + +# is .cfi_signal_frame supported too? +cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) +AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) + +cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector ) +cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector-all ) CFLAGS += $(cflags-y) +CFLAGS_KERNEL += $(cflags-kernel-y) AFLAGS += -m64 head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile index 43ee6c50c277..deb063e7762d 100644 --- a/arch/x86_64/boot/Makefile +++ b/arch/x86_64/boot/Makefile @@ -107,8 +107,13 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf isoimage: $(BOOTIMAGE) -rm -rf $(obj)/isoimage mkdir $(obj)/isoimage - cp `echo /usr/lib*/syslinux/isolinux.bin | awk '{ print $1; }'` \ - $(obj)/isoimage + for i in lib lib64 share end ; do \ + if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ + cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ + break ; \ + fi ; \ + if [ $$i = end ] ; then exit 1 ; fi ; \ + done cp $(BOOTIMAGE) $(obj)/isoimage/linux echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg if [ -f '$(FDINITRD)' ] ; then \ diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile index f89d96f11a9f..e70fa6e1da08 100644 --- a/arch/x86_64/boot/compressed/Makefile +++ b/arch/x86_64/boot/compressed/Makefile @@ -7,7 +7,8 @@ # targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o -EXTRA_AFLAGS := -traditional -m32 +EXTRA_AFLAGS := -traditional +AFLAGS := $(subst -m64,-m32,$(AFLAGS)) # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with # -m32 diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index cf4b88c416dc..3755b2e394d0 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -77,11 +77,11 @@ static void gzip_release(void **); */ static unsigned char *real_mode; /* Pointer to real-mode data */ -#define EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) +#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) #ifndef STANDARD_MEMORY_BIOS_CALL -#define ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) +#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) #endif -#define SCREEN_INFO (*(struct screen_info *)(real_mode+0)) +#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0)) extern unsigned char input_data[]; extern int input_len; @@ -92,9 +92,9 @@ static unsigned long output_ptr = 0; static void *malloc(int size); static void free(void *where); - -void* memset(void* s, int c, unsigned n); -void* memcpy(void* dest, const void* src, unsigned n); + +static void *memset(void *s, int c, unsigned n); +static void *memcpy(void *dest, const void *src, unsigned n); static void putstr(const char *); @@ -162,8 +162,8 @@ static void putstr(const char *s) int x,y,pos; char c; - x = SCREEN_INFO.orig_x; - y = SCREEN_INFO.orig_y; + x = RM_SCREEN_INFO.orig_x; + y = RM_SCREEN_INFO.orig_y; while ( ( c = *s++ ) != '\0' ) { if ( c == '\n' ) { @@ -184,8 +184,8 @@ static void putstr(const char *s) } } - SCREEN_INFO.orig_x = x; - SCREEN_INFO.orig_y = y; + RM_SCREEN_INFO.orig_x = x; + RM_SCREEN_INFO.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ outb_p(14, vidport); @@ -194,7 +194,7 @@ static void putstr(const char *s) outb_p(0xff & (pos >> 1), vidport+1); } -void* memset(void* s, int c, unsigned n) +static void* memset(void* s, int c, unsigned n) { int i; char *ss = (char*)s; @@ -203,7 +203,7 @@ void* memset(void* s, int c, unsigned n) return s; } -void* memcpy(void* dest, const void* src, unsigned n) +static void* memcpy(void* dest, const void* src, unsigned n) { int i; char *d = (char *)dest, *s = (char *)src; @@ -278,15 +278,15 @@ static void error(char *x) putstr(x); putstr("\n\n -- System halted"); - while(1); + while(1); /* Halt */ } -void setup_normal_output_buffer(void) +static void setup_normal_output_buffer(void) { #ifdef STANDARD_MEMORY_BIOS_CALL - if (EXT_MEM_K < 1024) error("Less than 2MB of memory"); + if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory"); #else - if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); + if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); #endif output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ free_mem_end_ptr = (long)real_mode; @@ -297,13 +297,13 @@ struct moveparams { uch *high_buffer_start; int hcount; }; -void setup_output_buffer_if_we_run_high(struct moveparams *mv) +static void setup_output_buffer_if_we_run_high(struct moveparams *mv) { high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE); #ifdef STANDARD_MEMORY_BIOS_CALL - if (EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); + if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); #else - if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); + if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); #endif mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START; low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX @@ -319,7 +319,7 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv) mv->high_buffer_start = high_buffer_start; } -void close_output_buffer_if_we_run_high(struct moveparams *mv) +static void close_output_buffer_if_we_run_high(struct moveparams *mv) { if (bytes_out > low_buffer_size) { mv->lcount = low_buffer_size; @@ -335,7 +335,7 @@ int decompress_kernel(struct moveparams *mv, void *rmode) { real_mode = rmode; - if (SCREEN_INFO.orig_video_mode == 7) { + if (RM_SCREEN_INFO.orig_video_mode == 7) { vidmem = (char *) 0xb0000; vidport = 0x3b4; } else { @@ -343,8 +343,8 @@ int decompress_kernel(struct moveparams *mv, void *rmode) vidport = 0x3d4; } - lines = SCREEN_INFO.orig_video_lines; - cols = SCREEN_INFO.orig_video_cols; + lines = RM_SCREEN_INFO.orig_video_lines; + cols = RM_SCREEN_INFO.orig_video_cols; if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); else setup_output_buffer_if_we_run_high(mv); diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index 12ea0b6c52e2..c3bfd223ab49 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -45,9 +45,8 @@ * Added long mode checking and SSE force. March 2003, Andi Kleen. */ -#include <linux/config.h> #include <asm/segment.h> -#include <linux/version.h> +#include <linux/utsrelease.h> #include <linux/compile.h> #include <asm/boot.h> #include <asm/e820.h> @@ -527,12 +526,12 @@ is_disk1: movw %cs, %ax # aka SETUPSEG subw $DELTA_INITSEG, %ax # aka INITSEG movw %ax, %ds - movw $0, (0x1ff) # default is no pointing device + movb $0, (0x1ff) # default is no pointing device int $0x11 # int 0x11: equipment list testb $0x04, %al # check if mouse installed jz no_psmouse - movw $0xAA, (0x1ff) # device present + movb $0xAA, (0x1ff) # device present no_psmouse: #include "../../i386/boot/edd.S" diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c index c44f5e2ec100..eae86691709a 100644 --- a/arch/x86_64/boot/tools/build.c +++ b/arch/x86_64/boot/tools/build.c @@ -149,10 +149,8 @@ int main(int argc, char ** argv) sz = sb.st_size; fprintf (stderr, "System is %d kB\n", sz/1024); sys_size = (sz + 15) / 16; - /* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */ - if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE)) - die("System is too big. Try using %smodules.", - is_big_kernel ? "" : "bzImage or "); + if (!is_big_kernel && sys_size > DEF_SYSSIZE) + die("System is too big. Try using bzImage or modules."); while (sz > 0) { int l, n; diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S index 32327bb37aff..2aa565c136e5 100644 --- a/arch/x86_64/boot/video.S +++ b/arch/x86_64/boot/video.S @@ -1929,6 +1929,7 @@ skip10: movb %ah, %al ret store_edid: +#ifdef CONFIG_FIRMWARE_EDID pushw %es # just save all registers pushw %ax pushw %bx @@ -1946,6 +1947,22 @@ store_edid: rep stosl + pushw %es # save ES + xorw %di, %di # Report Capability + pushw %di + popw %es # ES:DI must be 0:0 + movw $0x4f15, %ax + xorw %bx, %bx + xorw %cx, %cx + int $0x10 + popw %es # restore ES + + cmpb $0x00, %ah # call successful + jne no_edid + + cmpb $0x4f, %al # function supported + jne no_edid + movw $0x4f15, %ax # do VBE/DDC movw $0x01, %bx movw $0x00, %cx @@ -1953,12 +1970,14 @@ store_edid: movw $0x140, %di int $0x10 +no_edid: popw %di # restore all registers popw %dx popw %cx popw %bx popw %ax popw %es +#endif ret # VIDEO_SELECT-only variables diff --git a/arch/x86_64/crypto/Makefile b/arch/x86_64/crypto/Makefile index 426d20f4b72e..15b538a8b7f7 100644 --- a/arch/x86_64/crypto/Makefile +++ b/arch/x86_64/crypto/Makefile @@ -5,5 +5,8 @@ # obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o +obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o aes-x86_64-y := aes-x86_64-asm.o aes.o +twofish-x86_64-y := twofish-x86_64-asm.o twofish.o + diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S index 483cbb23ab8d..26b40de4d0b0 100644 --- a/arch/x86_64/crypto/aes-x86_64-asm.S +++ b/arch/x86_64/crypto/aes-x86_64-asm.S @@ -15,6 +15,10 @@ .text +#include <asm/asm-offsets.h> + +#define BASE crypto_tfm_ctx_offset + #define R1 %rax #define R1E %eax #define R1X %ax @@ -46,19 +50,19 @@ #define R10 %r10 #define R11 %r11 -#define prologue(FUNC,BASE,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ +#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ .global FUNC; \ .type FUNC,@function; \ .align 8; \ FUNC: movq r1,r2; \ movq r3,r4; \ - leaq BASE+52(r8),r9; \ + leaq BASE+KEY+52(r8),r9; \ movq r10,r11; \ movl (r7),r5 ## E; \ movl 4(r7),r1 ## E; \ movl 8(r7),r6 ## E; \ movl 12(r7),r7 ## E; \ - movl (r8),r10 ## E; \ + movl BASE(r8),r10 ## E; \ xorl -48(r9),r5 ## E; \ xorl -44(r9),r1 ## E; \ xorl -40(r9),r6 ## E; \ @@ -128,8 +132,8 @@ FUNC: movq r1,r2; \ movl r3 ## E,r1 ## E; \ movl r4 ## E,r2 ## E; -#define entry(FUNC,BASE,B128,B192) \ - prologue(FUNC,BASE,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) +#define entry(FUNC,KEY,B128,B192) \ + prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) #define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11) @@ -147,9 +151,9 @@ FUNC: movq r1,r2; \ #define decrypt_final(TAB,OFFSET) \ round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) -/* void aes_encrypt(void *ctx, u8 *out, const u8 *in) */ +/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ - entry(aes_encrypt,0,enc128,enc192) + entry(aes_enc_blk,0,enc128,enc192) encrypt_round(aes_ft_tab,-96) encrypt_round(aes_ft_tab,-80) enc192: encrypt_round(aes_ft_tab,-64) @@ -166,9 +170,9 @@ enc128: encrypt_round(aes_ft_tab,-32) encrypt_final(aes_fl_tab,112) return -/* void aes_decrypt(void *ctx, u8 *out, const u8 *in) */ +/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ - entry(aes_decrypt,240,dec128,dec192) + entry(aes_dec_blk,240,dec128,dec192) decrypt_round(aes_it_tab,-96) decrypt_round(aes_it_tab,-80) dec192: decrypt_round(aes_it_tab,-64) diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c index 6f77e7700d32..5cdb13ea5cc2 100644 --- a/arch/x86_64/crypto/aes.c +++ b/arch/x86_64/crypto/aes.c @@ -227,14 +227,15 @@ static void __init gen_tabs(void) t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t; \ } -static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, - u32 *flags) +static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) { - struct aes_ctx *ctx = ctx_arg; + struct aes_ctx *ctx = crypto_tfm_ctx(tfm); const __le32 *key = (const __le32 *)in_key; + u32 *flags = &tfm->crt_flags; u32 i, j, t, u, v, w; - if (key_len != 16 && key_len != 24 && key_len != 32) { + if (key_len % 8) { *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; return -EINVAL; } @@ -283,8 +284,18 @@ static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, return 0; } -extern void aes_encrypt(void *ctx_arg, u8 *out, const u8 *in); -extern void aes_decrypt(void *ctx_arg, u8 *out, const u8 *in); +asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); +asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); + +static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + aes_enc_blk(tfm, dst, src); +} + +static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + aes_dec_blk(tfm, dst, src); +} static struct crypto_alg aes_alg = { .cra_name = "aes", diff --git a/arch/x86_64/crypto/twofish-x86_64-asm.S b/arch/x86_64/crypto/twofish-x86_64-asm.S new file mode 100644 index 000000000000..35974a586615 --- /dev/null +++ b/arch/x86_64/crypto/twofish-x86_64-asm.S @@ -0,0 +1,324 @@ +/*************************************************************************** +* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> * +* * +* This program is free software; you can redistribute it and/or modify * +* it under the terms of the GNU General Public License as published by * +* the Free Software Foundation; either version 2 of the License, or * +* (at your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with this program; if not, write to the * +* Free Software Foundation, Inc., * +* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * +***************************************************************************/ + +.file "twofish-x86_64-asm.S" +.text + +#include <asm/asm-offsets.h> + +#define a_offset 0 +#define b_offset 4 +#define c_offset 8 +#define d_offset 12 + +/* Structure of the crypto context struct*/ + +#define s0 0 /* S0 Array 256 Words each */ +#define s1 1024 /* S1 Array */ +#define s2 2048 /* S2 Array */ +#define s3 3072 /* S3 Array */ +#define w 4096 /* 8 whitening keys (word) */ +#define k 4128 /* key 1-32 ( word ) */ + +/* define a few register aliases to allow macro substitution */ + +#define R0 %rax +#define R0D %eax +#define R0B %al +#define R0H %ah + +#define R1 %rbx +#define R1D %ebx +#define R1B %bl +#define R1H %bh + +#define R2 %rcx +#define R2D %ecx +#define R2B %cl +#define R2H %ch + +#define R3 %rdx +#define R3D %edx +#define R3B %dl +#define R3H %dh + + +/* performs input whitening */ +#define input_whitening(src,context,offset)\ + xor w+offset(context), src; + +/* performs input whitening */ +#define output_whitening(src,context,offset)\ + xor w+16+offset(context), src; + + +/* + * a input register containing a (rotated 16) + * b input register containing b + * c input register containing c + * d input register containing d (already rol $1) + * operations on a and b are interleaved to increase performance + */ +#define encrypt_round(a,b,c,d,round)\ + movzx b ## B, %edi;\ + mov s1(%r11,%rdi,4),%r8d;\ + movzx a ## B, %edi;\ + mov s2(%r11,%rdi,4),%r9d;\ + movzx b ## H, %edi;\ + ror $16, b ## D;\ + xor s2(%r11,%rdi,4),%r8d;\ + movzx a ## H, %edi;\ + ror $16, a ## D;\ + xor s3(%r11,%rdi,4),%r9d;\ + movzx b ## B, %edi;\ + xor s3(%r11,%rdi,4),%r8d;\ + movzx a ## B, %edi;\ + xor (%r11,%rdi,4), %r9d;\ + movzx b ## H, %edi;\ + ror $15, b ## D;\ + xor (%r11,%rdi,4), %r8d;\ + movzx a ## H, %edi;\ + xor s1(%r11,%rdi,4),%r9d;\ + add %r8d, %r9d;\ + add %r9d, %r8d;\ + add k+round(%r11), %r9d;\ + xor %r9d, c ## D;\ + rol $15, c ## D;\ + add k+4+round(%r11),%r8d;\ + xor %r8d, d ## D; + +/* + * a input register containing a(rotated 16) + * b input register containing b + * c input register containing c + * d input register containing d (already rol $1) + * operations on a and b are interleaved to increase performance + * during the round a and b are prepared for the output whitening + */ +#define encrypt_last_round(a,b,c,d,round)\ + mov b ## D, %r10d;\ + shl $32, %r10;\ + movzx b ## B, %edi;\ + mov s1(%r11,%rdi,4),%r8d;\ + movzx a ## B, %edi;\ + mov s2(%r11,%rdi,4),%r9d;\ + movzx b ## H, %edi;\ + ror $16, b ## D;\ + xor s2(%r11,%rdi,4),%r8d;\ + movzx a ## H, %edi;\ + ror $16, a ## D;\ + xor s3(%r11,%rdi,4),%r9d;\ + movzx b ## B, %edi;\ + xor s3(%r11,%rdi,4),%r8d;\ + movzx a ## B, %edi;\ + xor (%r11,%rdi,4), %r9d;\ + xor a, %r10;\ + movzx b ## H, %edi;\ + xor (%r11,%rdi,4), %r8d;\ + movzx a ## H, %edi;\ + xor s1(%r11,%rdi,4),%r9d;\ + add %r8d, %r9d;\ + add %r9d, %r8d;\ + add k+round(%r11), %r9d;\ + xor %r9d, c ## D;\ + ror $1, c ## D;\ + add k+4+round(%r11),%r8d;\ + xor %r8d, d ## D + +/* + * a input register containing a + * b input register containing b (rotated 16) + * c input register containing c (already rol $1) + * d input register containing d + * operations on a and b are interleaved to increase performance + */ +#define decrypt_round(a,b,c,d,round)\ + movzx a ## B, %edi;\ + mov (%r11,%rdi,4), %r9d;\ + movzx b ## B, %edi;\ + mov s3(%r11,%rdi,4),%r8d;\ + movzx a ## H, %edi;\ + ror $16, a ## D;\ + xor s1(%r11,%rdi,4),%r9d;\ + movzx b ## H, %edi;\ + ror $16, b ## D;\ + xor (%r11,%rdi,4), %r8d;\ + movzx a ## B, %edi;\ + xor s2(%r11,%rdi,4),%r9d;\ + movzx b ## B, %edi;\ + xor s1(%r11,%rdi,4),%r8d;\ + movzx a ## H, %edi;\ + ror $15, a ## D;\ + xor s3(%r11,%rdi,4),%r9d;\ + movzx b ## H, %edi;\ + xor s2(%r11,%rdi,4),%r8d;\ + add %r8d, %r9d;\ + add %r9d, %r8d;\ + add k+round(%r11), %r9d;\ + xor %r9d, c ## D;\ + add k+4+round(%r11),%r8d;\ + xor %r8d, d ## D;\ + rol $15, d ## D; + +/* + * a input register containing a + * b input register containing b + * c input register containing c (already rol $1) + * d input register containing d + * operations on a and b are interleaved to increase performance + * during the round a and b are prepared for the output whitening + */ +#define decrypt_last_round(a,b,c,d,round)\ + movzx a ## B, %edi;\ + mov (%r11,%rdi,4), %r9d;\ + movzx b ## B, %edi;\ + mov s3(%r11,%rdi,4),%r8d;\ + movzx b ## H, %edi;\ + ror $16, b ## D;\ + xor (%r11,%rdi,4), %r8d;\ + movzx a ## H, %edi;\ + mov b ## D, %r10d;\ + shl $32, %r10;\ + xor a, %r10;\ + ror $16, a ## D;\ + xor s1(%r11,%rdi,4),%r9d;\ + movzx b ## B, %edi;\ + xor s1(%r11,%rdi,4),%r8d;\ + movzx a ## B, %edi;\ + xor s2(%r11,%rdi,4),%r9d;\ + movzx b ## H, %edi;\ + xor s2(%r11,%rdi,4),%r8d;\ + movzx a ## H, %edi;\ + xor s3(%r11,%rdi,4),%r9d;\ + add %r8d, %r9d;\ + add %r9d, %r8d;\ + add k+round(%r11), %r9d;\ + xor %r9d, c ## D;\ + add k+4+round(%r11),%r8d;\ + xor %r8d, d ## D;\ + ror $1, d ## D; + +.align 8 +.global twofish_enc_blk +.global twofish_dec_blk + +twofish_enc_blk: + pushq R1 + + /* %rdi contains the crypto tfm adress */ + /* %rsi contains the output adress */ + /* %rdx contains the input adress */ + add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ + /* ctx adress is moved to free one non-rex register + as target for the 8bit high operations */ + mov %rdi, %r11 + + movq (R3), R1 + movq 8(R3), R3 + input_whitening(R1,%r11,a_offset) + input_whitening(R3,%r11,c_offset) + mov R1D, R0D + rol $16, R0D + shr $32, R1 + mov R3D, R2D + shr $32, R3 + rol $1, R3D + + encrypt_round(R0,R1,R2,R3,0); + encrypt_round(R2,R3,R0,R1,8); + encrypt_round(R0,R1,R2,R3,2*8); + encrypt_round(R2,R3,R0,R1,3*8); + encrypt_round(R0,R1,R2,R3,4*8); + encrypt_round(R2,R3,R0,R1,5*8); + encrypt_round(R0,R1,R2,R3,6*8); + encrypt_round(R2,R3,R0,R1,7*8); + encrypt_round(R0,R1,R2,R3,8*8); + encrypt_round(R2,R3,R0,R1,9*8); + encrypt_round(R0,R1,R2,R3,10*8); + encrypt_round(R2,R3,R0,R1,11*8); + encrypt_round(R0,R1,R2,R3,12*8); + encrypt_round(R2,R3,R0,R1,13*8); + encrypt_round(R0,R1,R2,R3,14*8); + encrypt_last_round(R2,R3,R0,R1,15*8); + + + output_whitening(%r10,%r11,a_offset) + movq %r10, (%rsi) + + shl $32, R1 + xor R0, R1 + + output_whitening(R1,%r11,c_offset) + movq R1, 8(%rsi) + + popq R1 + movq $1,%rax + ret + +twofish_dec_blk: + pushq R1 + + /* %rdi contains the crypto tfm adress */ + /* %rsi contains the output adress */ + /* %rdx contains the input adress */ + add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ + /* ctx adress is moved to free one non-rex register + as target for the 8bit high operations */ + mov %rdi, %r11 + + movq (R3), R1 + movq 8(R3), R3 + output_whitening(R1,%r11,a_offset) + output_whitening(R3,%r11,c_offset) + mov R1D, R0D + shr $32, R1 + rol $16, R1D + mov R3D, R2D + shr $32, R3 + rol $1, R2D + + decrypt_round(R0,R1,R2,R3,15*8); + decrypt_round(R2,R3,R0,R1,14*8); + decrypt_round(R0,R1,R2,R3,13*8); + decrypt_round(R2,R3,R0,R1,12*8); + decrypt_round(R0,R1,R2,R3,11*8); + decrypt_round(R2,R3,R0,R1,10*8); + decrypt_round(R0,R1,R2,R3,9*8); + decrypt_round(R2,R3,R0,R1,8*8); + decrypt_round(R0,R1,R2,R3,7*8); + decrypt_round(R2,R3,R0,R1,6*8); + decrypt_round(R0,R1,R2,R3,5*8); + decrypt_round(R2,R3,R0,R1,4*8); + decrypt_round(R0,R1,R2,R3,3*8); + decrypt_round(R2,R3,R0,R1,2*8); + decrypt_round(R0,R1,R2,R3,1*8); + decrypt_last_round(R2,R3,R0,R1,0); + + input_whitening(%r10,%r11,a_offset) + movq %r10, (%rsi) + + shl $32, R1 + xor R0, R1 + + input_whitening(R1,%r11,c_offset) + movq R1, 8(%rsi) + + popq R1 + movq $1,%rax + ret diff --git a/arch/x86_64/crypto/twofish.c b/arch/x86_64/crypto/twofish.c new file mode 100644 index 000000000000..182d91d5cfb9 --- /dev/null +++ b/arch/x86_64/crypto/twofish.c @@ -0,0 +1,97 @@ +/* + * Glue Code for optimized x86_64 assembler version of TWOFISH + * + * Originally Twofish for GPG + * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998 + * 256-bit key length added March 20, 1999 + * Some modifications to reduce the text size by Werner Koch, April, 1998 + * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com> + * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net> + * + * The original author has disclaimed all copyright interest in this + * code and thus put it in the public domain. The subsequent authors + * have put this under the GNU General Public License. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + * This code is a "clean room" implementation, written from the paper + * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey, + * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available + * through http://www.counterpane.com/twofish.html + * + * For background information on multiplication in finite fields, used for + * the matrix operations in the key schedule, see the book _Contemporary + * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the + * Third Edition. + */ + +#include <crypto/twofish.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> + +asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); +asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); + +static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + twofish_enc_blk(tfm, dst, src); +} + +static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + twofish_dec_blk(tfm, dst, src); +} + +static struct crypto_alg alg = { + .cra_name = "twofish", + .cra_driver_name = "twofish-x86_64", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 3, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = TF_MIN_KEY_SIZE, + .cia_max_keysize = TF_MAX_KEY_SIZE, + .cia_setkey = twofish_setkey, + .cia_encrypt = twofish_encrypt, + .cia_decrypt = twofish_decrypt + } + } +}; + +static int __init init(void) +{ + return crypto_register_alg(&alg); +} + +static void __exit fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized"); +MODULE_ALIAS("twofish"); diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 69db0c0721d1..4844b543bed0 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,11 +1,14 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.17-rc1-git11 -# Sun Apr 16 07:22:36 2006 +# Linux kernel version: 2.6.18-git7 +# Wed Sep 27 21:53:10 2006 # CONFIG_X86_64=y CONFIG_64BIT=y CONFIG_X86=y +CONFIG_ZONE_DMA32=y +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y CONFIG_SEMAPHORE_SLEEPERS=y CONFIG_MMU=y CONFIG_RWSEM_GENERIC_SPINLOCK=y @@ -17,6 +20,8 @@ CONFIG_GENERIC_ISA_DMA=y CONFIG_GENERIC_IOMAP=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y CONFIG_DMI=y +CONFIG_AUDIT_ARCH=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # # Code maturity level options @@ -34,17 +39,17 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set -CONFIG_SYSCTL=y +# CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y # CONFIG_CPUSETS is not set # CONFIG_RELAY is not set CONFIG_INITRAMFS_SOURCE="" -CONFIG_UID16=y -CONFIG_VM86=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y # CONFIG_EMBEDDED is not set +CONFIG_UID16=y +CONFIG_SYSCTL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set @@ -57,7 +62,8 @@ CONFIG_FUTEX=y CONFIG_EPOLL=y CONFIG_SHMEM=y CONFIG_SLAB=y -CONFIG_DOUBLEFAULT=y +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 # CONFIG_SLOB is not set @@ -138,13 +144,16 @@ CONFIG_NEED_MULTIPLE_NODES=y # CONFIG_SPARSEMEM_STATIC is not set CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_MIGRATION=y +CONFIG_RESOURCES_64BIT=y CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID=y CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y CONFIG_NR_CPUS=32 CONFIG_HOTPLUG_CPU=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_HPET_TIMER=y CONFIG_HPET_EMULATE_RTC=y -CONFIG_GART_IOMMU=y +CONFIG_IOMMU=y +# CONFIG_CALGARY_IOMMU is not set CONFIG_SWIOTLB=y CONFIG_X86_MCE=y CONFIG_X86_MCE_INTEL=y @@ -153,11 +162,13 @@ CONFIG_X86_MCE_AMD=y # CONFIG_CRASH_DUMP is not set CONFIG_PHYSICAL_START=0x200000 CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set CONFIG_HZ_250=y # CONFIG_HZ_1000 is not set CONFIG_HZ=250 # CONFIG_REORDER is not set +CONFIG_K8_NB=y CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_ISA_DMA_API=y @@ -169,6 +180,7 @@ CONFIG_GENERIC_PENDING_IRQ=y CONFIG_PM=y # CONFIG_PM_LEGACY is not set # CONFIG_PM_DEBUG is not set +# CONFIG_PM_SYSFS_DEPRECATED is not set CONFIG_SOFTWARE_SUSPEND=y CONFIG_PM_STD_PARTITION="" CONFIG_SUSPEND_SMP=y @@ -186,13 +198,14 @@ CONFIG_ACPI_BUTTON=y # CONFIG_ACPI_VIDEO is not set # CONFIG_ACPI_HOTKEY is not set CONFIG_ACPI_FAN=y +# CONFIG_ACPI_DOCK is not set CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_THERMAL=y CONFIG_ACPI_NUMA=y # CONFIG_ACPI_ASUS is not set # CONFIG_ACPI_IBM is not set -CONFIG_ACPI_TOSHIBA=y +# CONFIG_ACPI_TOSHIBA is not set CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set CONFIG_ACPI_EC=y @@ -200,14 +213,14 @@ CONFIG_ACPI_POWER=y CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y CONFIG_ACPI_CONTAINER=y -CONFIG_ACPI_HOTPLUG_MEMORY=y +# CONFIG_ACPI_SBS is not set # # CPU Frequency scaling # CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_TABLE=y -# CONFIG_CPU_FREQ_DEBUG is not set +CONFIG_CPU_FREQ_DEBUG=y CONFIG_CPU_FREQ_STAT=y # CONFIG_CPU_FREQ_STAT_DETAILS is not set CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y @@ -240,6 +253,7 @@ CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y CONFIG_PCIEPORTBUS=y CONFIG_PCI_MSI=y +# CONFIG_PCI_MULTITHREAD_PROBE is not set # CONFIG_PCI_DEBUG is not set # @@ -293,19 +307,29 @@ CONFIG_IP_PNP_DHCP=y # CONFIG_INET_IPCOMP is not set # CONFIG_INET_XFRM_TUNNEL is not set # CONFIG_INET_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set -CONFIG_TCP_CONG_BIC=y +CONFIG_TCP_CONG_CUBIC=y +CONFIG_DEFAULT_TCP_CONG="cubic" CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set # CONFIG_INET6_AH is not set # CONFIG_INET6_ESP is not set # CONFIG_INET6_IPCOMP is not set +# CONFIG_IPV6_MIP6 is not set # CONFIG_INET6_XFRM_TUNNEL is not set # CONFIG_INET6_TUNNEL is not set +# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET6_XFRM_MODE_TUNNEL is not set +# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set # CONFIG_IPV6_TUNNEL is not set +# CONFIG_IPV6_SUBTREES is not set +# CONFIG_IPV6_MULTIPLE_TABLES is not set +# CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set # @@ -331,7 +355,6 @@ CONFIG_IPV6=y # CONFIG_ATALK is not set # CONFIG_X25 is not set # CONFIG_LAPB is not set -# CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set @@ -344,6 +367,7 @@ CONFIG_IPV6=y # Network testing # # CONFIG_NET_PKTGEN is not set +# CONFIG_NET_TCPPROBE is not set # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set @@ -360,6 +384,7 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set +# CONFIG_SYS_HYPERVISOR is not set # # Connector - unified userspace <-> kernelspace linker @@ -398,6 +423,7 @@ CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set @@ -470,6 +496,7 @@ CONFIG_IDEDMA_AUTO=y # # CONFIG_RAID_ATTRS is not set CONFIG_SCSI=y +CONFIG_SCSI_NETLINK=y # CONFIG_SCSI_PROC_FS is not set # @@ -478,8 +505,9 @@ CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y # CONFIG_CHR_DEV_ST is not set # CONFIG_CHR_DEV_OSST is not set -# CONFIG_BLK_DEV_SR is not set -# CONFIG_CHR_DEV_SG is not set +CONFIG_BLK_DEV_SR=y +# CONFIG_BLK_DEV_SR_VENDOR is not set +CONFIG_CHR_DEV_SG=y # CONFIG_CHR_DEV_SCH is not set # @@ -490,12 +518,13 @@ CONFIG_SCSI_CONSTANTS=y # CONFIG_SCSI_LOGGING is not set # -# SCSI Transport Attributes +# SCSI Transports # CONFIG_SCSI_SPI_ATTRS=y CONFIG_SCSI_FC_ATTRS=y # CONFIG_SCSI_ISCSI_ATTRS is not set -# CONFIG_SCSI_SAS_ATTRS is not set +CONFIG_SCSI_SAS_ATTRS=y +# CONFIG_SCSI_SAS_LIBSAS is not set # # SCSI low-level drivers @@ -514,28 +543,14 @@ CONFIG_AIC79XX_RESET_DELAY_MS=4000 # CONFIG_AIC79XX_DEBUG_ENABLE is not set CONFIG_AIC79XX_DEBUG_MASK=0 # CONFIG_AIC79XX_REG_PRETTY_PRINT is not set +# CONFIG_SCSI_AIC94XX is not set +# CONFIG_SCSI_ARCMSR is not set CONFIG_MEGARAID_NEWGEN=y CONFIG_MEGARAID_MM=y CONFIG_MEGARAID_MAILBOX=y # CONFIG_MEGARAID_LEGACY is not set CONFIG_MEGARAID_SAS=y -CONFIG_SCSI_SATA=y -CONFIG_SCSI_SATA_AHCI=y -# CONFIG_SCSI_SATA_SVW is not set -CONFIG_SCSI_ATA_PIIX=y -# CONFIG_SCSI_SATA_MV is not set -CONFIG_SCSI_SATA_NV=y -# CONFIG_SCSI_PDC_ADMA is not set -# CONFIG_SCSI_SATA_QSTOR is not set -# CONFIG_SCSI_SATA_PROMISE is not set -# CONFIG_SCSI_SATA_SX4 is not set -CONFIG_SCSI_SATA_SIL=y -# CONFIG_SCSI_SATA_SIL24 is not set -# CONFIG_SCSI_SATA_SIS is not set -# CONFIG_SCSI_SATA_ULI is not set -CONFIG_SCSI_SATA_VIA=y -# CONFIG_SCSI_SATA_VITESSE is not set -CONFIG_SCSI_SATA_INTEL_COMBINED=y +# CONFIG_SCSI_HPTIOP is not set # CONFIG_SCSI_BUSLOGIC is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_EATA is not set @@ -544,6 +559,7 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y # CONFIG_SCSI_IPS is not set # CONFIG_SCSI_INITIO is not set # CONFIG_SCSI_INIA100 is not set +# CONFIG_SCSI_STEX is not set # CONFIG_SCSI_SYM53C8XX_2 is not set # CONFIG_SCSI_IPR is not set # CONFIG_SCSI_QLOGIC_1280 is not set @@ -554,6 +570,62 @@ CONFIG_SCSI_SATA_INTEL_COMBINED=y # CONFIG_SCSI_DEBUG is not set # +# Serial ATA (prod) and Parallel ATA (experimental) drivers +# +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_SVW=y +CONFIG_ATA_PIIX=y +# CONFIG_SATA_MV is not set +CONFIG_SATA_NV=y +# CONFIG_PDC_ADMA is not set +# CONFIG_SATA_QSTOR is not set +# CONFIG_SATA_PROMISE is not set +# CONFIG_SATA_SX4 is not set +CONFIG_SATA_SIL=y +# CONFIG_SATA_SIL24 is not set +# CONFIG_SATA_SIS is not set +# CONFIG_SATA_ULI is not set +CONFIG_SATA_VIA=y +# CONFIG_SATA_VITESSE is not set +CONFIG_SATA_INTEL_COMBINED=y +# CONFIG_PATA_ALI is not set +# CONFIG_PATA_AMD is not set +# CONFIG_PATA_ARTOP is not set +# CONFIG_PATA_ATIIXP is not set +# CONFIG_PATA_CMD64X is not set +# CONFIG_PATA_CS5520 is not set +# CONFIG_PATA_CS5530 is not set +# CONFIG_PATA_CYPRESS is not set +# CONFIG_PATA_EFAR is not set +# CONFIG_ATA_GENERIC is not set +# CONFIG_PATA_HPT366 is not set +# CONFIG_PATA_HPT37X is not set +# CONFIG_PATA_HPT3X2N is not set +# CONFIG_PATA_HPT3X3 is not set +# CONFIG_PATA_IT821X is not set +# CONFIG_PATA_JMICRON is not set +# CONFIG_PATA_LEGACY is not set +# CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MPIIX is not set +# CONFIG_PATA_OLDPIIX is not set +# CONFIG_PATA_NETCELL is not set +# CONFIG_PATA_NS87410 is not set +# CONFIG_PATA_OPTI is not set +# CONFIG_PATA_OPTIDMA is not set +# CONFIG_PATA_PDC_OLD is not set +# CONFIG_PATA_QDI is not set +# CONFIG_PATA_RADISYS is not set +# CONFIG_PATA_RZ1000 is not set +# CONFIG_PATA_SC1200 is not set +# CONFIG_PATA_SERVERWORKS is not set +# CONFIG_PATA_PDC2027X is not set +# CONFIG_PATA_SIL680 is not set +# CONFIG_PATA_SIS is not set +# CONFIG_PATA_VIA is not set +# CONFIG_PATA_WINBOND is not set + +# # Multi-device support (RAID and LVM) # CONFIG_MD=y @@ -571,7 +643,7 @@ CONFIG_BLK_DEV_DM=y CONFIG_FUSION=y CONFIG_FUSION_SPI=y # CONFIG_FUSION_FC is not set -# CONFIG_FUSION_SAS is not set +CONFIG_FUSION_SAS=y CONFIG_FUSION_MAX_SGE=128 # CONFIG_FUSION_CTL is not set @@ -591,10 +663,7 @@ CONFIG_IEEE1394=y # # Device Drivers # - -# -# Texas Instruments PCILynx requires I2C -# +# CONFIG_IEEE1394_PCILYNX is not set CONFIG_IEEE1394_OHCI1394=y # @@ -645,14 +714,24 @@ CONFIG_VORTEX=y # # Tulip family network device support # -# CONFIG_NET_TULIP is not set +CONFIG_NET_TULIP=y +# CONFIG_DE2104X is not set +CONFIG_TULIP=y +# CONFIG_TULIP_MWI is not set +# CONFIG_TULIP_MMIO is not set +# CONFIG_TULIP_NAPI is not set +# CONFIG_DE4X5 is not set +# CONFIG_WINBOND_840 is not set +# CONFIG_DM9102 is not set +# CONFIG_ULI526X is not set # CONFIG_HP100 is not set CONFIG_NET_PCI=y # CONFIG_PCNET32 is not set # CONFIG_AMD8111_ETH is not set # CONFIG_ADAPTEC_STARFIRE is not set -# CONFIG_B44 is not set +CONFIG_B44=y CONFIG_FORCEDETH=y +# CONFIG_FORCEDETH_NAPI is not set # CONFIG_DGRS is not set # CONFIG_EEPRO100 is not set CONFIG_E100=y @@ -688,7 +767,8 @@ CONFIG_E1000=y # CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y -# CONFIG_BNX2 is not set +CONFIG_BNX2=y +# CONFIG_QLA3XXX is not set # # Ethernet (10000 Mbit) @@ -697,6 +777,7 @@ CONFIG_TIGON3=y # CONFIG_IXGB is not set CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set +# CONFIG_MYRI10GE is not set # # Token Ring devices @@ -786,6 +867,7 @@ CONFIG_SERIO_LIBPS2=y CONFIG_VT=y CONFIG_VT_CONSOLE=y CONFIG_HW_CONSOLE=y +# CONFIG_VT_HW_CONSOLE_BINDING is not set # CONFIG_SERIAL_NONSTANDARD is not set # @@ -816,45 +898,11 @@ CONFIG_LEGACY_PTY_COUNT=256 # # Watchdog Cards # -CONFIG_WATCHDOG=y -# CONFIG_WATCHDOG_NOWAYOUT is not set - -# -# Watchdog Device Drivers -# -CONFIG_SOFT_WATCHDOG=y -# CONFIG_ACQUIRE_WDT is not set -# CONFIG_ADVANTECH_WDT is not set -# CONFIG_ALIM1535_WDT is not set -# CONFIG_ALIM7101_WDT is not set -# CONFIG_SC520_WDT is not set -# CONFIG_EUROTECH_WDT is not set -# CONFIG_IB700_WDT is not set -# CONFIG_IBMASR is not set -# CONFIG_WAFER_WDT is not set -# CONFIG_I6300ESB_WDT is not set -# CONFIG_I8XX_TCO is not set -# CONFIG_SC1200_WDT is not set -# CONFIG_60XX_WDT is not set -# CONFIG_SBC8360_WDT is not set -# CONFIG_CPU5_WDT is not set -# CONFIG_W83627HF_WDT is not set -# CONFIG_W83877F_WDT is not set -# CONFIG_W83977F_WDT is not set -# CONFIG_MACHZ_WDT is not set -# CONFIG_SBC_EPX_C3_WATCHDOG is not set - -# -# PCI-based Watchdog Cards -# -# CONFIG_PCIPCWATCHDOG is not set -# CONFIG_WDTPCI is not set - -# -# USB-based Watchdog Cards -# -# CONFIG_USBPCWATCHDOG is not set +# CONFIG_WATCHDOG is not set CONFIG_HW_RANDOM=y +CONFIG_HW_RANDOM_INTEL=y +CONFIG_HW_RANDOM_AMD=y +# CONFIG_HW_RANDOM_GEODE is not set # CONFIG_NVRAM is not set CONFIG_RTC=y # CONFIG_DTLK is not set @@ -871,6 +919,7 @@ CONFIG_AGP_INTEL=y # CONFIG_AGP_VIA is not set # CONFIG_DRM is not set # CONFIG_MWAVE is not set +# CONFIG_PC8736x_GPIO is not set CONFIG_RAW_DRIVER=y CONFIG_MAX_RAW_DEVS=256 CONFIG_HPET=y @@ -887,7 +936,56 @@ CONFIG_HPET_MMAP=y # # I2C support # -# CONFIG_I2C is not set +CONFIG_I2C=m +CONFIG_I2C_CHARDEV=m + +# +# I2C Algorithms +# +# CONFIG_I2C_ALGOBIT is not set +# CONFIG_I2C_ALGOPCF is not set +# CONFIG_I2C_ALGOPCA is not set + +# +# I2C Hardware Bus support +# +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI1563 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_AMD8111 is not set +# CONFIG_I2C_I801 is not set +# CONFIG_I2C_I810 is not set +# CONFIG_I2C_PIIX4 is not set +CONFIG_I2C_ISA=m +# CONFIG_I2C_NFORCE2 is not set +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_PROSAVAGE is not set +# CONFIG_I2C_SAVAGE4 is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set +# CONFIG_I2C_SIS96X is not set +# CONFIG_I2C_STUB is not set +# CONFIG_I2C_VIA is not set +# CONFIG_I2C_VIAPRO is not set +# CONFIG_I2C_VOODOO3 is not set +# CONFIG_I2C_PCA_ISA is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_SENSORS_DS1337 is not set +# CONFIG_SENSORS_DS1374 is not set +# CONFIG_SENSORS_EEPROM is not set +# CONFIG_SENSORS_PCF8574 is not set +# CONFIG_SENSORS_PCA9539 is not set +# CONFIG_SENSORS_PCF8591 is not set +# CONFIG_SENSORS_MAX6875 is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set # # SPI support @@ -898,14 +996,51 @@ CONFIG_HPET_MMAP=y # # Dallas's 1-wire bus # -# CONFIG_W1 is not set # # Hardware Monitoring support # CONFIG_HWMON=y # CONFIG_HWMON_VID is not set +# CONFIG_SENSORS_ABITUGURU is not set +# CONFIG_SENSORS_ADM1021 is not set +# CONFIG_SENSORS_ADM1025 is not set +# CONFIG_SENSORS_ADM1026 is not set +# CONFIG_SENSORS_ADM1031 is not set +# CONFIG_SENSORS_ADM9240 is not set +# CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATXP1 is not set +# CONFIG_SENSORS_DS1621 is not set # CONFIG_SENSORS_F71805F is not set +# CONFIG_SENSORS_FSCHER is not set +# CONFIG_SENSORS_FSCPOS is not set +# CONFIG_SENSORS_GL518SM is not set +# CONFIG_SENSORS_GL520SM is not set +# CONFIG_SENSORS_IT87 is not set +# CONFIG_SENSORS_LM63 is not set +# CONFIG_SENSORS_LM75 is not set +# CONFIG_SENSORS_LM77 is not set +# CONFIG_SENSORS_LM78 is not set +# CONFIG_SENSORS_LM80 is not set +# CONFIG_SENSORS_LM83 is not set +# CONFIG_SENSORS_LM85 is not set +# CONFIG_SENSORS_LM87 is not set +# CONFIG_SENSORS_LM90 is not set +# CONFIG_SENSORS_LM92 is not set +# CONFIG_SENSORS_MAX1619 is not set +# CONFIG_SENSORS_PC87360 is not set +# CONFIG_SENSORS_SIS5595 is not set +# CONFIG_SENSORS_SMSC47M1 is not set +# CONFIG_SENSORS_SMSC47M192 is not set +CONFIG_SENSORS_SMSC47B397=m +# CONFIG_SENSORS_VIA686A is not set +# CONFIG_SENSORS_VT8231 is not set +# CONFIG_SENSORS_W83781D is not set +# CONFIG_SENSORS_W83791D is not set +# CONFIG_SENSORS_W83792D is not set +# CONFIG_SENSORS_W83L785TS is not set +# CONFIG_SENSORS_W83627HF is not set +# CONFIG_SENSORS_W83627EHF is not set # CONFIG_SENSORS_HDAPS is not set # CONFIG_HWMON_DEBUG_CHIP is not set @@ -918,6 +1053,7 @@ CONFIG_HWMON=y # Multimedia devices # # CONFIG_VIDEO_DEV is not set +CONFIG_VIDEO_V4L2=y # # Digital Video Broadcasting Devices @@ -928,8 +1064,8 @@ CONFIG_HWMON=y # # Graphics support # +# CONFIG_FIRMWARE_EDID is not set # CONFIG_FB is not set -CONFIG_VIDEO_SELECT=y # # Console display driver support @@ -937,7 +1073,9 @@ CONFIG_VIDEO_SELECT=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256 +CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y +# CONFIG_BACKLIGHT_LCD_SUPPORT is not set # # Sound @@ -953,28 +1091,17 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -CONFIG_OBSOLETE_OSS_DRIVER=y +CONFIG_OSS_OBSOLETE_DRIVER=y # CONFIG_SOUND_BT878 is not set -# CONFIG_SOUND_CMPCI is not set # CONFIG_SOUND_EMU10K1 is not set # CONFIG_SOUND_FUSION is not set -# CONFIG_SOUND_CS4281 is not set -# CONFIG_SOUND_ES1370 is not set # CONFIG_SOUND_ES1371 is not set -# CONFIG_SOUND_ESSSOLO1 is not set -# CONFIG_SOUND_MAESTRO is not set -# CONFIG_SOUND_MAESTRO3 is not set CONFIG_SOUND_ICH=y -# CONFIG_SOUND_SONICVIBES is not set # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set # CONFIG_SOUND_MSNDPIN is not set # CONFIG_SOUND_VIA82CXXX is not set # CONFIG_SOUND_OSS is not set -# CONFIG_SOUND_ALI5455 is not set -# CONFIG_SOUND_FORTE is not set -# CONFIG_SOUND_RME96XX is not set -# CONFIG_SOUND_AD1980 is not set # # USB support @@ -1000,6 +1127,7 @@ CONFIG_USB_DEVICEFS=y CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_SPLIT_ISO is not set # CONFIG_USB_EHCI_ROOT_HUB_TT is not set +# CONFIG_USB_EHCI_TT_NEWSCHED is not set # CONFIG_USB_ISP116X_HCD is not set CONFIG_USB_OHCI_HCD=y # CONFIG_USB_OHCI_BIG_ENDIAN is not set @@ -1089,10 +1217,12 @@ CONFIG_USB_MON=y # CONFIG_USB_LEGOTOWER is not set # CONFIG_USB_LCD is not set # CONFIG_USB_LED is not set +# CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set # CONFIG_USB_PHIDGETKIT is not set # CONFIG_USB_PHIDGETSERVO is not set # CONFIG_USB_IDMOUSE is not set +# CONFIG_USB_APPLEDISPLAY is not set # CONFIG_USB_SISUSBVGA is not set # CONFIG_USB_LD is not set # CONFIG_USB_TEST is not set @@ -1128,7 +1258,6 @@ CONFIG_USB_MON=y # InfiniBand support # # CONFIG_INFINIBAND is not set -# CONFIG_IPATH_CORE is not set # # EDAC - error detection and reporting (RAS) (EXPERIMENTAL) @@ -1141,6 +1270,19 @@ CONFIG_USB_MON=y # CONFIG_RTC_CLASS is not set # +# DMA Engine support +# +# CONFIG_DMA_ENGINE is not set + +# +# DMA Clients +# + +# +# DMA Devices +# + +# # Firmware Drivers # # CONFIG_EDD is not set @@ -1175,9 +1317,10 @@ CONFIG_FS_POSIX_ACL=y # CONFIG_MINIX_FS is not set # CONFIG_ROMFS_FS is not set CONFIG_INOTIFY=y +CONFIG_INOTIFY_USER=y # CONFIG_QUOTA is not set CONFIG_DNOTIFY=y -CONFIG_AUTOFS_FS=y +# CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set @@ -1316,26 +1459,38 @@ CONFIG_KPROBES=y # # Kernel hacking # +CONFIG_TRACE_IRQFLAGS_SUPPORT=y # CONFIG_PRINTK_TIME is not set +# CONFIG_ENABLE_MUST_CHECK is not set CONFIG_MAGIC_SYSRQ=y +CONFIG_UNUSED_SYMBOLS=y CONFIG_DEBUG_KERNEL=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set # CONFIG_DEBUG_SLAB is not set -# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_INFO is not set CONFIG_DEBUG_FS=y # CONFIG_DEBUG_VM is not set # CONFIG_FRAME_POINTER is not set -# CONFIG_UNWIND_INFO is not set +CONFIG_UNWIND_INFO=y +CONFIG_STACK_UNWIND=y # CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_DEBUG_RODATA is not set # CONFIG_IOMMU_DEBUG is not set +CONFIG_DEBUG_STACKOVERFLOW=y +# CONFIG_DEBUG_STACK_USAGE is not set # # Security options @@ -1349,13 +1504,11 @@ CONFIG_DEBUG_FS=y # CONFIG_CRYPTO is not set # -# Hardware crypto devices -# - -# # Library routines # # CONFIG_CRC_CCITT is not set # CONFIG_CRC16 is not set CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set +CONFIG_ZLIB_INFLATE=y +CONFIG_PLIST=y diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile index e9263b4975e0..cdae36435e21 100644 --- a/arch/x86_64/ia32/Makefile +++ b/arch/x86_64/ia32/Makefile @@ -11,6 +11,9 @@ obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) obj-$(CONFIG_IA32_AOUT) += ia32_aout.o +audit-class-$(CONFIG_AUDIT) := audit.o +obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) + $(obj)/syscall32_syscall.o: \ $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so) @@ -20,6 +23,7 @@ targets := $(foreach F,sysenter syscall,vsyscall-$F.o vsyscall-$F.so) # The DSO images are built using a special linker script quiet_cmd_syscall = SYSCALL $@ cmd_syscall = $(CC) -m32 -nostdlib -shared -s \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) \ -Wl,-soname=linux-gate.so.1 -o $@ \ -Wl,-T,$(filter-out FORCE,$^) diff --git a/arch/x86_64/ia32/audit.c b/arch/x86_64/ia32/audit.c new file mode 100644 index 000000000000..92d7d0c8d93f --- /dev/null +++ b/arch/x86_64/ia32/audit.c @@ -0,0 +1,37 @@ +#include <asm-i386/unistd.h> + +unsigned ia32_dir_class[] = { +#include <asm-generic/audit_dir_write.h> +~0U +}; + +unsigned ia32_chattr_class[] = { +#include <asm-generic/audit_change_attr.h> +~0U +}; + +unsigned ia32_write_class[] = { +#include <asm-generic/audit_write.h> +~0U +}; + +unsigned ia32_read_class[] = { +#include <asm-generic/audit_read.h> +~0U +}; + +int ia32_classify_syscall(unsigned syscall) +{ + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_socketcall: + return 4; + case __NR_execve: + return 5; + default: + return 1; + } +} diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c index 1c23095f1813..2c8209a3605a 100644 --- a/arch/x86_64/ia32/fpu32.c +++ b/arch/x86_64/ia32/fpu32.c @@ -2,7 +2,6 @@ * Copyright 2002 Andi Kleen, SuSE Labs. * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes. * This is used for ptrace, signals and coredumps in 32bit emulation. - * $Id: fpu32.c,v 1.1 2002/03/21 14:16:32 ak Exp $ */ #include <linux/sched.h> diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index 3bf58af98936..396d3c100011 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ b/arch/x86_64/ia32/ia32_aout.c @@ -333,7 +333,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) return error; } - error = bprm->file->f_op->read(bprm->file, (char *)text_addr, + error = bprm->file->f_op->read(bprm->file, + (char __user *)text_addr, ex.a_text+ex.a_data, &pos); if ((signed long)error < 0) { send_sig(SIGKILL, current, 0); @@ -366,7 +367,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) down_write(¤t->mm->mmap_sem); do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); up_write(¤t->mm->mmap_sem); - bprm->file->f_op->read(bprm->file,(char *)N_TXTADDR(ex), + bprm->file->f_op->read(bprm->file, + (char __user *)N_TXTADDR(ex), ex.a_text+ex.a_data, &pos); flush_icache_range((unsigned long) N_TXTADDR(ex), (unsigned long) N_TXTADDR(ex) + @@ -477,7 +479,7 @@ static int load_aout_library(struct file *file) do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); up_write(¤t->mm->mmap_sem); - file->f_op->read(file, (char *)start_addr, + file->f_op->read(file, (char __user *)start_addr, ex.a_text + ex.a_data, &pos); flush_icache_range((unsigned long) start_addr, (unsigned long) start_addr + ex.a_text + ex.a_data); diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index 926c4743d13b..2fd5a67fd435 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -73,39 +73,44 @@ typedef elf_greg_t elf_gregset_t[ELF_NGREG]; * Dumping its extra ELF program headers includes all the other information * a debugger needs to easily find how the vsyscall DSO was being used. */ -#define ELF_CORE_EXTRA_PHDRS (VSYSCALL32_EHDR->e_phnum) +#define ELF_CORE_EXTRA_PHDRS (find_vma(current->mm, VSYSCALL32_BASE) ? \ + (VSYSCALL32_EHDR->e_phnum) : 0) #define ELF_CORE_WRITE_EXTRA_PHDRS \ do { \ - const struct elf32_phdr *const vsyscall_phdrs = \ - (const struct elf32_phdr *) (VSYSCALL32_BASE \ - + VSYSCALL32_EHDR->e_phoff); \ - int i; \ - Elf32_Off ofs = 0; \ - for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \ - struct elf32_phdr phdr = vsyscall_phdrs[i]; \ - if (phdr.p_type == PT_LOAD) { \ - BUG_ON(ofs != 0); \ - ofs = phdr.p_offset = offset; \ - phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \ - phdr.p_filesz = phdr.p_memsz; \ - offset += phdr.p_filesz; \ + if (find_vma(current->mm, VSYSCALL32_BASE)) { \ + const struct elf32_phdr *const vsyscall_phdrs = \ + (const struct elf32_phdr *) (VSYSCALL32_BASE \ + + VSYSCALL32_EHDR->e_phoff);\ + int i; \ + Elf32_Off ofs = 0; \ + for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \ + struct elf32_phdr phdr = vsyscall_phdrs[i]; \ + if (phdr.p_type == PT_LOAD) { \ + BUG_ON(ofs != 0); \ + ofs = phdr.p_offset = offset; \ + phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \ + phdr.p_filesz = phdr.p_memsz; \ + offset += phdr.p_filesz; \ + } \ + else \ + phdr.p_offset += ofs; \ + phdr.p_paddr = 0; /* match other core phdrs */ \ + DUMP_WRITE(&phdr, sizeof(phdr)); \ } \ - else \ - phdr.p_offset += ofs; \ - phdr.p_paddr = 0; /* match other core phdrs */ \ - DUMP_WRITE(&phdr, sizeof(phdr)); \ } \ } while (0) #define ELF_CORE_WRITE_EXTRA_DATA \ do { \ - const struct elf32_phdr *const vsyscall_phdrs = \ - (const struct elf32_phdr *) (VSYSCALL32_BASE \ - + VSYSCALL32_EHDR->e_phoff); \ - int i; \ - for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \ - if (vsyscall_phdrs[i].p_type == PT_LOAD) \ - DUMP_WRITE((void *) (u64) vsyscall_phdrs[i].p_vaddr, \ - PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \ + if (find_vma(current->mm, VSYSCALL32_BASE)) { \ + const struct elf32_phdr *const vsyscall_phdrs = \ + (const struct elf32_phdr *) (VSYSCALL32_BASE \ + + VSYSCALL32_EHDR->e_phoff); \ + int i; \ + for (i = 0; i < VSYSCALL32_EHDR->e_phnum; ++i) { \ + if (vsyscall_phdrs[i].p_type == PT_LOAD) \ + DUMP_WRITE((void *) (u64) vsyscall_phdrs[i].p_vaddr,\ + PAGE_ALIGN(vsyscall_phdrs[i].p_memsz)); \ + } \ } \ } while (0) @@ -182,7 +187,7 @@ struct elf_prpsinfo #define user user32 #define __ASM_X86_64_ELF_H 1 -#define elf_read_implies_exec(ex, have_pt_gnu_stack) (!(have_pt_gnu_stack)) +#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X) //#include <asm/ia32.h> #include <linux/elf.h> diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index e0a92439f634..a6ba9951e86c 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -6,8 +6,6 @@ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-12-* x86-64 compatibility mode signal handling by Andi Kleen - * - * $Id: ia32_signal.c,v 1.22 2002/07/29 10:34:03 ak Exp $ */ #include <linux/sched.h> @@ -115,25 +113,19 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) } asmlinkage long -sys32_sigsuspend(int history0, int history1, old_sigset_t mask, - struct pt_regs *regs) +sys32_sigsuspend(int history0, int history1, old_sigset_t mask) { - sigset_t saveset; - mask &= _BLOCKABLE; spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; + current->saved_sigmask = current->blocked; siginitset(¤t->blocked, mask); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_thread_flag(TIF_RESTORE_SIGMASK); + return -ERESTARTNOHAND; } asmlinkage long @@ -439,15 +431,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - { - struct exec_domain *ed = current_thread_info()->exec_domain; - err |= __put_user((ed - && ed->signal_invmap - && sig < 32 - ? ed->signal_invmap[sig] - : sig), - &frame->sig); - } + err |= __put_user(sig, &frame->sig); if (err) goto give_sigsegv; @@ -494,6 +478,11 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->rsp = (unsigned long) frame; regs->rip = (unsigned long) ka->sa.sa_handler; + /* Make -mregparm=3 work */ + regs->rax = sig; + regs->rdx = 0; + regs->rcx = 0; + asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); @@ -501,20 +490,20 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); + regs->eflags &= ~TF_MASK; + if (test_thread_flag(TIF_SINGLESTEP)) + ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -597,18 +586,18 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); + regs->eflags &= ~TF_MASK; + if (test_thread_flag(TIF_SINGLESTEP)) + ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 4ec594ab1a98..b4aa875e175b 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -13,6 +13,7 @@ #include <asm/thread_info.h> #include <asm/segment.h> #include <asm/vsyscall32.h> +#include <asm/irqflags.h> #include <linux/linkage.h> #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) @@ -70,11 +71,16 @@ */ ENTRY(ia32_sysenter_target) CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp swapgs movq %gs:pda_kernelstack, %rsp addq $(PDA_STACKOFFSET),%rsp + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs, here we enable it straight after entry: + */ sti movl %ebp,%ebp /* zero extension */ pushq $__USER32_DS @@ -98,7 +104,7 @@ ENTRY(ia32_sysenter_target) pushq %rax CFI_ADJUST_CFA_OFFSET 8 cld - SAVE_ARGS 0,0,1 + SAVE_ARGS 0,0,0 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ 1: movl (%rbp),%r9d @@ -118,6 +124,7 @@ sysenter_do_call: movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) jnz int_ret_from_sys_call andl $~TS_COMPAT,threadinfo_status(%r10) @@ -132,6 +139,7 @@ sysenter_do_call: CFI_REGISTER rsp,rcx movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ CFI_REGISTER rip,rdx + TRACE_IRQS_ON swapgs sti /* sti only takes effect after the next instruction */ /* sysexit */ @@ -155,6 +163,7 @@ sysenter_tracesys: .previous jmp sysenter_do_call CFI_ENDPROC +ENDPROC(ia32_sysenter_target) /* * 32bit SYSCALL instruction entry. @@ -178,13 +187,18 @@ sysenter_tracesys: */ ENTRY(ia32_cstar_target) CFI_STARTPROC32 simple - CFI_DEF_CFA rsp,0 + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ swapgs movl %esp,%r8d CFI_REGISTER rsp,r8 movq %gs:pda_kernelstack,%rsp + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ sti SAVE_ARGS 8,1,1 movl %eax,%eax /* zero extension */ @@ -219,6 +233,7 @@ cstar_do_call: movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli + TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) jnz int_ret_from_sys_call andl $~TS_COMPAT,threadinfo_status(%r10) @@ -227,6 +242,7 @@ cstar_do_call: CFI_REGISTER rip,rcx movl EFLAGS-ARGOFFSET(%rsp),%r11d /*CFI_REGISTER rflags,r11*/ + TRACE_IRQS_ON movl RSP-ARGOFFSET(%rsp),%esp CFI_RESTORE rsp swapgs @@ -249,6 +265,7 @@ cstar_tracesys: .quad 1b,ia32_badarg .previous jmp cstar_do_call +END(ia32_cstar_target) ia32_badarg: movq $-EFAULT,%rax @@ -278,13 +295,18 @@ ia32_badarg: ENTRY(ia32_syscall) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-RIP /*CFI_REL_OFFSET ss,SS-RIP*/ CFI_REL_OFFSET rsp,RSP-RIP /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ /*CFI_REL_OFFSET cs,CS-RIP*/ CFI_REL_OFFSET rip,RIP-RIP - swapgs + swapgs + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ sti movl %eax,%eax pushq %rax @@ -314,16 +336,13 @@ ia32_tracesys: LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST jmp ia32_do_syscall +END(ia32_syscall) ia32_badsys: movq $0,ORIG_RAX-ARGOFFSET(%rsp) movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call -ni_syscall: - movq %rax,%rdi - jmp sys32_ni_syscall - quiet_ni_syscall: movq $-ENOSYS,%rax ret @@ -354,6 +373,7 @@ ENTRY(ia32_ptregs_common) popq %r11 CFI_ENDPROC CFI_STARTPROC32 simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-ARGOFFSET CFI_REL_OFFSET rax,RAX-ARGOFFSET CFI_REL_OFFSET rcx,RCX-ARGOFFSET @@ -370,10 +390,10 @@ ENTRY(ia32_ptregs_common) RESTORE_REST jmp ia32_sysret /* misbalances the return cache */ CFI_ENDPROC +END(ia32_ptregs_common) .section .rodata,"a" .align 8 - .globl ia32_sys_call_table ia32_sys_call_table: .quad sys_restart_syscall .quad sys_exit @@ -687,8 +707,8 @@ ia32_sys_call_table: .quad sys_readlinkat /* 305 */ .quad sys_fchmodat .quad sys_faccessat - .quad quiet_ni_syscall /* pselect6 for now */ - .quad quiet_ni_syscall /* ppoll for now */ + .quad compat_sys_pselect6 + .quad compat_sys_ppoll .quad sys_unshare /* 310 */ .quad compat_sys_set_robust_list .quad compat_sys_get_robust_list @@ -697,4 +717,5 @@ ia32_sys_call_table: .quad sys_tee .quad compat_sys_vmsplice .quad compat_sys_move_pages + .quad sys_getcpu ia32_syscall_end: diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index 23a4515a73b4..d18198ed636b 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -7,8 +7,6 @@ * * This allows to access 64bit processes too; but there is no way to see the extended * register contents. - * - * $Id: ptrace32.c,v 1.16 2003/03/14 16:06:35 ak Exp $ */ #include <linux/kernel.h> @@ -27,6 +25,7 @@ #include <asm/debugreg.h> #include <asm/i387.h> #include <asm/fpu32.h> +#include <asm/ia32.h> /* * Determines which flags the user has access to [1 = access, 0 = no access]. @@ -118,6 +117,10 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 val) if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1) return -EIO; child->thread.debugreg7 = val; + if (val) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); break; default: @@ -199,6 +202,31 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val) #undef R32 +static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) +{ + int ret; + compat_siginfo_t *si32 = (compat_siginfo_t *)compat_ptr(data); + siginfo_t ssi; + siginfo_t *si = compat_alloc_user_space(sizeof(siginfo_t)); + if (request == PTRACE_SETSIGINFO) { + memset(&ssi, 0, sizeof(siginfo_t)); + ret = copy_siginfo_from_user32(&ssi, si32); + if (ret) + return ret; + if (copy_to_user(si, &ssi, sizeof(siginfo_t))) + return -EFAULT; + } + ret = sys_ptrace(request, pid, addr, (unsigned long)si); + if (ret) + return ret; + if (request == PTRACE_GETSIGINFO) { + if (copy_from_user(&ssi, si, sizeof(siginfo_t))) + return -EFAULT; + ret = copy_siginfo_to_user32(si32, &ssi); + } + return ret; +} + asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) { struct task_struct *child; @@ -208,9 +236,19 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) __u32 val; switch (request) { - default: + case PTRACE_TRACEME: + case PTRACE_ATTACH: + case PTRACE_KILL: + case PTRACE_CONT: + case PTRACE_SINGLESTEP: + case PTRACE_DETACH: + case PTRACE_SYSCALL: + case PTRACE_SETOPTIONS: return sys_ptrace(request, pid, addr, data); + default: + return -EINVAL; + case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: case PTRACE_POKEDATA: @@ -225,10 +263,11 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) case PTRACE_GETFPXREGS: case PTRACE_GETEVENTMSG: break; - } - if (request == PTRACE_TRACEME) - return ptrace_traceme(); + case PTRACE_SETSIGINFO: + case PTRACE_GETSIGINFO: + return ptrace32_siginfo(request, pid, addr, data); + } child = ptrace_get_task_struct(pid); if (IS_ERR(child)) @@ -336,8 +375,10 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) ret = -EIO; if (!access_ok(VERIFY_READ, u, sizeof(*u))) break; - /* no checking to be bug-to-bug compatible with i386 */ - __copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u)); + /* no checking to be bug-to-bug compatible with i386. */ + /* but silence warning */ + if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u))) + ; set_stopped_child_used_math(child); child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; ret = 0; @@ -349,8 +390,7 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) break; default: - ret = -EINVAL; - break; + BUG(); } out: diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index f182b20858e2..f280d3665f4b 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -20,7 +20,6 @@ * This should be fixed. */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/fs.h> @@ -61,6 +60,7 @@ #include <linux/highuid.h> #include <linux/vmalloc.h> #include <linux/fsnotify.h> +#include <linux/sysctl.h> #include <asm/mman.h> #include <asm/types.h> #include <asm/uaccess.h> @@ -390,7 +390,9 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, } } set_fs (KERNEL_DS); - ret = sys_rt_sigprocmask(how, set ? &s : NULL, oset ? &s : NULL, + ret = sys_rt_sigprocmask(how, + set ? (sigset_t __user *)&s : NULL, + oset ? (sigset_t __user *)&s : NULL, sigsetsize); set_fs (old_fs); if (ret) return ret; @@ -508,19 +510,6 @@ sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) return compat_sys_wait4(pid, stat_addr, options, NULL); } -int sys32_ni_syscall(int call) -{ - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - printk(KERN_INFO "IA32 syscall %d from %s not implemented\n", - call, me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - /* 32-bit timeval and related flotsam. */ asmlinkage long @@ -555,7 +544,7 @@ sys32_sysinfo(struct sysinfo32 __user *info) int bitcount = 0; set_fs (KERNEL_DS); - ret = sys_sysinfo(&s); + ret = sys_sysinfo((struct sysinfo __user *)&s); set_fs (old_fs); /* Check to see if any memory value is too large for 32-bit and scale @@ -603,7 +592,7 @@ sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *int mm_segment_t old_fs = get_fs (); set_fs (KERNEL_DS); - ret = sys_sched_rr_get_interval(pid, &t); + ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); set_fs (old_fs); if (put_compat_timespec(&t, interval)) return -EFAULT; @@ -619,7 +608,7 @@ sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) mm_segment_t old_fs = get_fs(); set_fs (KERNEL_DS); - ret = sys_rt_sigpending(&s, sigsetsize); + ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize); set_fs (old_fs); if (!ret) { switch (_NSIG_WORDS) { @@ -644,7 +633,7 @@ sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo) if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; set_fs (KERNEL_DS); - ret = sys_rt_sigqueueinfo(pid, sig, &info); + ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info); set_fs (old_fs); return ret; } @@ -659,7 +648,7 @@ sys32_pause(void) } -#ifdef CONFIG_SYSCTL +#ifdef CONFIG_SYSCTL_SYSCALL struct sysctl_ia32 { unsigned int name; int nlen; @@ -680,9 +669,6 @@ sys32_sysctl(struct sysctl_ia32 __user *args32) size_t oldlen; int __user *namep; long ret; - extern int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp, - void *newval, size_t newlen); - if (copy_from_user(&a32, args32, sizeof (a32))) return -EFAULT; @@ -706,7 +692,8 @@ sys32_sysctl(struct sysctl_ia32 __user *args32) set_fs(KERNEL_DS); lock_kernel(); - ret = do_sysctl(namep, a32.nlen, oldvalp, &oldlen, newvalp, (size_t) a32.newlen); + ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen, + newvalp, (size_t) a32.newlen); unlock_kernel(); set_fs(old_fs); @@ -757,7 +744,8 @@ sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) return -EFAULT; set_fs(KERNEL_DS); - ret = sys_sendfile(out_fd, in_fd, offset ? &of : NULL, count); + ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL, + count); set_fs(old_fs); if (offset && put_user(of, offset)) @@ -792,7 +780,7 @@ asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, asmlinkage long sys32_olduname(struct oldold_utsname __user * name) { - int error; + int err; if (!name) return -EFAULT; @@ -801,27 +789,31 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name) down_read(&uts_sem); - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); - __put_user(0,name->sysname+__OLD_UTS_LEN); - __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); - __put_user(0,name->nodename+__OLD_UTS_LEN); - __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); - __put_user(0,name->release+__OLD_UTS_LEN); - __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); - __put_user(0,name->version+__OLD_UTS_LEN); + err = __copy_to_user(&name->sysname,&system_utsname.sysname, + __OLD_UTS_LEN); + err |= __put_user(0,name->sysname+__OLD_UTS_LEN); + err |= __copy_to_user(&name->nodename,&system_utsname.nodename, + __OLD_UTS_LEN); + err |= __put_user(0,name->nodename+__OLD_UTS_LEN); + err |= __copy_to_user(&name->release,&system_utsname.release, + __OLD_UTS_LEN); + err |= __put_user(0,name->release+__OLD_UTS_LEN); + err |= __copy_to_user(&name->version,&system_utsname.version, + __OLD_UTS_LEN); + err |= __put_user(0,name->version+__OLD_UTS_LEN); { char *arch = "x86_64"; if (personality(current->personality) == PER_LINUX32) arch = "i686"; - __copy_to_user(&name->machine,arch,strlen(arch)+1); + err |= __copy_to_user(&name->machine,arch,strlen(arch)+1); } up_read(&uts_sem); - error = error ? -EFAULT : 0; + err = err ? -EFAULT : 0; - return error; + return err; } long sys32_uname(struct old_utsname __user * name) @@ -845,7 +837,7 @@ long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) seg = get_fs(); set_fs(KERNEL_DS); - ret = sys_ustat(dev,&u); + ret = sys_ustat(dev, (struct ustat __user *)&u); set_fs(seg); if (ret >= 0) { if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || @@ -916,7 +908,7 @@ long sys32_vm86_warning(void) struct task_struct *me = current; static char lastcomm[sizeof(me->comm)]; if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", + compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", me->comm); strncpy(lastcomm, me->comm, sizeof(lastcomm)); } @@ -929,13 +921,3 @@ long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len); } -static int __init ia32_init (void) -{ - printk("IA32 emulation $Id: sys_ia32.c,v 1.32 2002/03/24 13:02:28 ak Exp $\n"); - return 0; -} - -__initcall(ia32_init); - -extern unsigned long ia32_sys_call_table[]; -EXPORT_SYMBOL(ia32_sys_call_table); diff --git a/arch/x86_64/ia32/vsyscall.lds b/arch/x86_64/ia32/vsyscall.lds index f2e75ed4c6c7..1dc86ff5bcb9 100644 --- a/arch/x86_64/ia32/vsyscall.lds +++ b/arch/x86_64/ia32/vsyscall.lds @@ -11,6 +11,7 @@ SECTIONS . = VSYSCALL_BASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 059c88313f4e..3c7cbff04d3d 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,9 +8,10 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o + pci-dma.o pci-nommu.o alternative.o -obj-$(CONFIG_X86_MCE) += mce.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ @@ -19,8 +20,8 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \ +obj-y += apic.o nmi.o +obj-y += io_apic.o mpparse.o \ genapic.o genapic_cluster.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o @@ -28,19 +29,24 @@ obj-$(CONFIG_PM) += suspend.o obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o +obj-$(CONFIG_IOMMU) += pci-gart.o aperture.o +obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary.o tce.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o obj-$(CONFIG_X86_VSMP) += vsmp.o +obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_PCI) += early-quirks.o obj-y += topology.o obj-y += intel_cacheinfo.o CFLAGS_vsyscall.o := $(PROFILING) -g0 +therm_throt-y += ../../i386/kernel/cpu/mcheck/therm_throt.o bootflag-y += ../../i386/kernel/bootflag.o cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o topology-y += ../../i386/kernel/topology.o @@ -49,3 +55,4 @@ intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o +alternative-y += ../../i386/kernel/alternative.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 091bc79c888f..5ebf62c7a3d2 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -26,7 +26,6 @@ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/types.h> diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 70b9d21ed675..b487396c4c5b 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -8,9 +8,7 @@ * because only the bootmem allocator can allocate 32+MB. * * Copyright 2002 Andi Kleen, SuSE Labs. - * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $ */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> @@ -19,11 +17,13 @@ #include <linux/pci_ids.h> #include <linux/pci.h> #include <linux/bitops.h> +#include <linux/ioport.h> #include <asm/e820.h> #include <asm/io.h> #include <asm/proto.h> #include <asm/pci-direct.h> #include <asm/dma.h> +#include <asm/k8.h> int iommu_aperture; int iommu_aperture_disabled __initdata = 0; @@ -34,11 +34,21 @@ int fallback_aper_force __initdata = 0; int fix_aperture __initdata = 1; +static struct resource gart_resource = { + .name = "GART", + .flags = IORESOURCE_MEM, +}; + +static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) +{ + gart_resource.start = aper_base; + gart_resource.end = aper_base + aper_size - 1; + insert_resource(&iomem_resource, &gart_resource); +} + /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ -#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) - static u32 __init allocate_aperture(void) { pg_data_t *nd0 = NODE_DATA(0); @@ -51,7 +61,7 @@ static u32 __init allocate_aperture(void) /* * Aperture has to be naturally aligned. This means an 2GB aperture won't - * have much chances to find a place in the lower 4GB of memory. + * have much chance of finding a place in the lower 4GB of memory. * Unfortunately we cannot move it up because that would make the * IOMMU useless. */ @@ -65,23 +75,24 @@ static u32 __init allocate_aperture(void) } printk("Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, __pa(p)); + insert_aperture_resource((u32)__pa(p), aper_size); return (u32)__pa(p); } -static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size) +static int __init aperture_valid(u64 aper_base, u32 aper_size) { if (!aper_base) return 0; if (aper_size < 64*1024*1024) { - printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20); + printk("Aperture too small (%d MB)\n", aper_size>>20); return 0; } if (aper_base + aper_size >= 0xffffffff) { - printk("Aperture from %s beyond 4GB. Ignoring.\n",name); + printk("Aperture beyond 4GB. Ignoring.\n"); return 0; } if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); + printk("Aperture pointing to e820 RAM. Ignoring.\n"); return 0; } return 1; @@ -140,7 +151,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", aper, 32 << *order, apsizereg); - if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order)) + if (!aperture_valid(aper, (32*1024*1024) << *order)) return 0; return (u32)aper; } @@ -201,17 +212,17 @@ void __init iommu_hole_init(void) u64 aper_base, last_aper_base = 0; int valid_agp = 0; - if (iommu_aperture_disabled || !fix_aperture) + if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) return; printk("Checking aperture...\n"); fix = 0; for (num = 24; num < 32; num++) { - char name[30]; - if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) - continue; + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + iommu_detected = 1; iommu_aperture = 1; aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; @@ -222,9 +233,7 @@ void __init iommu_hole_init(void) printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, aper_base, aper_size>>20); - sprintf(name, "northbridge cpu %d", num-24); - - if (!aperture_valid(name, aper_base, aper_size)) { + if (!aperture_valid(aper_base, aper_size)) { fix = 1; break; } @@ -238,8 +247,13 @@ void __init iommu_hole_init(void) last_aper_base = aper_base; } - if (!fix && !fallback_aper_force) + if (!fix && !fallback_aper_force) { + if (last_aper_base) { + unsigned long n = (32 * 1024 * 1024) << last_aper_order; + insert_aperture_resource((u32)last_aper_base, n); + } return; + } if (!fallback_aper_force) aper_alloc = search_agp_bridge(&aper_order, &valid_agp); @@ -273,7 +287,7 @@ void __init iommu_hole_init(void) /* Fix up the north bridges */ for (num = 24; num < 32; num++) { - if (read_pci_config(0, num, 3, 0x00) != NB_ID_3) + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) continue; /* Don't enable translation yet. That is done later. diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 29ef99001e05..135ff25e6b44 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -14,7 +14,6 @@ * Mikael Pettersson : PM converted to driver model. */ -#include <linux/config.h> #include <linux/init.h> #include <linux/mm.h> @@ -26,6 +25,7 @@ #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/module.h> +#include <linux/ioport.h> #include <asm/atomic.h> #include <asm/smp.h> @@ -37,13 +37,20 @@ #include <asm/idle.h> #include <asm/proto.h> #include <asm/timex.h> +#include <asm/apic.h> +int apic_mapped; int apic_verbosity; int apic_runs_main_timer; int apic_calibrate_pmtmr __initdata; int disable_apic_timer __initdata; +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + /* * cpu_mask that denotes the CPUs that needs timer interrupt coming in as * IPIs in place of local APIC timers @@ -100,7 +107,7 @@ void clear_local_APIC(void) maxlvt = get_maxlvt(); /* - * Masking an LVT entry on a P6 can trigger a local APIC error + * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. */ if (maxlvt >= 3) { @@ -137,72 +144,40 @@ void clear_local_APIC(void) apic_read(APIC_ESR); } -void __init connect_bsp_APIC(void) -{ - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. - * connect BSP's local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } -} - void disconnect_bsp_APIC(int virt_wire_setup) { - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect - * only on certain older boards). Note that APIC - * interrupts, including IPIs, won't work beyond - * this point! The only exception are INIT IPIs. - */ - apic_printk(APIC_QUIET, "disabling APIC mode, entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } - else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); } void disable_local_APIC(void) @@ -298,8 +273,6 @@ void __init sync_Arb_IDs(void) | APIC_DM_INIT); } -extern void __error_in_apic_c (void); - /* * An initial setup of the virtual wire mode. */ @@ -346,8 +319,7 @@ void __cpuinit setup_local_APIC (void) value = apic_read(APIC_LVR); - if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) - __error_in_apic_c(); + BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); /* * Double-check whether this APIC is really registered. @@ -400,32 +372,8 @@ void __cpuinit setup_local_APIC (void) */ value |= APIC_SPIV_APIC_ENABLED; - /* - * Some unknown Intel IO/APIC (or APIC) errata is biting us with - * certain networking cards. If high frequency interrupts are - * happening on a particular IOAPIC pin, plus the IOAPIC routing - * entry is masked/unmasked at a high rate as well then sooner or - * later IOAPIC line gets 'stuck', no more interrupts are received - * from the device. If focus CPU is disabled then the hang goes - * away, oh well :-( - * - * [ This bug can be reproduced easily with a level-triggered - * PCI Ne2000 networking cards and PII/PIII processors, dual - * BX chipset. ] - */ - /* - * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more - * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). --macro - */ -#if 1 - /* Enable focus processor (bit==0) */ - value &= ~APIC_SPIV_FOCUS_DISABLED; -#else - /* Disable focus processor (bit==1) */ - value |= APIC_SPIV_FOCUS_DISABLED; -#endif + /* We always use processor focus */ + /* * Set spurious IRQ vector */ @@ -443,7 +391,7 @@ void __cpuinit setup_local_APIC (void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && (pic_mode || !value)) { + if (!smp_processor_id() && !value) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", smp_processor_id()); } else { @@ -480,8 +428,7 @@ void __cpuinit setup_local_APIC (void) } nmi_watchdog_default(); - if (nmi_watchdog == NMI_LOCAL_APIC) - setup_apic_nmi_watchdog(); + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } @@ -528,8 +475,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); return 0; @@ -607,18 +553,24 @@ static void apic_pm_activate(void) { } static int __init apic_set_verbosity(char *str) { + if (str == NULL) { + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; + } if (strcmp("debug", str) == 0) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", str) == 0) apic_verbosity = APIC_VERBOSE; - else + else { printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug", str); + " use apic=verbose or apic=debug\n", str); + return -EINVAL; + } - return 1; + return 0; } - -__setup("apic=", apic_set_verbosity); +early_param("apic", apic_set_verbosity); /* * Detect and enable local APICs on non-SMP boards. @@ -639,6 +591,40 @@ static int __init detect_init_APIC (void) return 0; } +#ifdef CONFIG_X86_IO_APIC +static struct resource * __init ioapic_setup_resources(void) +{ +#define IOAPIC_RESOURCE_NAME_SIZE 11 + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + res = alloc_bootmem(n); + + if (!res) + return NULL; + + memset(res, 0, n); + mem = (void *)&res[nr_ioapics]; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + + return res; +} +#endif + void __init init_apic_mappings(void) { unsigned long apic_phys; @@ -655,19 +641,26 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_mapped = 1; apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); -#ifdef CONFIG_X86_IO_APIC { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; + struct resource *ioapic_res; + ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mpc_apicaddr; @@ -679,9 +672,15 @@ void __init init_apic_mappings(void) apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", __fix_to_virt(idx), ioapic_phys); idx++; + + if (ioapic_res) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + insert_resource(&iomem_resource, ioapic_res); + ioapic_res++; + } } } -#endif } /* @@ -851,7 +850,18 @@ void disable_APIC_timer(void) unsigned long v; v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v | APIC_LVT_MASKED); + /* + * When an illegal vector value (0-15) is written to an LVT + * entry and delivery mode is Fixed, the APIC may signal an + * illegal vector error, with out regard to whether the mask + * bit is set or whether an interrupt is actually seen on input. + * + * Boot sequence might call this function when the LVTT has + * '0' vector value. So make sure vector field is set to + * valid value. + */ + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); } } @@ -909,15 +919,13 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } -#ifdef CONFIG_X86_MCE_AMD -void setup_threshold_lvt(unsigned long lvt_off) +void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, + unsigned char msg_type, unsigned char mask) { - unsigned int v = 0; - unsigned long reg = (lvt_off << 4) + 0x500; - v |= THRESHOLD_APIC_VECTOR; + unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; apic_write(reg, v); } -#endif /* CONFIG_X86_MCE_AMD */ #undef APIC_DIVISOR @@ -943,7 +951,7 @@ void smp_local_timer_interrupt(struct pt_regs *regs) * We take the 'long' return path, and there every subsystem * grabs the appropriate locks (kernel lock/ irq lock). * - * we might want to decouple profiling from the 'long path', + * We might want to decouple profiling from the 'long path', * and do the profiling totally in assembly. * * Currently this isn't too much of an issue (performance wise), @@ -983,7 +991,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) } /* - * oem_force_hpet_timer -- force HPET mode for some boxes. + * apic_is_clustered_box() -- Check if we can expect good TSC * * Thus far, the major user of this is IBM's Summit2 series: * @@ -991,7 +999,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * multi-chassis. Use available data to take a good guess. * If in doubt, go HPET. */ -__cpuinit int oem_force_hpet_timer(void) +__cpuinit int apic_is_clustered_box(void) { int i, clusters, zeros; unsigned id; @@ -1022,8 +1030,7 @@ __cpuinit int oem_force_hpet_timer(void) } /* - * If clusters > 2, then should be multi-chassis. Return 1 for HPET. - * Else return 0 to use TSC. + * If clusters > 2, then should be multi-chassis. * May have to revisit this when multi-core + hyperthreaded CPUs come * out, but AFAIK this will work even for them. */ @@ -1116,19 +1123,15 @@ int __init APIC_init_uniprocessor (void) verify_local_APIC(); - connect_bsp_APIC(); - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); setup_local_APIC(); -#ifdef CONFIG_X86_IO_APIC if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); + setup_IO_APIC(); else nr_ioapics = 0; -#endif setup_boot_APIC_clock(); check_nmi_watchdog(); return 0; @@ -1137,14 +1140,17 @@ int __init APIC_init_uniprocessor (void) static __init int setup_disableapic(char *str) { disable_apic = 1; - return 1; -} + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; +} +early_param("disableapic", setup_disableapic); +/* same as disableapic, for compatibility */ static __init int setup_nolapic(char *str) { - disable_apic = 1; - return 1; + return setup_disableapic(str); } +early_param("nolapic", setup_nolapic); static __init int setup_noapictimer(char *str) { @@ -1177,11 +1183,5 @@ static __init int setup_apicpmtimer(char *s) } __setup("apicpmtimer", setup_apicpmtimer); -/* dummy parsing: see setup.c */ - -__setup("disableapic", setup_disableapic); -__setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */ - __setup("noapictimer", setup_noapictimer); -/* no "lapic" flag - we only use the lapic when the BIOS tells us so. */ diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c index 38834bbbae11..96687e2beb2c 100644 --- a/arch/x86_64/kernel/asm-offsets.c +++ b/arch/x86_64/kernel/asm-offsets.c @@ -4,6 +4,7 @@ * and format the required data. */ +#include <linux/crypto.h> #include <linux/sched.h> #include <linux/stddef.h> #include <linux/errno.h> @@ -68,5 +69,7 @@ int main(void) DEFINE(pbe_next, offsetof(struct pbe, next)); BLANK(); DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); + BLANK(); + DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); return 0; } diff --git a/arch/x86_64/kernel/audit.c b/arch/x86_64/kernel/audit.c new file mode 100644 index 000000000000..21f33387bef3 --- /dev/null +++ b/arch/x86_64/kernel/audit.c @@ -0,0 +1,64 @@ +#include <linux/init.h> +#include <linux/types.h> +#include <linux/audit.h> +#include <asm/unistd.h> + +static unsigned dir_class[] = { +#include <asm-generic/audit_dir_write.h> +~0U +}; + +static unsigned read_class[] = { +#include <asm-generic/audit_read.h> +~0U +}; + +static unsigned write_class[] = { +#include <asm-generic/audit_write.h> +~0U +}; + +static unsigned chattr_class[] = { +#include <asm-generic/audit_change_attr.h> +~0U +}; + +int audit_classify_syscall(int abi, unsigned syscall) +{ +#ifdef CONFIG_IA32_EMULATION + extern int ia32_classify_syscall(unsigned); + if (abi == AUDIT_ARCH_I386) + return ia32_classify_syscall(syscall); +#endif + switch(syscall) { + case __NR_open: + return 2; + case __NR_openat: + return 3; + case __NR_execve: + return 5; + default: + return 0; + } +} + +static int __init audit_classes_init(void) +{ +#ifdef CONFIG_IA32_EMULATION + extern __u32 ia32_dir_class[]; + extern __u32 ia32_write_class[]; + extern __u32 ia32_read_class[]; + extern __u32 ia32_chattr_class[]; + audit_register_class(AUDIT_CLASS_WRITE_32, ia32_write_class); + audit_register_class(AUDIT_CLASS_READ_32, ia32_read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE_32, ia32_dir_class); + audit_register_class(AUDIT_CLASS_CHATTR_32, ia32_chattr_class); +#endif + audit_register_class(AUDIT_CLASS_WRITE, write_class); + audit_register_class(AUDIT_CLASS_READ, read_class); + audit_register_class(AUDIT_CLASS_DIR_WRITE, dir_class); + audit_register_class(AUDIT_CLASS_CHATTR, chattr_class); + return 0; +} + +__initcall(audit_classes_init); diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 4e6c3b729e39..3525f884af82 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -23,6 +23,7 @@ #include <asm/nmi.h> #include <asm/hw_irq.h> #include <asm/mach_apic.h> +#include <asm/kdebug.h> /* This keeps a track of which one is crashing cpu. */ static int crashing_cpu; @@ -68,7 +69,7 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu) * for the data I pass, and I need tags * on the data to indicate what information I have * squirrelled away. ELF notes happen to provide - * all of that that no need to invent something new. + * all of that, no need to invent something new. */ buf = (u32*)per_cpu_ptr(crash_notes, cpu); @@ -95,15 +96,25 @@ static void crash_save_self(struct pt_regs *regs) #ifdef CONFIG_SMP static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct pt_regs *regs, int cpu) +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) { + struct pt_regs *regs; + int cpu; + + if (val != DIE_NMI_IPI) + return NOTIFY_OK; + + regs = ((struct die_args *)data)->regs; + cpu = raw_smp_processor_id(); + /* * Don't do anything if this handler is invoked on crashing cpu. * Otherwise, system will completely hang. Crashing cpu can get * an NMI if system was initially booted with nmi_watchdog parameter. */ if (cpu == crashing_cpu) - return 1; + return NOTIFY_STOP; local_irq_disable(); crash_save_this_cpu(regs, cpu); @@ -111,14 +122,14 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu) atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ for(;;) - asm("hlt"); + halt(); return 1; } static void smp_send_nmi_allbutself(void) { - send_IPI_allbutself(APIC_DM_NMI); + send_IPI_allbutself(NMI_VECTOR); } /* @@ -127,12 +138,17 @@ static void smp_send_nmi_allbutself(void) * cpu hotplug shouldn't matter. */ +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + static void nmi_shootdown_cpus(void) { unsigned long msecs; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - set_nmi_callback(crash_nmi_callback); + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ /* * Ensure the new callback function is set before sending @@ -161,7 +177,7 @@ void machine_crash_shutdown(struct pt_regs *regs) { /* * This function is only called after the system - * has paniced or is otherwise in a critical state. + * has panicked or is otherwise in a critical state. * The minimum amount of code to allow a kexec'd kernel * to run successfully needs to happen here. * @@ -178,9 +194,7 @@ void machine_crash_shutdown(struct pt_regs *regs) if(cpu_has_apic) disable_local_APIC(); -#if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); -#endif crash_save_self(regs); } diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 1ef6028f721e..b3f0908668ec 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -1,7 +1,6 @@ /* * Handle the memory map. * The functions here do the job until bootmem takes over. - * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $ * * Getting sanitize_e820_map() in sync with i386 version by applying change: * - Provisions for empty E820 memory regions (reported by certain BIOSes). @@ -9,7 +8,6 @@ * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> * */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> @@ -18,13 +16,17 @@ #include <linux/string.h> #include <linux/kexec.h> #include <linux/module.h> +#include <linux/mm.h> +#include <asm/pgtable.h> #include <asm/page.h> #include <asm/e820.h> #include <asm/proto.h> #include <asm/bootsetup.h> #include <asm/sections.h> +struct e820map e820 __initdata; + /* * PFN of last memory page. */ @@ -41,7 +43,7 @@ unsigned long end_pfn_map; /* * Last pfn which the user wants to use. */ -unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; +static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; extern struct resource code_resource, data_resource; @@ -70,9 +72,8 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) return 1; } #endif - /* kernel code + 640k memory hole (later should not be needed, but - be paranoid for now) */ - if (last >= 640*1024 && addr < __pa_symbol(&_end)) { + /* kernel code */ + if (last >= __pa_symbol(&_text) && last < __pa_symbol(&_end)) { *addrp = __pa_symbol(&_end); return 1; } @@ -161,59 +162,14 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsi return -1UL; } -/* - * Free bootmem based on the e820 table for a node. - */ -void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr >= end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr && last-addr >= PAGE_SIZE) - free_bootmem_node(pgdat, addr, last-addr); - } -} - /* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) { - int i; unsigned long end_pfn = 0; + end_pfn = find_max_pfn_with_active_regions(); - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long start, end; - - start = round_up(ei->addr, PAGE_SIZE); - end = round_down(ei->addr + ei->size, PAGE_SIZE); - if (start >= end) - continue; - if (ei->type == E820_RAM) { - if (end > end_pfn<<PAGE_SHIFT) - end_pfn = end>>PAGE_SHIFT; - } else { - if (end > end_pfn_map<<PAGE_SHIFT) - end_pfn_map = end>>PAGE_SHIFT; - } - } - if (end_pfn > end_pfn_map) end_pfn_map = end_pfn; if (end_pfn_map > MAXMEM>>PAGE_SHIFT) @@ -223,43 +179,10 @@ unsigned long __init e820_end_of_ram(void) if (end_pfn > end_pfn_map) end_pfn = end_pfn_map; + printk("end_pfn_map = %lu\n", end_pfn_map); return end_pfn; } -/* - * Compute how much memory is missing in a range. - * Unlike the other functions in this file the arguments are in page numbers. - */ -unsigned long __init -e820_hole_size(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long ram = 0; - unsigned long start = start_pfn << PAGE_SHIFT; - unsigned long end = end_pfn << PAGE_SHIFT; - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr >= end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr) - ram += last - addr; - } - return ((end - start) - ram) >> PAGE_SHIFT; -} - /* * Mark e820 reserved areas as busy for the resource manager. */ @@ -294,6 +217,96 @@ void __init e820_reserve_resources(void) } } +/* Mark pages corresponding to given address range as nosave */ +static void __init +e820_mark_nosave_range(unsigned long start, unsigned long end) +{ + unsigned long pfn, max_pfn; + + if (start >= end) + return; + + printk("Nosave address range: %016lx - %016lx\n", start, end); + max_pfn = end >> PAGE_SHIFT; + for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++) + if (pfn_valid(pfn)) + SetPageNosave(pfn_to_page(pfn)); +} + +/* + * Find the ranges of physical addresses that do not correspond to + * e820 RAM areas and mark the corresponding pages as nosave for software + * suspend and suspend to RAM. + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(void) +{ + int i; + unsigned long paddr; + + paddr = round_down(e820.map[0].addr + e820.map[0].size, PAGE_SIZE); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (paddr < ei->addr) + e820_mark_nosave_range(paddr, + round_up(ei->addr, PAGE_SIZE)); + + paddr = round_down(ei->addr + ei->size, PAGE_SIZE); + if (ei->type != E820_RAM) + e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE), + paddr); + + if (paddr >= (end_pfn << PAGE_SHIFT)) + break; + } +} + +/* Walk the e820 map and register active regions within a node */ +void __init +e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i; + unsigned long ei_startpfn, ei_endpfn; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; + ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) + >> PAGE_SHIFT; + + /* Skip map entries smaller than a page */ + if (ei_startpfn > ei_endpfn) + continue; + + /* Check if end_pfn_map should be updated */ + if (ei->type != E820_RAM && ei_endpfn > end_pfn_map) + end_pfn_map = ei_endpfn; + + /* Skip if map is outside the node */ + if (ei->type != E820_RAM || + ei_endpfn <= start_pfn || + ei_startpfn >= end_pfn) + continue; + + /* Check for overlaps */ + if (ei_startpfn < start_pfn) + ei_startpfn = start_pfn; + if (ei_endpfn > end_pfn) + ei_endpfn = end_pfn; + + /* Obey end_user_pfn to save on memmap */ + if (ei_startpfn >= end_user_pfn) + continue; + if (ei_endpfn > end_user_pfn) + ei_endpfn = end_user_pfn; + + add_active_range(nid, ei_startpfn, ei_endpfn); + } +} + /* * Add a memory region to the kernel e820 map. */ @@ -514,13 +527,6 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) * If we're lucky and live on a modern system, the setup code * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) */ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) { @@ -538,34 +544,19 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) if (start > end) return -1; - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. - * - * This should be removed on Hammer which is supposed to not - * have non e820 covered ISA mappings there, but I had some strange - * problems so it stays for now. -AK - */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } - add_memory_region(start, size, type); } while (biosmap++,--nr_map); return 0; } -void __init setup_memory_region(void) +void early_panic(char *msg) { - char *who = "BIOS-e820"; + early_printk(msg); + panic(msg); +} +void __init setup_memory_region(void) +{ /* * Try to copy the BIOS-supplied E820-map. * @@ -573,54 +564,74 @@ void __init setup_memory_region(void) * the next section from 1mb->appropriate_mem_k */ sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { - unsigned long mem_size; - - /* compare results from other methods and take the greater */ - if (ALT_MEM_K < EXT_MEM_K) { - mem_size = EXT_MEM_K; - who = "BIOS-88"; - } else { - mem_size = ALT_MEM_K; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } + if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) + early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(who); + e820_print_map("BIOS-e820"); } -void __init parse_memopt(char *p, char **from) -{ - end_user_pfn = memparse(p, from); +static int __init parse_memopt(char *p) +{ + if (!p) + return -EINVAL; + end_user_pfn = memparse(p, &p); end_user_pfn >>= PAGE_SHIFT; + return 0; } +early_param("mem", parse_memopt); + +static int userdef __initdata; -void __init parse_memmapopt(char *p, char **from) +static int __init parse_memmap_opt(char *p) { + char *oldp; unsigned long long start_at, mem_size; - mem_size = memparse(p, from); - p = *from; + if (!strcmp(p, "exactmap")) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + saved_max_pfn = e820_end_of_ram(); +#endif + end_pfn_map = 0; + e820.nr_map = 0; + userdef = 1; + return 0; + } + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; if (*p == '@') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RAM); } else if (*p == '#') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_ACPI); } else if (*p == '$') { - start_at = memparse(p+1, from); + start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RESERVED); } else { end_user_pfn = (mem_size >> PAGE_SHIFT); } - p = *from; + return *p == '\0' ? 0 : -EINVAL; +} +early_param("memmap", parse_memmap_opt); + +void finish_e820_parsing(void) +{ + if (userdef) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + e820_print_map("user"); + } } unsigned long pci_mem_start = 0xaeedbabe; +EXPORT_SYMBOL(pci_mem_start); /* * Search for the biggest gap in the low 32 bits of the e820 diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c new file mode 100644 index 000000000000..208e38a372c1 --- /dev/null +++ b/arch/x86_64/kernel/early-quirks.c @@ -0,0 +1,122 @@ +/* Various workarounds for chipset bugs. + This code runs very early and can't use the regular PCI subsystem + The entries are keyed to PCI bridges which usually identify chipsets + uniquely. + This is only for whole classes of chipsets with specific problems which + need early invasive action (e.g. before the timers are initialized). + Most PCI device specific workarounds can be done later and should be + in standard PCI quirks + Mainboard specific bugs should be handled by DMI entries. + CPU specific bugs in setup.c */ + +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/pci_ids.h> +#include <asm/pci-direct.h> +#include <asm/proto.h> +#include <asm/dma.h> + +static void via_bugs(void) +{ +#ifdef CONFIG_IOMMU + if ((end_pfn > MAX_DMA32_PFN || force_iommu) && + !iommu_aperture_allowed) { + printk(KERN_INFO + "Looks like a VIA chipset. Disabling IOMMU. Override with iommu=allowed\n"); + iommu_aperture_disabled = 1; + } +#endif +} + +#ifdef CONFIG_ACPI + +static int nvidia_hpet_detected __initdata; + +static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) +{ + nvidia_hpet_detected = 1; + return 0; +} +#endif + +static void nvidia_bugs(void) +{ +#ifdef CONFIG_ACPI + /* + * All timer overrides on Nvidia are + * wrong unless HPET is enabled. + */ + nvidia_hpet_detected = 0; + acpi_table_parse(ACPI_HPET, nvidia_hpet_check); + if (nvidia_hpet_detected == 0) { + acpi_skip_timer_override = 1; + printk(KERN_INFO "Nvidia board " + "detected. Ignoring ACPI " + "timer override.\n"); + } +#endif + /* RED-PEN skip them on mptables too? */ + +} + +static void ati_bugs(void) +{ +#if 1 /* for testing */ + printk("ATI board detected\n"); +#endif + /* No bugs right now */ +} + +struct chipset { + u16 vendor; + void (*f)(void); +}; + +static struct chipset early_qrk[] = { + { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, via_bugs }, + { PCI_VENDOR_ID_ATI, ati_bugs }, + {} +}; + +void __init early_quirks(void) +{ + int num, slot, func; + + if (!early_pci_allowed()) + return; + + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u32 vendor; + u8 type; + int i; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + continue; + + vendor = read_pci_config(num, slot, func, + PCI_VENDOR_ID); + vendor &= 0xffff; + + for (i = 0; early_qrk[i].f; i++) + if (early_qrk[i].vendor == vendor) { + early_qrk[i].f(); + return; + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } +} diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index b93ef5b51980..e22ecd54870d 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -2,7 +2,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/string.h> -#include <linux/tty.h> +#include <linux/screen_info.h> #include <asm/io.h> #include <asm/processor.h> #include <asm/fcntl.h> @@ -215,20 +215,16 @@ void early_printk(const char *fmt, ...) static int __initdata keep_early; -int __init setup_early_printk(char *opt) +static int __init setup_early_printk(char *buf) { - char *space; - char buf[256]; + if (!buf) + return 0; if (early_console_initialized) - return 1; - - strlcpy(buf,opt,sizeof(buf)); - space = strchr(buf, ' '); - if (space) - *space = 0; + return 0; + early_console_initialized = 1; - if (strstr(buf,"keep")) + if (!strcmp(buf,"keep")) keep_early = 1; if (!strncmp(buf, "serial", 6)) { @@ -248,11 +244,12 @@ int __init setup_early_printk(char *opt) early_console = &simnow_console; keep_early = 1; } - early_console_initialized = 1; register_console(early_console); return 0; } +early_param("earlyprintk", setup_early_printk); + void __init disable_early_printk(void) { if (!early_console_initialized || !early_console) @@ -266,4 +263,3 @@ void __init disable_early_printk(void) } } -__setup("earlyprintk=", setup_early_printk); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 586b34c00c48..2802524104f3 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -4,8 +4,6 @@ * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> - * - * $Id$ */ /* @@ -22,16 +20,25 @@ * at the top of the kernel process stack. * - partial stack frame: partially saved registers upto R11. * - full stack frame: Like partial stack frame, but all register saved. - * - * TODO: - * - schedule it carefully for the final hardware. + * + * Some macro usage: + * - CFI macros are used to generate dwarf2 unwind information for better + * backtraces. They don't change any code. + * - SAVE_ALL/RESTORE_ALL - Save/restore all registers + * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. + * There are unfortunately lots of special cases where some registers + * not touched. The macro is a big mess that should be cleaned up. + * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. + * Gives a full stack frame. + * - ENTRY/END Define functions in the symbol table. + * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack + * frame that is otherwise undefined after a SYSCALL + * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. + * - errorentry/paranoidentry/zeroentry - Define exception entry points. */ -#define ASSEMBLY 1 -#include <linux/config.h> #include <linux/linkage.h> #include <asm/segment.h> -#include <asm/smp.h> #include <asm/cache.h> #include <asm/errno.h> #include <asm/dwarf2.h> @@ -42,13 +49,24 @@ #include <asm/thread_info.h> #include <asm/hw_irq.h> #include <asm/page.h> +#include <asm/irqflags.h> .code64 #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif - + + +.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +#ifdef CONFIG_TRACE_IRQFLAGS + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON +1: +#endif +.endm + /* * C code is not supposed to know about undefined top of stack. Every time * a C function with an pt_regs argument is called from the SYSCALL based @@ -105,6 +123,7 @@ .macro CFI_DEFAULT_STACK start=1 .if \start CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8 .else CFI_DEF_CFA_OFFSET SS+8 @@ -136,6 +155,10 @@ /* rdi: prev */ ENTRY(ret_from_fork) CFI_DEFAULT_STACK + push kernel_eflags(%rip) + CFI_ADJUST_CFA_OFFSET 4 + popf # reset kernel eflags + CFI_ADJUST_CFA_OFFSET -4 call schedule_tail GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) @@ -154,6 +177,7 @@ rff_trace: GET_THREAD_INFO(%rcx) jmp rff_action CFI_ENDPROC +END(ret_from_fork) /* * System call entry. Upto 6 arguments in registers are supported. @@ -188,12 +212,17 @@ rff_trace: ENTRY(system_call) CFI_STARTPROC simple - CFI_DEF_CFA rsp,0 + CFI_SIGNAL_FRAME + CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ swapgs movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp + /* + * No need to follow this irqs off/on section - it's straight + * and short: + */ sti SAVE_ARGS 8,1 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) @@ -219,10 +248,15 @@ ret_from_sys_call: sysret_check: GET_THREAD_INFO(%rcx) cli + TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx CFI_REMEMBER_STATE jnz sysret_careful + /* + * sysretq will re-enable interrupts: + */ + TRACE_IRQS_ON movq RIP-ARGOFFSET(%rsp),%rcx CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 @@ -237,6 +271,7 @@ sysret_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc sysret_signal + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -247,6 +282,7 @@ sysret_careful: /* Handle a signal */ sysret_signal: + TRACE_IRQS_ON sti testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz 1f @@ -261,6 +297,7 @@ sysret_signal: /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ cli + TRACE_IRQS_OFF jmp int_with_check badsys: @@ -285,6 +322,7 @@ tracesys: /* Use IRET because user could have changed frame */ jmp int_ret_from_sys_call CFI_ENDPROC +END(system_call) /* * Syscall return path ending with IRET. @@ -292,6 +330,7 @@ tracesys: */ ENTRY(int_ret_from_sys_call) CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-ARGOFFSET /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ CFI_REL_OFFSET rsp,RSP-ARGOFFSET @@ -308,6 +347,7 @@ ENTRY(int_ret_from_sys_call) CFI_REL_OFFSET r10,R10-ARGOFFSET CFI_REL_OFFSET r11,R11-ARGOFFSET cli + TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args movl $_TIF_ALLWORK_MASK,%edi @@ -326,6 +366,7 @@ int_with_check: int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -333,10 +374,12 @@ int_careful: popq %rdi CFI_ADJUST_CFA_OFFSET -8 cli + TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: + TRACE_IRQS_ON sti SAVE_REST /* Check for syscall exit trace */ @@ -350,6 +393,7 @@ int_very_careful: CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi cli + TRACE_IRQS_OFF jmp int_restore_rest int_signal: @@ -362,8 +406,10 @@ int_signal: int_restore_rest: RESTORE_REST cli + TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC +END(int_ret_from_sys_call) /* * Certain special system calls that need to save a complete full stack frame. @@ -375,6 +421,7 @@ int_restore_rest: leaq \func(%rip),%rax leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ jmp ptregscall_common +END(\label) .endm CFI_STARTPROC @@ -404,6 +451,7 @@ ENTRY(ptregscall_common) CFI_REL_OFFSET rip, 0 ret CFI_ENDPROC +END(ptregscall_common) ENTRY(stub_execve) CFI_STARTPROC @@ -418,6 +466,7 @@ ENTRY(stub_execve) RESTORE_REST jmp int_ret_from_sys_call CFI_ENDPROC +END(stub_execve) /* * sigreturn is special because it needs to restore all registers on return. @@ -435,12 +484,14 @@ ENTRY(stub_rt_sigreturn) RESTORE_REST jmp int_ret_from_sys_call CFI_ENDPROC +END(stub_rt_sigreturn) /* * initial frame state for interrupts and exceptions */ .macro _frame ref CFI_STARTPROC simple + CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,SS+8-\ref /*CFI_REL_OFFSET ss,SS-\ref*/ CFI_REL_OFFSET rsp,RSP-\ref @@ -466,29 +517,28 @@ ENTRY(stub_rt_sigreturn) /* 0(%rsp): interrupt number */ .macro interrupt func cld -#ifdef CONFIG_DEBUG_INFO - SAVE_ALL - movq %rsp,%rdi - /* - * Setup a stack frame pointer. This allows gdb to trace - * back to the original stack. - */ - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp -#else SAVE_ARGS leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler -#endif + pushq %rbp + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbp, 0 + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp testl $3,CS(%rdi) je 1f swapgs -1: incl %gs:pda_irqcount # RED-PEN should check preempt count - movq %gs:pda_irqstackptr,%rax - cmoveq %rax,%rsp /*todo This needs CFI annotation! */ - pushq %rdi # save old stack -#ifndef CONFIG_DEBUG_INFO - CFI_ADJUST_CFA_OFFSET 8 -#endif + /* irqcount is used to check if a CPU is already on an interrupt + stack or not. While this is essentially redundant with preempt_count + it is a little cheaper to use a separate counter in the PDA + (short of moving irq_enter into assembly, which would be too + much work) */ +1: incl %gs:pda_irqcount + cmoveq %gs:pda_irqstackptr,%rsp + push %rbp # backlink for old unwinder + /* + * We entered an interrupt context - irqs are off: + */ + TRACE_IRQS_OFF call \func .endm @@ -497,17 +547,12 @@ ENTRY(common_interrupt) interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: - popq %rdi -#ifndef CONFIG_DEBUG_INFO - CFI_ADJUST_CFA_OFFSET -8 -#endif cli + TRACE_IRQS_OFF decl %gs:pda_irqcount -#ifdef CONFIG_DEBUG_INFO - movq RBP(%rdi),%rbp + leaveq CFI_DEF_CFA_REGISTER rsp -#endif - leaq ARGOFFSET(%rdi),%rsp /*todo This needs CFI annotation! */ + CFI_ADJUST_CFA_OFFSET -8 exit_intr: GET_THREAD_INFO(%rcx) testl $3,CS-ARGOFFSET(%rsp) @@ -526,9 +571,21 @@ retint_check: CFI_REMEMBER_STATE jnz retint_careful retint_swapgs: + /* + * The iretq could re-enable interrupts: + */ + cli + TRACE_IRQS_IRETQ swapgs + jmp restore_args + retint_restore_args: cli + /* + * The iretq could re-enable interrupts: + */ + TRACE_IRQS_IRETQ +restore_args: RESTORE_ARGS 0,8,0 iret_label: iretq @@ -541,6 +598,7 @@ iret_label: /* running with kernel gs */ bad_iret: movq $11,%rdi /* SIGSEGV */ + TRACE_IRQS_ON sti jmp do_exit .previous @@ -550,6 +608,7 @@ retint_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal + TRACE_IRQS_ON sti pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -558,11 +617,13 @@ retint_careful: CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) cli + TRACE_IRQS_OFF jmp retint_check retint_signal: testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx jz retint_swapgs + TRACE_IRQS_ON sti SAVE_REST movq $-1,ORIG_RAX(%rsp) @@ -571,6 +632,7 @@ retint_signal: call do_notify_resume RESTORE_REST cli + TRACE_IRQS_OFF movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) jmp retint_check @@ -578,8 +640,7 @@ retint_signal: #ifdef CONFIG_PREEMPT /* Returning to kernel space. Check if we need preemption */ /* rcx: threadinfo. interrupts off. */ - .p2align -retint_kernel: +ENTRY(retint_kernel) cmpl $0,threadinfo_preempt_count(%rcx) jnz retint_restore_args bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) @@ -589,14 +650,16 @@ retint_kernel: call preempt_schedule_irq jmp exit_intr #endif + CFI_ENDPROC +END(common_interrupt) /* * APIC interrupts. */ .macro apicinterrupt num,func INTR_FRAME - pushq $\num-256 + pushq $~(\num) CFI_ADJUST_CFA_OFFSET 8 interrupt \func jmp ret_from_intr @@ -605,17 +668,21 @@ retint_kernel: ENTRY(thermal_interrupt) apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt +END(thermal_interrupt) ENTRY(threshold_interrupt) apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt +END(threshold_interrupt) #ifdef CONFIG_SMP ENTRY(reschedule_interrupt) apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt +END(reschedule_interrupt) .macro INVALIDATE_ENTRY num ENTRY(invalidate_interrupt\num) apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt +END(invalidate_interrupt\num) .endm INVALIDATE_ENTRY 0 @@ -629,18 +696,20 @@ ENTRY(invalidate_interrupt\num) ENTRY(call_function_interrupt) apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt +END(call_function_interrupt) #endif -#ifdef CONFIG_X86_LOCAL_APIC ENTRY(apic_timer_interrupt) apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt +END(apic_timer_interrupt) ENTRY(error_interrupt) apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt +END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt -#endif +END(spurious_interrupt) /* * Exception entry points. @@ -667,7 +736,7 @@ ENTRY(spurious_interrupt) /* error code is on the stack already */ /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym, ist=0 + .macro paranoidentry sym, ist=0, irqtrace=1 SAVE_ALL cld movl $1,%ebx @@ -692,13 +761,80 @@ ENTRY(spurious_interrupt) addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif cli + .if \irqtrace + TRACE_IRQS_OFF + .endif .endm - + + /* + * "Paranoid" exit path from exception stack. + * Paranoid because this is used by NMIs and cannot take + * any kernel state for granted. + * We don't do kernel preemption checks here, because only + * NMI should be common and it does not enable IRQs and + * cannot get reschedule ticks. + * + * "trace" is 0 for the NMI handler only, because irq-tracing + * is fundamentally NMI-unsafe. (we cannot change the soft and + * hard flags at once, atomically) + */ + .macro paranoidexit trace=1 + /* ebx: no swapgs flag */ +paranoid_exit\trace: + testl %ebx,%ebx /* swapgs needed? */ + jnz paranoid_restore\trace + testl $3,CS(%rsp) + jnz paranoid_userspace\trace +paranoid_swapgs\trace: + .if \trace + TRACE_IRQS_IRETQ 0 + .endif + swapgs +paranoid_restore\trace: + RESTORE_ALL 8 + iretq +paranoid_userspace\trace: + GET_THREAD_INFO(%rcx) + movl threadinfo_flags(%rcx),%ebx + andl $_TIF_WORK_MASK,%ebx + jz paranoid_swapgs\trace + movq %rsp,%rdi /* &pt_regs */ + call sync_regs + movq %rax,%rsp /* switch stack for scheduling */ + testl $_TIF_NEED_RESCHED,%ebx + jnz paranoid_schedule\trace + movl %ebx,%edx /* arg3: thread flags */ + .if \trace + TRACE_IRQS_ON + .endif + sti + xorl %esi,%esi /* arg2: oldset */ + movq %rsp,%rdi /* arg1: &pt_regs */ + call do_notify_resume + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace +paranoid_schedule\trace: + .if \trace + TRACE_IRQS_ON + .endif + sti + call schedule + cli + .if \trace + TRACE_IRQS_OFF + .endif + jmp paranoid_userspace\trace + CFI_ENDPROC + .endm + /* * Exception entry point. This expects an error code/orig_rax on the stack * and the exception handler in %rax. */ -ENTRY(error_entry) +KPROBE_ENTRY(error_entry) _frame RDI /* rdi slot contains rax, oldrax contains error code */ cld @@ -749,6 +885,7 @@ error_exit: movl %ebx,%eax RESTORE_REST cli + TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax jne retint_kernel @@ -756,6 +893,10 @@ error_exit: movl $_TIF_WORK_MASK,%edi andl %edi,%edx jnz retint_careful + /* + * The iret might restore flags: + */ + TRACE_IRQS_IRETQ swapgs RESTORE_ARGS 0,8,0 jmp iret_label @@ -777,6 +918,7 @@ error_kernelspace: cmpq $gs_change,RIP(%rsp) je error_swapgs jmp error_sti +KPROBE_END(error_entry) /* Reload gs selector with exception handling */ /* edi: new selector */ @@ -794,6 +936,7 @@ gs_change: CFI_ADJUST_CFA_OFFSET -8 ret CFI_ENDPROC +ENDPROC(load_gs_index) .section __ex_table,"a" .align 8 @@ -847,9 +990,11 @@ ENTRY(kernel_thread) UNFAKE_STACK_FRAME ret CFI_ENDPROC - +ENDPROC(kernel_thread) child_rip: + pushq $0 # fake return address + CFI_STARTPROC /* * Here we are in the child and the registers are set as they were * at kernel_thread() invocation in the parent. @@ -860,6 +1005,8 @@ child_rip: # exit xorl %edi, %edi call do_exit + CFI_ENDPROC +ENDPROC(child_rip) /* * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. @@ -889,19 +1036,23 @@ ENTRY(execve) UNFAKE_STACK_FRAME ret CFI_ENDPROC +ENDPROC(execve) KPROBE_ENTRY(page_fault) errorentry do_page_fault - .previous .text +KPROBE_END(page_fault) ENTRY(coprocessor_error) zeroentry do_coprocessor_error +END(coprocessor_error) ENTRY(simd_coprocessor_error) zeroentry do_simd_coprocessor_error +END(simd_coprocessor_error) ENTRY(device_not_available) zeroentry math_state_restore +END(device_not_available) /* runs on exception stack */ KPROBE_ENTRY(debug) @@ -909,116 +1060,91 @@ KPROBE_ENTRY(debug) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK - jmp paranoid_exit - CFI_ENDPROC - .previous .text + paranoidexit +KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) INTR_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi - /* - * "Paranoid" exit path from exception stack. - * Paranoid because this is used by NMIs and cannot take - * any kernel state for granted. - * We don't do kernel preemption checks here, because only - * NMI should be common and it does not enable IRQs and - * cannot get reschedule ticks. - */ - /* ebx: no swapgs flag */ -paranoid_exit: - testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore - testl $3,CS(%rsp) - jnz paranoid_userspace -paranoid_swapgs: - swapgs -paranoid_restore: - RESTORE_ALL 8 - iretq -paranoid_userspace: - GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%ebx - andl $_TIF_WORK_MASK,%ebx - jz paranoid_swapgs - movq %rsp,%rdi /* &pt_regs */ - call sync_regs - movq %rax,%rsp /* switch stack for scheduling */ - testl $_TIF_NEED_RESCHED,%ebx - jnz paranoid_schedule - movl %ebx,%edx /* arg3: thread flags */ - sti - xorl %esi,%esi /* arg2: oldset */ - movq %rsp,%rdi /* arg1: &pt_regs */ - call do_notify_resume - cli - jmp paranoid_userspace -paranoid_schedule: - sti - call schedule - cli - jmp paranoid_userspace - CFI_ENDPROC - .previous .text + paranoidentry do_nmi, 0, 0 +#ifdef CONFIG_TRACE_IRQFLAGS + paranoidexit 0 +#else + jmp paranoid_exit1 + CFI_ENDPROC +#endif +KPROBE_END(nmi) KPROBE_ENTRY(int3) INTR_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC - .previous .text +KPROBE_END(int3) ENTRY(overflow) zeroentry do_overflow +END(overflow) ENTRY(bounds) zeroentry do_bounds +END(bounds) ENTRY(invalid_op) zeroentry do_invalid_op +END(invalid_op) ENTRY(coprocessor_segment_overrun) zeroentry do_coprocessor_segment_overrun +END(coprocessor_segment_overrun) ENTRY(reserved) zeroentry do_reserved +END(reserved) /* runs on exception stack */ ENTRY(double_fault) XCPT_FRAME paranoidentry do_double_fault - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC +END(double_fault) ENTRY(invalid_TSS) errorentry do_invalid_TSS +END(invalid_TSS) ENTRY(segment_not_present) errorentry do_segment_not_present +END(segment_not_present) /* runs on exception stack */ ENTRY(stack_segment) XCPT_FRAME paranoidentry do_stack_segment - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC +END(stack_segment) KPROBE_ENTRY(general_protection) errorentry do_general_protection - .previous .text +KPROBE_END(general_protection) ENTRY(alignment_check) errorentry do_alignment_check +END(alignment_check) ENTRY(divide_error) zeroentry do_divide_error +END(divide_error) ENTRY(spurious_interrupt_bug) zeroentry do_spurious_interrupt_bug +END(spurious_interrupt_bug) #ifdef CONFIG_X86_MCE /* runs on exception stack */ @@ -1027,22 +1153,60 @@ ENTRY(machine_check) pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check - jmp paranoid_exit + jmp paranoid_exit1 CFI_ENDPROC +END(machine_check) #endif +/* Call softirq on interrupt stack. Interrupts are off. */ ENTRY(call_softirq) CFI_STARTPROC - movq %gs:pda_irqstackptr,%rax - movq %rsp,%rdx - CFI_DEF_CFA_REGISTER rdx + push %rbp + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbp,0 + mov %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp incl %gs:pda_irqcount - cmove %rax,%rsp - pushq %rdx - /*todo CFI_DEF_CFA_EXPRESSION ...*/ + cmove %gs:pda_irqstackptr,%rsp + push %rbp # backlink for old unwinder call __do_softirq - popq %rsp + leaveq CFI_DEF_CFA_REGISTER rsp + CFI_ADJUST_CFA_OFFSET -8 decl %gs:pda_irqcount ret CFI_ENDPROC +ENDPROC(call_softirq) + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movq %r15, R15(%rdi) + movq %r14, R14(%rdi) + xchgq %rsi, %rdx + movq %r13, R13(%rdi) + movq %r12, R12(%rdi) + xorl %eax, %eax + movq %rbp, RBP(%rdi) + movq %rbx, RBX(%rdi) + movq (%rsp), %rcx + movq %rax, R11(%rdi) + movq %rax, R10(%rdi) + movq %rax, R9(%rdi) + movq %rax, R8(%rdi) + movq %rax, RAX(%rdi) + movq %rax, RCX(%rdi) + movq %rax, RDX(%rdi) + movq %rax, RSI(%rdi) + movq %rax, RDI(%rdi) + movq %rax, ORIG_RAX(%rdi) + movq %rcx, RIP(%rdi) + leaq 8(%rsp), %rcx + movq $__KERNEL_CS, CS(%rdi) + movq %rax, EFLAGS(%rdi) + movq %rcx, RSP(%rdi) + movq $__KERNEL_DS, SS(%rdi) + jmpq *%rdx + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif diff --git a/arch/x86_64/kernel/functionlist b/arch/x86_64/kernel/functionlist index 2bcebdc3eedb..01fa23580c85 100644 --- a/arch/x86_64/kernel/functionlist +++ b/arch/x86_64/kernel/functionlist @@ -384,7 +384,6 @@ *(.text.__end_that_request_first) *(.text.wake_up_bit) *(.text.unuse_mm) -*(.text.skb_release_data) *(.text.shrink_icache_memory) *(.text.sched_balance_self) *(.text.__pmd_alloc) diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 7a64ea181788..8e78a75d1866 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -8,7 +8,6 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ -#include <linux/config.h> #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c index 43fcf62fef0f..cdb90e671b88 100644 --- a/arch/x86_64/kernel/genapic_cluster.c +++ b/arch/x86_64/kernel/genapic_cluster.c @@ -9,7 +9,6 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ -#include <linux/config.h> #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> @@ -119,7 +118,6 @@ struct genapic apic_cluster = { .name = "clustered", .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, .target_cpus = cluster_target_cpus, .apic_id_registered = cluster_apic_id_registered, .init_apic_ldr = cluster_init_apic_ldr, diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 1a2ab825be98..50ad153eaac4 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -8,7 +8,6 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ -#include <linux/config.h> #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/string.h> @@ -50,8 +49,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) unsigned long cfg; unsigned long flags; - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); /* * Wait for idle. @@ -78,22 +76,29 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector) static void flat_send_IPI_allbutself(int vector) { -#ifndef CONFIG_HOTPLUG_CPU - if (((num_online_cpus()) - 1) >= 1) - __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); +#ifdef CONFIG_HOTPLUG_CPU + int hotplug = 1; #else - cpumask_t allbutme = cpu_online_map; + int hotplug = 0; +#endif + if (hotplug || vector == NMI_VECTOR) { + cpumask_t allbutme = cpu_online_map; - cpu_clear(smp_processor_id(), allbutme); + cpu_clear(smp_processor_id(), allbutme); - if (!cpus_empty(allbutme)) - flat_send_IPI_mask(allbutme, vector); -#endif + if (!cpus_empty(allbutme)) + flat_send_IPI_mask(allbutme, vector); + } else if (num_online_cpus() > 1) { + __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); + } } static void flat_send_IPI_all(int vector) { - __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); + if (vector == NMI_VECTOR) + flat_send_IPI_mask(cpu_online_map, vector); + else + __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } static int flat_apic_id_registered(void) @@ -108,17 +113,13 @@ static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) static unsigned int phys_pkg_id(int index_msb) { - u32 ebx; - - ebx = cpuid_ebx(1); - return ((ebx >> 24) & 0xFF) >> index_msb; + return hard_smp_processor_id() >> index_msb; } struct genapic apic_flat = { .name = "flat", .int_delivery_mode = dest_LowestPrio, .int_dest_mode = (APIC_DEST_LOGICAL != 0), - .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, .target_cpus = flat_target_cpus, .apic_id_registered = flat_apic_id_registered, .init_apic_ldr = flat_init_apic_ldr, @@ -177,7 +178,6 @@ struct genapic apic_physflat = { .name = "physical flat", .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED, .target_cpus = physflat_target_cpus, .apic_id_registered = flat_apic_id_registered, .init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/ diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 6df05e6034fa..1e6f80870679 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -5,8 +5,6 @@ * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> - * - * $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $ */ @@ -187,11 +185,15 @@ startup_64: /* Finally jump to run C code and to be on real kernel address * Since we are running on identity-mapped space we have to jump - * to the full 64bit address , this is only possible as indirect - * jump + * to the full 64bit address, this is only possible as indirect + * jump. In addition we need to ensure %cs is set so we make this + * a far return. */ movq initial_code(%rip),%rax - jmp *%rax + pushq $0 # fake return address to stop unwinder + pushq $__KERNEL_CS # set correct cs + pushq %rax # target address in negative space + lretq /* SMP bootup changes these two */ .align 8 @@ -370,7 +372,7 @@ ENTRY(cpu_gdt_table) .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ - .quad 0 /* unused */ + .quad 0x0000f40000000000 /* node/CPU stored in limit */ gdt_end: /* asm/segment.h:GDT_ENTRIES must match this */ /* This should be a multiple of the cache line size */ diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index cea20a66c150..9561eb3c5b5c 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -2,8 +2,6 @@ * linux/arch/x86_64/kernel/head64.c -- prepare to run common code * * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE - * - * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $ */ #include <linux/init.h> @@ -47,38 +45,16 @@ static void __init copy_bootdata(char *real_mode_data) new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); if (!new_data) { if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { - printk("so old bootloader that it does not support commandline?!\n"); return; } new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; - printk("old bootloader convention, maybe loadlin?\n"); } command_line = (char *) ((u64)(new_data)); memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); - printk("Bootdata ok (command line is %s)\n", saved_command_line); -} - -static void __init setup_boot_cpu_data(void) -{ - unsigned int dummy, eax; - - /* get vendor info */ - cpuid(0, (unsigned int *)&boot_cpu_data.cpuid_level, - (unsigned int *)&boot_cpu_data.x86_vendor_id[0], - (unsigned int *)&boot_cpu_data.x86_vendor_id[8], - (unsigned int *)&boot_cpu_data.x86_vendor_id[4]); - - /* get cpu type */ - cpuid(1, &eax, &dummy, &dummy, - (unsigned int *) &boot_cpu_data.x86_capability); - boot_cpu_data.x86 = (eax >> 8) & 0xf; - boot_cpu_data.x86_model = (eax >> 4) & 0xf; - boot_cpu_data.x86_mask = eax & 0xf; } void __init x86_64_start_kernel(char * real_mode_data) { - char *s; int i; for (i = 0; i < 256; i++) @@ -86,6 +62,8 @@ void __init x86_64_start_kernel(char * real_mode_data) asm volatile("lidt %0" :: "m" (idt_descr)); clear_bss(); + early_printk("Kernel alive\n"); + /* * switch to init_level4_pgt from boot_level4_pgt */ @@ -100,22 +78,5 @@ void __init x86_64_start_kernel(char * real_mode_data) #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); #endif - s = strstr(saved_command_line, "earlyprintk="); - if (s != NULL) - setup_early_printk(strchr(s, '=') + 1); -#ifdef CONFIG_NUMA - s = strstr(saved_command_line, "numa="); - if (s != NULL) - numa_setup(s+5); -#endif -#ifdef CONFIG_X86_IO_APIC - if (strstr(saved_command_line, "disableapic")) - disable_apic = 1; -#endif - /* You need early console to see that */ - if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) - panic("Kernel too big for kernel mapping\n"); - - setup_boot_cpu_data(); start_kernel(); } diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c index 44ddb1ec808d..3aa1e9bb781d 100644 --- a/arch/x86_64/kernel/i387.c +++ b/arch/x86_64/kernel/i387.c @@ -14,7 +14,6 @@ * the 64bit user space sees a FXSAVE frame directly. */ -#include <linux/config.h> #include <linux/sched.h> #include <linux/init.h> #include <asm/processor.h> diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 5ecd34ab8c2b..2dd51f364ea2 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -1,5 +1,4 @@ #include <linux/linkage.h> -#include <linux/config.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/sched.h> @@ -44,11 +43,11 @@ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ BI(x,c) BI(x,d) BI(x,e) BI(x,f) -#define BUILD_14_IRQS(x) \ +#define BUILD_15_IRQS(x) \ BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) + BI(x,c) BI(x,d) BI(x,e) /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: @@ -56,7 +55,6 @@ */ BUILD_16_IRQS(0x0) -#ifdef CONFIG_X86_LOCAL_APIC /* * The IO-APIC gives us many more interrupt sources. Most of these * are unused but an SMP system is supposed to have enough memory ... @@ -73,13 +71,11 @@ BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) #ifdef CONFIG_PCI_MSI - BUILD_14_IRQS(0xe) -#endif - + BUILD_15_IRQS(0xe) #endif #undef BUILD_16_IRQS -#undef BUILD_14_IRQS +#undef BUILD_15_IRQS #undef BI @@ -92,26 +88,24 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) -#define IRQLIST_14(x) \ +#define IRQLIST_15(x) \ IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d) + IRQ(x,c), IRQ(x,d), IRQ(x,e) void (*interrupt[NR_IRQS])(void) = { IRQLIST_16(0x0), -#ifdef CONFIG_X86_IO_APIC IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), IRQLIST_16(0xc), IRQLIST_16(0xd) #ifdef CONFIG_PCI_MSI - , IRQLIST_14(0xe) + , IRQLIST_15(0xe) #endif -#endif }; #undef IRQ @@ -129,6 +123,8 @@ void (*interrupt[NR_IRQS])(void) = { DEFINE_SPINLOCK(i8259A_lock); +static int i8259A_auto_eoi; + static void end_8259A_irq (unsigned int irq) { if (irq > 256) { @@ -235,7 +231,7 @@ void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); - irq_desc[irq].handler = &i8259A_irq_type; + irq_desc[irq].chip = &i8259A_irq_type; enable_irq(irq); } @@ -278,7 +274,7 @@ static void mask_and_ack_8259A(unsigned int irq) * Lightweight spurious IRQ detection. We do not want * to overdo spurious IRQ handling - it's usually a sign * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecesserily. + * do without slowing down good hardware unnecessarily. * * Note that IRQ7 and IRQ15 (the two spurious IRQs * usually resulting from the 8259A-1|2 PICs) occur @@ -342,6 +338,8 @@ void init_8259A(int auto_eoi) { unsigned long flags; + i8259A_auto_eoi = auto_eoi; + spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, 0x21); /* mask all of 8259A-1 */ @@ -400,7 +398,7 @@ static void save_ELCR(char *trigger) static int i8259A_resume(struct sys_device *dev) { - init_8259A(0); + init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); return 0; } @@ -454,9 +452,7 @@ void __init init_ISA_irqs (void) { int i; -#ifdef CONFIG_X86_LOCAL_APIC init_bsp_APIC(); -#endif init_8259A(0); for (i = 0; i < NR_IRQS; i++) { @@ -468,12 +464,12 @@ void __init init_ISA_irqs (void) /* * 16 old-style INTA-cycle interrupts: */ - irq_desc[i].handler = &i8259A_irq_type; + irq_desc[i].chip = &i8259A_irq_type; } else { /* * 'high' PCI IRQs filled in on demand */ - irq_desc[i].handler = &no_irq_type; + irq_desc[i].chip = &no_irq_type; } } } @@ -582,14 +578,12 @@ void __init init_IRQ(void) set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); set_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -#ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI vectors for APIC spurious and error interrupts */ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); -#endif /* * Set the clock to HZ Hz, we already have a valid diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c index ce31d904d601..3dc5854ba21e 100644 --- a/arch/x86_64/kernel/init_task.c +++ b/arch/x86_64/kernel/init_task.c @@ -46,4 +46,9 @@ EXPORT_SYMBOL(init_task); */ DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; +/* Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. + */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); + #define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 9cc7031b7151..0491019d4c8d 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -25,7 +25,6 @@ #include <linux/init.h> #include <linux/delay.h> #include <linux/sched.h> -#include <linux/config.h> #include <linux/smp_lock.h> #include <linux/mc146818rtc.h> #include <linux/acpi.h> @@ -41,6 +40,7 @@ #include <asm/mach_apic.h> #include <asm/acpi.h> #include <asm/dma.h> +#include <asm/nmi.h> #define __apicdebuginit __init @@ -48,7 +48,7 @@ int sis_apic_bug; /* not actually supported, dummy for compile */ static int no_timer_check; -int disable_timer_pin_1 __initdata; +static int disable_timer_pin_1 __initdata; int timer_over_8254 __initdata = 0; @@ -56,6 +56,7 @@ int timer_over_8254 __initdata = 0; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -110,6 +111,33 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; FINAL; \ } +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + union entry_union eu; + eu.entry = e; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + #ifdef CONFIG_SMP static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { @@ -195,13 +223,9 @@ static void unmask_IO_APIC_irq (unsigned int irq) static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; - unsigned long flags; /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; /* @@ -209,10 +233,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); } static void clear_IO_APIC (void) @@ -224,14 +245,6 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; int skip_ioapic_setup; int ioapic_force; @@ -240,18 +253,17 @@ int ioapic_force; static int __init disable_ioapic_setup(char *str) { skip_ioapic_setup = 1; - return 1; + return 0; } +early_param("noapic", disable_ioapic_setup); -static int __init enable_ioapic_setup(char *str) +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) { - ioapic_force = 1; - skip_ioapic_setup = 0; + disable_timer_pin_1 = 1; return 1; } - -__setup("noapic", disable_ioapic_setup); -__setup("apic", enable_ioapic_setup); +__setup("disable_timer_pin_1", disable_timer_pin_setup); static int __init setup_disable_8254_timer(char *s) { @@ -267,135 +279,6 @@ static int __init setup_enable_8254_timer(char *s) __setup("disable_8254_timer", setup_disable_8254_timer); __setup("enable_8254_timer", setup_enable_8254_timer); -#include <asm/pci-direct.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> - - -#ifdef CONFIG_ACPI - -static int nvidia_hpet_detected __initdata; - -static int __init nvidia_hpet_check(unsigned long phys, unsigned long size) -{ - nvidia_hpet_detected = 1; - return 0; -} -#endif - -/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC - off. Check for an Nvidia or VIA PCI bridge and turn it off. - Use pci direct infrastructure because this runs before the PCI subsystem. - - Can be overwritten with "apic" - - And another hack to disable the IOMMU on VIA chipsets. - - ... and others. Really should move this somewhere else. - - Kludge-O-Rama. */ -void __init check_ioapic(void) -{ - int num,slot,func; - /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - switch (vendor) { - case PCI_VENDOR_ID_VIA: -#ifdef CONFIG_GART_IOMMU - if ((end_pfn > MAX_DMA32_PFN || - force_iommu) && - !iommu_aperture_allowed) { - printk(KERN_INFO - "Looks like a VIA chipset. Disabling IOMMU. Override with \"iommu=allowed\"\n"); - iommu_aperture_disabled = 1; - } -#endif - return; - case PCI_VENDOR_ID_NVIDIA: -#ifdef CONFIG_ACPI - /* - * All timer overrides on Nvidia are - * wrong unless HPET is enabled. - */ - nvidia_hpet_detected = 0; - acpi_table_parse(ACPI_HPET, - nvidia_hpet_check); - if (nvidia_hpet_detected == 0) { - acpi_skip_timer_override = 1; - printk(KERN_INFO "Nvidia board " - "detected. Ignoring ACPI " - "timer override.\n"); - } -#endif - /* RED-PEN skip them on mptables too? */ - return; - - /* This should be actually default, but - for 2.6.16 let's do it for ATI only where - it's really needed. */ - case PCI_VENDOR_ID_ATI: - if (timer_over_8254 == 1) { - timer_over_8254 = 0; - printk(KERN_INFO - "ATI board detected. Disabling timer routing over 8254.\n"); - } - return; - } - - - /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } -} - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); /* * Find the IRQ entry number of a certain pin. @@ -424,9 +307,7 @@ static int __init find_isa_irq_pin(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) @@ -442,9 +323,7 @@ static int __init find_isa_irq_apic(int irq, int type) for (i = 0; i < mp_irq_entries; i++) { int lbus = mp_irqs[i].mpc_srcbus; - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && + if (test_bit(lbus, mp_bus_not_pci) && (mp_irqs[i].mpc_irqtype == type) && (mp_irqs[i].mpc_srcbusirq == irq)) break; @@ -484,7 +363,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) mp_irqs[i].mpc_dstapic == MP_APIC_ALL) break; - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + if (!test_bit(lbus, mp_bus_not_pci) && !mp_irqs[i].mpc_irqtype && (bus == lbus) && (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { @@ -507,27 +386,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return best_guess; } -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -#define default_EISA_polarity(idx) (0) - /* ISA interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. */ @@ -540,12 +398,6 @@ static int EISA_ELCR(unsigned int irq) #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) (0) - static int __init MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mpc_srcbus; @@ -557,38 +409,11 @@ static int __init MPBIOS_polarity(int idx) switch (mp_irqs[idx].mpc_irqflag & 3) { case 0: /* conforms, ie. bus-type dependent polarity */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - polarity = default_ISA_polarity(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - polarity = default_EISA_polarity(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - polarity = default_PCI_polarity(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - polarity = default_MCA_polarity(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); break; - } case 1: /* high active */ { polarity = 0; @@ -626,38 +451,11 @@ static int MPBIOS_trigger(int idx) switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) { case 0: /* conforms, ie. bus-type dependent */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - trigger = default_ISA_trigger(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - trigger = default_PCI_trigger(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); break; - } case 1: /* edge */ { trigger = 0; @@ -763,49 +561,17 @@ static int pin_2_irq(int idx, int apic, int pin) if (mp_irqs[idx].mpc_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - case MP_BUS_EISA: - case MP_BUS_MCA: - { - irq = mp_irqs[idx].mpc_srcbusirq; - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - irq = gsi_irq_sharing(irq); - break; - } - default: - { - printk(KERN_ERR "unknown bus type %d.\n",bus); - irq = 0; - break; - } - } - BUG_ON(irq >= NR_IRQS); - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].mpc_srcbusirq; + } else { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + irq = gsi_irq_sharing(irq); } BUG_ON(irq >= NR_IRQS); return irq; @@ -834,10 +600,17 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; int assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; + unsigned long flags; + int vector; BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) + + spin_lock_irqsave(&vector_lock, flags); + + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { + spin_unlock_irqrestore(&vector_lock, flags); return IO_APIC_VECTOR(irq); + } next: current_vector += 8; if (current_vector == IA32_SYSCALL_VECTOR) @@ -849,11 +622,14 @@ next: current_vector = FIRST_DEVICE_VECTOR + offset; } - vector_irq[current_vector] = irq; + vector = current_vector; + vector_irq[vector] = irq; if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = current_vector; + IO_APIC_VECTOR(irq) = vector; + + spin_unlock_irqrestore(&vector_lock, flags); - return current_vector; + return vector; } extern void (*interrupt[NR_IRQS])(void); @@ -864,23 +640,18 @@ static struct hw_interrupt_type ioapic_edge_type; #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 -static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) +static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - if (use_pci_vector() && !platform_legacy_irq(irq)) { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[vector].handler = &ioapic_level_type; - else - irq_desc[vector].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[vector]); - } else { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[irq].handler = &ioapic_level_type; - else - irq_desc[irq].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[irq]); - } + unsigned idx; + + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[idx].chip = &ioapic_level_type; + else + irq_desc[idx].chip = &ioapic_edge_type; + set_intr_gate(vector, interrupt[idx]); } static void __init setup_IO_APIC_irqs(void) @@ -937,9 +708,9 @@ static void __init setup_IO_APIC_irqs(void) if (!apic && (irq < 16)) disable_8259A_irq(irq); } + ioapic_write_entry(apic, pin, entry); + spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -981,7 +752,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... */ - irq_desc[0].handler = &ioapic_edge_type; + irq_desc[0].chip = &ioapic_edge_type; /* * Add it to the IO-APIC irq-routing table: @@ -1077,10 +848,7 @@ void __apicdebuginit print_IO_APIC(void) for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, i); printk(KERN_DEBUG " %02x %03X %02X ", i, @@ -1275,9 +1043,6 @@ static void __init enable_IO_APIC(void) irq_2_pin[i].pin = -1; irq_2_pin[i].next = 0; } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; /* * The number of IO-APIC IRQ registers (== #pins): @@ -1293,11 +1058,7 @@ static void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - + entry = ioapic_read_entry(apic, pin); /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. @@ -1349,7 +1110,6 @@ void disable_IO_APIC(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry, 0, sizeof(entry)); entry.mask = 0; /* Enabled */ @@ -1366,84 +1126,13 @@ void disable_IO_APIC(void) /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, - *(((int *)&entry)+1)); - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, - *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } disconnect_bsp_APIC(ioapic_i8259.pin != -1); } /* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc (void) -{ - union IO_APIC_reg_00 reg_00; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mpc_apicid; - - - printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid); - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mpc_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_dstapic == old_id) - mp_irqs[i].mpc_dstapic - = mp_ioapics[apic].mpc_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mpc_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE," ok.\n"); - } -} - -/* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: * @@ -1611,6 +1300,13 @@ static void set_ioapic_affinity_vector (unsigned int vector, #endif // CONFIG_SMP #endif // CONFIG_PCI_MSI +static int ioapic_retrigger(unsigned int irq) +{ + send_IPI_self(IO_APIC_VECTOR(irq)); + + return 1; +} + /* * Level and edge triggered IO-APIC interrupts need different handling, * so we use two separate IRQ descriptors. Edge triggered IRQs can be @@ -1631,6 +1327,7 @@ static struct hw_interrupt_type ioapic_edge_type __read_mostly = { #ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity, #endif + .retrigger = ioapic_retrigger, }; static struct hw_interrupt_type ioapic_level_type __read_mostly = { @@ -1644,6 +1341,7 @@ static struct hw_interrupt_type ioapic_level_type __read_mostly = { #ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity, #endif + .retrigger = ioapic_retrigger, }; static inline void init_IO_APIC_traps(void) @@ -1678,7 +1376,7 @@ static inline void init_IO_APIC_traps(void) make_8259A_irq(irq); else /* Strange. Oh, well.. */ - irq_desc[irq].handler = &no_irq_type; + irq_desc[irq].chip = &no_irq_type; } } } @@ -1895,7 +1593,7 @@ static inline void check_timer(void) apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); disable_8259A_irq(0); - irq_desc[0].handler = &lapic_irq_type; + irq_desc[0].chip = &lapic_irq_type; apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); @@ -1949,11 +1647,6 @@ void __init setup_IO_APIC(void) apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* - * Set up the IO-APIC IRQ routing table. - */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); @@ -1972,17 +1665,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; - unsigned long flags; int i; data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); return 0; } @@ -2004,11 +1692,9 @@ static int ioapic_resume(struct sys_device *dev) reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; io_apic_write(dev->id, 0, reg_00.raw); } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) + ioapic_write_entry(dev->id, i, entry[i]); return 0; } @@ -2062,19 +1748,6 @@ device_initcall(ioapic_init_sysfs); #define IO_APIC_MAX_ID 0xFE -int __init io_apic_get_version (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} - - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; @@ -2133,10 +1806,10 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p if (!ioapic && (irq < 16)) disable_8259A_irq(irq); + ioapic_write_entry(ioapic, pin, entry); + spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); + set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index b81614970ecc..fe063d3cfe42 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -56,6 +56,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); } /* diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index d8bd0b345b1e..b3677e6ccc6e 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -20,10 +20,29 @@ #include <asm/idle.h> atomic_t irq_err_count; -#ifdef CONFIG_X86_IO_APIC -#ifdef APIC_MISMATCH_DEBUG -atomic_t irq_mis_count; -#endif + +#ifdef CONFIG_DEBUG_STACKOVERFLOW +/* + * Probabilistic stack overflow check: + * + * Only check the stack in process context, because everything else + * runs on the big interrupt stacks. Checking reliably is too expensive, + * so we just check from interrupts. + */ +static inline void stack_overflow_check(struct pt_regs *regs) +{ + u64 curbase = (u64) current->thread_info; + static unsigned long warned = -60*HZ; + + if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && + regs->rsp < curbase + sizeof(struct thread_info) + 128 && + time_after(jiffies, warned + 60*HZ)) { + printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", + current->comm, curbase, regs->rsp); + show_stack(NULL,NULL); + warned = jiffies; + } +} #endif /* @@ -39,7 +58,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) - seq_printf(p, "CPU%d ",j); + seq_printf(p, "CPU%-8d",j); seq_putc(p, '\n'); } @@ -55,7 +74,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].handler->typename); + seq_printf(p, " %14s", irq_desc[i].chip->typename); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) @@ -68,18 +87,11 @@ skip: for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); seq_putc(p, '\n'); -#ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); seq_putc(p, '\n'); -#endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef CONFIG_X86_IO_APIC -#ifdef APIC_MISMATCH_DEBUG - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif -#endif } return 0; } @@ -91,12 +103,20 @@ skip: */ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) { - /* high bits used in ret_from_ code */ - unsigned irq = regs->orig_rax & 0xff; + /* high bit used in ret_from_ code */ + unsigned irq = ~regs->orig_rax; + + if (unlikely(irq >= NR_IRQS)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d\n", + __FUNCTION__, irq); + BUG(); + } exit_idle(); irq_enter(); - +#ifdef CONFIG_DEBUG_STACKOVERFLOW + stack_overflow_check(regs); +#endif __do_IRQ(irq, regs); irq_exit(); @@ -114,13 +134,13 @@ void fixup_irqs(cpumask_t map) if (irq == 2) continue; - cpus_and(mask, irq_affinity[irq], map); + cpus_and(mask, irq_desc[irq].affinity, map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); mask = map; } - if (irq_desc[irq].handler->set_affinity) - irq_desc[irq].handler->set_affinity(irq, mask); + if (irq_desc[irq].chip->set_affinity) + irq_desc[irq].chip->set_affinity(irq, mask); else if (irq_desc[irq].action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } @@ -145,8 +165,10 @@ asmlinkage void do_softirq(void) local_irq_save(flags); pending = local_softirq_pending(); /* Switch to interrupt stack */ - if (pending) + if (pending) { call_softirq(); + WARN_ON_ONCE(softirq_count()); + } local_irq_restore(flags); } EXPORT_SYMBOL(do_softirq); diff --git a/arch/x86_64/kernel/k8.c b/arch/x86_64/kernel/k8.c new file mode 100644 index 000000000000..6416682d33d0 --- /dev/null +++ b/arch/x86_64/kernel/k8.c @@ -0,0 +1,118 @@ +/* + * Shared support code for AMD K8 northbridges and derivates. + * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. + */ +#include <linux/gfp.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <asm/k8.h> + +int num_k8_northbridges; +EXPORT_SYMBOL(num_k8_northbridges); + +static u32 *flush_words; + +struct pci_device_id k8_nb_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + {} +}; +EXPORT_SYMBOL(k8_nb_ids); + +struct pci_dev **k8_northbridges; +EXPORT_SYMBOL(k8_northbridges); + +static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +{ + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(&k8_nb_ids[0], dev)); + return dev; +} + +int cache_k8_northbridges(void) +{ + int i; + struct pci_dev *dev; + if (num_k8_northbridges) + return 0; + + num_k8_northbridges = 0; + dev = NULL; + while ((dev = next_k8_northbridge(dev)) != NULL) + num_k8_northbridges++; + + k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), + GFP_KERNEL); + if (!k8_northbridges) + return -ENOMEM; + + flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + kfree(k8_northbridges); + return -ENOMEM; + } + + dev = NULL; + i = 0; + while ((dev = next_k8_northbridge(dev)) != NULL) { + k8_northbridges[i++] = dev; + pci_read_config_dword(dev, 0x9c, &flush_words[i]); + } + k8_northbridges[i] = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(cache_k8_northbridges); + +/* Ignores subdevice/subvendor but as far as I can figure out + they're useless anyways */ +int __init early_is_k8_nb(u32 device) +{ + struct pci_device_id *id; + u32 vendor = device & 0xffff; + device >>= 16; + for (id = k8_nb_ids; id->vendor; id++) + if (vendor == id->vendor && device == id->device) + return 1; + return 0; +} + +void k8_flush_garts(void) +{ + int flushed, i; + unsigned long flags; + static DEFINE_SPINLOCK(gart_lock); + + /* Avoid races between AGP and IOMMU. In theory it's not needed + but I'm not sure if the hardware won't lose flush requests + when another is pending. This whole thing is so expensive anyways + that it doesn't matter to serialize more. -AK */ + spin_lock_irqsave(&gart_lock, flags); + flushed = 0; + for (i = 0; i < num_k8_northbridges; i++) { + pci_write_config_dword(k8_northbridges[i], 0x9c, + flush_words[i]|1); + flushed++; + } + for (i = 0; i < num_k8_northbridges; i++) { + u32 w; + /* Make sure the hardware actually executed the flush*/ + for (;;) { + pci_read_config_dword(k8_northbridges[i], + 0x9c, &w); + if (!(w & 1)) + break; + cpu_relax(); + } + } + spin_unlock_irqrestore(&gart_lock, flags); + if (!flushed) + printk("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(k8_flush_garts); + diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index fa1d19ca700a..ffc73ac72485 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -31,7 +31,6 @@ * Added function return probes functionality */ -#include <linux/config.h> #include <linux/kprobes.h> #include <linux/ptrace.h> #include <linux/string.h> diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 25ac8a3faae6..0497e3bd5bff 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -15,6 +15,15 @@ #include <asm/mmu_context.h> #include <asm/io.h> +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +static u64 kexec_pgd[512] PAGE_ALIGNED; +static u64 kexec_pud0[512] PAGE_ALIGNED; +static u64 kexec_pmd0[512] PAGE_ALIGNED; +static u64 kexec_pte0[512] PAGE_ALIGNED; +static u64 kexec_pud1[512] PAGE_ALIGNED; +static u64 kexec_pmd1[512] PAGE_ALIGNED; +static u64 kexec_pte1[512] PAGE_ALIGNED; + static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; @@ -144,32 +153,19 @@ static void load_segments(void) ); } -typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, - unsigned long control_code_buffer, - unsigned long start_address, - unsigned long pgtable) ATTRIB_NORET; - -const extern unsigned char relocate_new_kernel[]; -const extern unsigned long relocate_new_kernel_size; - int machine_kexec_prepare(struct kimage *image) { - unsigned long start_pgtable, control_code_buffer; + unsigned long start_pgtable; int result; /* Calculate the offsets */ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); if (result) return result; - /* Place the code in the reboot code buffer */ - memcpy(__va(control_code_buffer), relocate_new_kernel, - relocate_new_kernel_size); - return 0; } @@ -184,37 +180,40 @@ void machine_kexec_cleanup(struct kimage *image) */ NORET_TYPE void machine_kexec(struct kimage *image) { - unsigned long page_list; - unsigned long control_code_buffer; - unsigned long start_pgtable; - relocate_new_kernel_t rnk; + unsigned long page_list[PAGES_NR]; + void *control_page; /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); - /* Calculate the offsets */ - page_list = image->head; - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; - - /* Set the low half of the page table to my identity mapped - * page table for kexec. Leave the high half pointing at the - * kernel pages. Don't bother to flush the global pages - * as that will happen when I fully switch to my identity mapped - * page table anyway. - */ - memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); - __flush_tlb(); - - - /* The segment registers are funny things, they are - * automatically loaded from a table, in memory wherever you - * set them to a specific selector, but this table is never - * accessed again unless you set the segment to a different selector. - * - * The more common model are caches where the behide - * the scenes work is done, but is also dropped at arbitrary - * times. + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[PA_PGD] = __pa(kexec_pgd); + page_list[VA_PGD] = (unsigned long)kexec_pgd; + page_list[PA_PUD_0] = __pa(kexec_pud0); + page_list[VA_PUD_0] = (unsigned long)kexec_pud0; + page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; + page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; + page_list[PA_PUD_1] = __pa(kexec_pud1); + page_list[VA_PUD_1] = (unsigned long)kexec_pud1; + page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); + + /* The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. * * I take advantage of this here by force loading the * segments, before I zap the gdt with an invalid value. @@ -225,7 +224,36 @@ NORET_TYPE void machine_kexec(struct kimage *image) */ set_gdt(phys_to_virt(0),0); set_idt(phys_to_virt(0),0); + /* now call it */ - rnk = (relocate_new_kernel_t) control_code_buffer; - (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start); +} + +/* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ +static int __init setup_crashkernel(char *arg) +{ + unsigned long size, base; + char *p; + if (!arg) + return -EINVAL; + size = memparse(arg, &p); + if (arg == p) + return -EINVAL; + if (*p == '@') { + base = memparse(p+1, &p); + /* FIXME: Do I want a sanity check to validate the + * memory range? Yes you do, but it's too early for + * e820 -AK */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + return 0; } +early_param("crashkernel", setup_crashkernel); + diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index c69fc43cee7b..bbea88801d88 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -182,7 +182,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) goto out2; memset(&m, 0, sizeof(struct mce)); - m.cpu = safe_smp_processor_id(); + m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; @@ -274,6 +274,33 @@ void do_machine_check(struct pt_regs * regs, long error_code) atomic_dec(&mce_entry); } +#ifdef CONFIG_X86_MCE_INTEL +/*** + * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog + * @cpu: The CPU on which the event occured. + * @status: Event status information + * + * This function should be called by the thermal interrupt after the + * event has been processed and the decision was made to log the event + * further. + * + * The status parameter will be saved to the 'status' field of 'struct mce' + * and historically has been the register value of the + * MSR_IA32_THERMAL_STATUS (Intel) msr. + */ +void mce_log_therm_throt_event(unsigned int cpu, __u64 status) +{ + struct mce m; + + memset(&m, 0, sizeof(m)); + m.cpu = cpu; + m.bank = MCE_THERMAL_BANK; + m.status = status; + rdtscll(m.tsc); + mce_log(&m); +} +#endif /* CONFIG_X86_MCE_INTEL */ + /* * Periodic polling timer for "silent" machine check errors. */ @@ -562,7 +589,7 @@ static struct sysdev_class mce_sysclass = { set_kset_name("machinecheck"), }; -static DEFINE_PER_CPU(struct sys_device, device_mce); +DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -615,7 +642,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU -static __cpuinit void mce_remove_device(unsigned int cpu) +static void mce_remove_device(unsigned int cpu) { int i; @@ -626,7 +653,6 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); sysdev_unregister(&per_cpu(device_mce,cpu)); } -#endif /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int @@ -638,11 +664,9 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE: mce_create_device(cpu); break; -#ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: mce_remove_device(cpu); break; -#endif } return NOTIFY_OK; } @@ -650,6 +674,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) static struct notifier_block mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; +#endif static __init int mce_init_device(void) { @@ -664,7 +689,7 @@ static __init int mce_init_device(void) mce_create_device(i); } - register_cpu_notifier(&mce_cpu_notifier); + register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); return err; } diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index d13b241ad094..883fe747f64c 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -1,5 +1,5 @@ /* - * (c) 2005 Advanced Micro Devices, Inc. + * (c) 2005, 2006 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -8,9 +8,10 @@ * * Support : jacob.shin@amd.com * - * MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F. - * MC4_MISC0 exists per physical processor. + * April 2006 + * - added support for AMD Family 0x10 processors * + * All MC4_MISCi registers are shared between multi-cores */ #include <linux/cpu.h> @@ -29,32 +30,45 @@ #include <asm/percpu.h> #include <asm/idle.h> -#define PFX "mce_threshold: " -#define VERSION "version 1.00.9" -#define NR_BANKS 5 -#define THRESHOLD_MAX 0xFFF -#define INT_TYPE_APIC 0x00020000 -#define MASK_VALID_HI 0x80000000 -#define MASK_LVTOFF_HI 0x00F00000 -#define MASK_COUNT_EN_HI 0x00080000 -#define MASK_INT_TYPE_HI 0x00060000 -#define MASK_OVERFLOW_HI 0x00010000 +#define PFX "mce_threshold: " +#define VERSION "version 1.1.1" +#define NR_BANKS 6 +#define NR_BLOCKS 9 +#define THRESHOLD_MAX 0xFFF +#define INT_TYPE_APIC 0x00020000 +#define MASK_VALID_HI 0x80000000 +#define MASK_LVTOFF_HI 0x00F00000 +#define MASK_COUNT_EN_HI 0x00080000 +#define MASK_INT_TYPE_HI 0x00060000 +#define MASK_OVERFLOW_HI 0x00010000 #define MASK_ERR_COUNT_HI 0x00000FFF -#define MASK_OVERFLOW 0x0001000000000000L +#define MASK_BLKPTR_LO 0xFF000000 +#define MCG_XBLK_ADDR 0xC0000400 -struct threshold_bank { +struct threshold_block { + unsigned int block; + unsigned int bank; unsigned int cpu; - u8 bank; - u8 interrupt_enable; + u32 address; + u16 interrupt_enable; u16 threshold_limit; struct kobject kobj; + struct list_head miscj; }; -static struct threshold_bank threshold_defaults = { +/* defaults used early on boot */ +static struct threshold_block threshold_defaults = { .interrupt_enable = 0, .threshold_limit = THRESHOLD_MAX, }; +struct threshold_bank { + struct kobject kobj; + struct threshold_block *blocks; + cpumask_t cpus; +}; +static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); + #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { 0, 0, 0, 0, 1 @@ -68,12 +82,12 @@ static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ */ /* must be called with correct cpu affinity */ -static void threshold_restart_bank(struct threshold_bank *b, +static void threshold_restart_bank(struct threshold_block *b, int reset, u16 old_limit) { u32 mci_misc_hi, mci_misc_lo; - rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); + rdmsr(b->address, mci_misc_lo, mci_misc_hi); if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) reset = 1; /* limit cannot be lower than err count */ @@ -94,35 +108,57 @@ static void threshold_restart_bank(struct threshold_bank *b, (mci_misc_hi &= ~MASK_INT_TYPE_HI); mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); + wrmsr(b->address, mci_misc_lo, mci_misc_hi); } +/* cpu init entry point, called from mce.c with preempt off */ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) { - int bank; - u32 mci_misc_lo, mci_misc_hi; + unsigned int bank, block; unsigned int cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; for (bank = 0; bank < NR_BANKS; ++bank) { - rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi); + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) + address = MCG_XBLK_ADDR + + ((low & MASK_BLKPTR_LO) >> 21); + else + ++address; + + if (rdmsr_safe(address, &low, &high)) + continue; - /* !valid, !counter present, bios locked */ - if (!(mci_misc_hi & MASK_VALID_HI) || - !(mci_misc_hi & MASK_VALID_HI >> 1) || - (mci_misc_hi & MASK_VALID_HI >> 2)) - continue; + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } - per_cpu(bank_map, cpu) |= (1 << bank); + if (!(high & MASK_VALID_HI >> 1) || + (high & MASK_VALID_HI >> 2)) + continue; + if (!block) + per_cpu(bank_map, cpu) |= (1 << bank); #ifdef CONFIG_SMP - if (shared_bank[bank] && cpu_core_id[cpu]) - continue; + if (shared_bank[bank] && c->cpu_core_id) + break; #endif + high &= ~MASK_LVTOFF_HI; + high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; + wrmsr(address, low, high); - setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20); - threshold_defaults.cpu = cpu; - threshold_defaults.bank = bank; - threshold_restart_bank(&threshold_defaults, 0, 0); + setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, + THRESHOLD_APIC_VECTOR, + K8_APIC_EXT_INT_MSG_FIX, 0); + + threshold_defaults.address = address; + threshold_restart_bank(&threshold_defaults, 0, 0); + } } } @@ -137,8 +173,9 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) */ asmlinkage void mce_threshold_interrupt(void) { - int bank; + unsigned int bank, block; struct mce m; + u32 low = 0, high = 0, address = 0; ack_APIC_irq(); exit_idle(); @@ -150,15 +187,42 @@ asmlinkage void mce_threshold_interrupt(void) /* assume first bank caused it */ for (bank = 0; bank < NR_BANKS; ++bank) { - m.bank = MCE_THRESHOLD_BASE + bank; - rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc); + for (block = 0; block < NR_BLOCKS; ++block) { + if (block == 0) + address = MSR_IA32_MC0_MISC + bank * 4; + else if (block == 1) + address = MCG_XBLK_ADDR + + ((low & MASK_BLKPTR_LO) >> 21); + else + ++address; + + if (rdmsr_safe(address, &low, &high)) + continue; - if (m.misc & MASK_OVERFLOW) { - mce_log(&m); - goto out; + if (!(high & MASK_VALID_HI)) { + if (block) + continue; + else + break; + } + + if (!(high & MASK_VALID_HI >> 1) || + (high & MASK_VALID_HI >> 2)) + continue; + + if (high & MASK_OVERFLOW_HI) { + rdmsrl(address, m.misc); + rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, + m.status); + m.bank = K8_MCE_THRESHOLD_BASE + + bank * NR_BLOCKS + + block; + mce_log(&m); + goto out; + } } } - out: +out: irq_exit(); } @@ -166,20 +230,12 @@ asmlinkage void mce_threshold_interrupt(void) * Sysfs Interface */ -static struct sysdev_class threshold_sysclass = { - set_kset_name("threshold"), -}; - -static DEFINE_PER_CPU(struct sys_device, device_threshold); - struct threshold_attr { - struct attribute attr; - ssize_t(*show) (struct threshold_bank *, char *); - ssize_t(*store) (struct threshold_bank *, const char *, size_t count); + struct attribute attr; + ssize_t(*show) (struct threshold_block *, char *); + ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); - static cpumask_t affinity_set(unsigned int cpu) { cpumask_t oldmask = current->cpus_allowed; @@ -194,15 +250,15 @@ static void affinity_restore(cpumask_t oldmask) set_cpus_allowed(current, oldmask); } -#define SHOW_FIELDS(name) \ - static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \ - { \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ - } +#define SHOW_FIELDS(name) \ +static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ +{ \ + return sprintf(buf, "%lx\n", (unsigned long) b->name); \ +} SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) -static ssize_t store_interrupt_enable(struct threshold_bank *b, +static ssize_t store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) { char *end; @@ -219,7 +275,7 @@ static ssize_t store_interrupt_enable(struct threshold_bank *b, return end - buf; } -static ssize_t store_threshold_limit(struct threshold_bank *b, +static ssize_t store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { char *end; @@ -242,18 +298,18 @@ static ssize_t store_threshold_limit(struct threshold_bank *b, return end - buf; } -static ssize_t show_error_count(struct threshold_bank *b, char *buf) +static ssize_t show_error_count(struct threshold_block *b, char *buf) { u32 high, low; cpumask_t oldmask; oldmask = affinity_set(b->cpu); - rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */ + rdmsr(b->address, low, high); affinity_restore(oldmask); return sprintf(buf, "%x\n", (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); } -static ssize_t store_error_count(struct threshold_bank *b, +static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { cpumask_t oldmask; @@ -269,13 +325,13 @@ static ssize_t store_error_count(struct threshold_bank *b, .store = _store, \ }; -#define ATTR_FIELDS(name) \ - static struct threshold_attr name = \ +#define RW_ATTR(name) \ +static struct threshold_attr name = \ THRESHOLD_ATTR(name, 0644, show_## name, store_## name) -ATTR_FIELDS(interrupt_enable); -ATTR_FIELDS(threshold_limit); -ATTR_FIELDS(error_count); +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); +RW_ATTR(error_count); static struct attribute *default_attrs[] = { &interrupt_enable.attr, @@ -284,12 +340,12 @@ static struct attribute *default_attrs[] = { NULL }; -#define to_bank(k) container_of(k,struct threshold_bank,kobj) -#define to_attr(a) container_of(a,struct threshold_attr,attr) +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr) static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { - struct threshold_bank *b = to_bank(kobj); + struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; ret = a->show ? a->show(b, buf) : -EIO; @@ -299,7 +355,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) static ssize_t store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { - struct threshold_bank *b = to_bank(kobj); + struct threshold_block *b = to_block(kobj); struct threshold_attr *a = to_attr(attr); ssize_t ret; ret = a->store ? a->store(b, buf, count) : -EIO; @@ -316,69 +372,174 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; +static __cpuinit int allocate_threshold_blocks(unsigned int cpu, + unsigned int bank, + unsigned int block, + u32 address) +{ + int err; + u32 low, high; + struct threshold_block *b = NULL; + + if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) + return 0; + + if (rdmsr_safe(address, &low, &high)) + goto recurse; + + if (!(high & MASK_VALID_HI)) { + if (block) + goto recurse; + else + return 0; + } + + if (!(high & MASK_VALID_HI >> 1) || + (high & MASK_VALID_HI >> 2)) + goto recurse; + + b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); + if (!b) + return -ENOMEM; + memset(b, 0, sizeof(struct threshold_block)); + + b->block = block; + b->bank = bank; + b->cpu = cpu; + b->address = address; + b->interrupt_enable = 0; + b->threshold_limit = THRESHOLD_MAX; + + INIT_LIST_HEAD(&b->miscj); + + if (per_cpu(threshold_banks, cpu)[bank]->blocks) + list_add(&b->miscj, + &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); + else + per_cpu(threshold_banks, cpu)[bank]->blocks = b; + + kobject_set_name(&b->kobj, "misc%i", block); + b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; + b->kobj.ktype = &threshold_ktype; + err = kobject_register(&b->kobj); + if (err) + goto out_free; +recurse: + if (!block) { + address = (low & MASK_BLKPTR_LO) >> 21; + if (!address) + return 0; + address += MCG_XBLK_ADDR; + } else + ++address; + + err = allocate_threshold_blocks(cpu, bank, ++block, address); + if (err) + goto out_free; + + return err; + +out_free: + if (b) { + kobject_unregister(&b->kobj); + kfree(b); + } + return err; +} + /* symlinks sibling shared banks to first core. first core owns dir/files. */ -static __cpuinit int threshold_create_bank(unsigned int cpu, int bank) +static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { - int err = 0; + int i, err = 0; struct threshold_bank *b = NULL; + cpumask_t oldmask = CPU_MASK_NONE; + char name[32]; + + sprintf(name, "threshold_bank%i", bank); #ifdef CONFIG_SMP - if (cpu_core_id[cpu] && shared_bank[bank]) { /* symlink */ - char name[16]; - unsigned lcpu = first_cpu(cpu_core_map[cpu]); - if (cpu_core_id[lcpu]) - goto out; /* first core not up yet */ + if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) { /* symlink */ + i = first_cpu(cpu_core_map[cpu]); + + /* first core not up yet */ + if (cpu_data[i].cpu_core_id) + goto out; + + /* already linked */ + if (per_cpu(threshold_banks, cpu)[bank]) + goto out; + + b = per_cpu(threshold_banks, i)[bank]; - b = per_cpu(threshold_banks, lcpu)[bank]; if (!b) goto out; - sprintf(name, "bank%i", bank); - err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj, + + err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj, &b->kobj, name); if (err) goto out; + + b->cpus = cpu_core_map[cpu]; per_cpu(threshold_banks, cpu)[bank] = b; goto out; } #endif - b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL); + b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; goto out; } memset(b, 0, sizeof(struct threshold_bank)); - b->cpu = cpu; - b->bank = bank; - b->interrupt_enable = 0; - b->threshold_limit = THRESHOLD_MAX; - kobject_set_name(&b->kobj, "bank%i", bank); - b->kobj.parent = &per_cpu(device_threshold, cpu).kobj; - b->kobj.ktype = &threshold_ktype; - + kobject_set_name(&b->kobj, "threshold_bank%i", bank); + b->kobj.parent = &per_cpu(device_mce, cpu).kobj; +#ifndef CONFIG_SMP + b->cpus = CPU_MASK_ALL; +#else + b->cpus = cpu_core_map[cpu]; +#endif err = kobject_register(&b->kobj); - if (err) { - kfree(b); - goto out; - } + if (err) + goto out_free; + per_cpu(threshold_banks, cpu)[bank] = b; - out: + + oldmask = affinity_set(cpu); + err = allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); + affinity_restore(oldmask); + + if (err) + goto out_free; + + for_each_cpu_mask(i, b->cpus) { + if (i == cpu) + continue; + + err = sysfs_create_link(&per_cpu(device_mce, i).kobj, + &b->kobj, name); + if (err) + goto out; + + per_cpu(threshold_banks, i)[bank] = b; + } + + goto out; + +out_free: + per_cpu(threshold_banks, cpu)[bank] = NULL; + kfree(b); +out: return err; } /* create dir/files for all valid threshold banks */ static __cpuinit int threshold_create_device(unsigned int cpu) { - int bank; + unsigned int bank; int err = 0; - per_cpu(device_threshold, cpu).id = cpu; - per_cpu(device_threshold, cpu).cls = &threshold_sysclass; - err = sysdev_register(&per_cpu(device_threshold, cpu)); - if (err) - goto out; - for (bank = 0; bank < NR_BANKS; ++bank) { if (!(per_cpu(bank_map, cpu) & 1 << bank)) continue; @@ -386,7 +547,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu) if (err) goto out; } - out: +out: return err; } @@ -397,89 +558,76 @@ static __cpuinit int threshold_create_device(unsigned int cpu) * of shared sysfs dir/files, and rest of the cores will be symlinked to it. */ -/* cpu hotplug call removes all symlinks before first core dies */ -static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank) +static void deallocate_threshold_block(unsigned int cpu, + unsigned int bank) { + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank]; + + if (!head) + return; + + list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) { + kobject_unregister(&pos->kobj); + list_del(&pos->miscj); + kfree(pos); + } + + kfree(per_cpu(threshold_banks, cpu)[bank]->blocks); + per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; +} + +static void threshold_remove_bank(unsigned int cpu, int bank) +{ + int i = 0; struct threshold_bank *b; - char name[16]; + char name[32]; b = per_cpu(threshold_banks, cpu)[bank]; + if (!b) return; - if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) { - sprintf(name, "bank%i", bank); - sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name); + + if (!b->blocks) + goto free_out; + + sprintf(name, "threshold_bank%i", bank); + + /* sibling symlink */ + if (shared_bank[bank] && b->blocks->cpu != cpu) { + sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; - } else { - kobject_unregister(&b->kobj); - kfree(per_cpu(threshold_banks, cpu)[bank]); + return; } -} -static __cpuinit void threshold_remove_device(unsigned int cpu) -{ - int bank; - - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) + /* remove all sibling symlinks before unregistering */ + for_each_cpu_mask(i, b->cpus) { + if (i == cpu) continue; - threshold_remove_bank(cpu, bank); + + sysfs_remove_link(&per_cpu(device_mce, i).kobj, name); + per_cpu(threshold_banks, i)[bank] = NULL; } - sysdev_unregister(&per_cpu(device_threshold, cpu)); -} -/* link all existing siblings when first core comes up */ -static __cpuinit int threshold_create_symlinks(unsigned int cpu) -{ - int bank, err = 0; - unsigned int lcpu = 0; + deallocate_threshold_block(cpu, bank); - if (cpu_core_id[cpu]) - return 0; - for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { - if (lcpu == cpu) - continue; - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) - continue; - if (!shared_bank[bank]) - continue; - err = threshold_create_bank(lcpu, bank); - } - } - return err; +free_out: + kobject_unregister(&b->kobj); + kfree(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; } -/* remove all symlinks before first core dies. */ -static __cpuinit void threshold_remove_symlinks(unsigned int cpu) +static void threshold_remove_device(unsigned int cpu) { - int bank; - unsigned int lcpu = 0; - if (cpu_core_id[cpu]) - return; - for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { - if (lcpu == cpu) + unsigned int bank; + + for (bank = 0; bank < NR_BANKS; ++bank) { + if (!(per_cpu(bank_map, cpu) & 1 << bank)) continue; - for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) - continue; - if (!shared_bank[bank]) - continue; - threshold_remove_bank(lcpu, bank); - } + threshold_remove_bank(cpu, bank); } } -#else /* !CONFIG_HOTPLUG_CPU */ -static __cpuinit void threshold_create_symlinks(unsigned int cpu) -{ -} -static __cpuinit void threshold_remove_symlinks(unsigned int cpu) -{ -} -static void threshold_remove_device(unsigned int cpu) -{ -} -#endif /* get notified when a cpu comes on/off */ static int threshold_cpu_callback(struct notifier_block *nfb, @@ -494,13 +642,6 @@ static int threshold_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: threshold_create_device(cpu); - threshold_create_symlinks(cpu); - break; - case CPU_DOWN_PREPARE: - threshold_remove_symlinks(cpu); - break; - case CPU_DOWN_FAILED: - threshold_create_symlinks(cpu); break; case CPU_DEAD: threshold_remove_device(cpu); @@ -515,26 +656,20 @@ static int threshold_cpu_callback(struct notifier_block *nfb, static struct notifier_block threshold_cpu_notifier = { .notifier_call = threshold_cpu_callback, }; +#endif /* CONFIG_HOTPLUG_CPU */ static __init int threshold_init_device(void) { - int err; - int lcpu = 0; - - err = sysdev_class_register(&threshold_sysclass); - if (err) - goto out; + unsigned lcpu = 0; /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { - err = threshold_create_device(lcpu); + int err = threshold_create_device(lcpu); if (err) - goto out; + return err; } - register_cpu_notifier(&threshold_cpu_notifier); - - out: - return err; + register_hotcpu_notifier(&threshold_cpu_notifier); + return 0; } device_initcall(threshold_init_device); diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c index 8f533d2c40cb..6551505d8a2c 100644 --- a/arch/x86_64/kernel/mce_intel.c +++ b/arch/x86_64/kernel/mce_intel.c @@ -11,36 +11,21 @@ #include <asm/mce.h> #include <asm/hw_irq.h> #include <asm/idle.h> - -static DEFINE_PER_CPU(unsigned long, next_check); +#include <asm/therm_throt.h> asmlinkage void smp_thermal_interrupt(void) { - struct mce m; + __u64 msr_val; ack_APIC_irq(); exit_idle(); irq_enter(); - if (time_before(jiffies, __get_cpu_var(next_check))) - goto done; - - __get_cpu_var(next_check) = jiffies + HZ*300; - memset(&m, 0, sizeof(m)); - m.cpu = smp_processor_id(); - m.bank = MCE_THERMAL_BANK; - rdtscll(m.tsc); - rdmsrl(MSR_IA32_THERM_STATUS, m.status); - if (m.status & 0x1) { - printk(KERN_EMERG - "CPU%d: Temperature above threshold, cpu clock throttled\n", m.cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_EMERG "CPU%d: Temperature/speed normal\n", m.cpu); - } - mce_log(&m); -done: + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & 1)) + mce_log_therm_throt_event(smp_processor_id(), msr_val); + irq_exit(); } @@ -92,6 +77,9 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", cpu, tm2 ? "TM2" : "TM1"); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); return; } diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c index bac195c74bcc..9d0958ff547f 100644 --- a/arch/x86_64/kernel/module.c +++ b/arch/x86_64/kernel/module.c @@ -145,26 +145,38 @@ int apply_relocate(Elf_Shdr *sechdrs, return -ENOSYS; } -extern void apply_alternatives(void *start, void *end); - int module_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) + const Elf_Shdr *sechdrs, + struct module *me) { - const Elf_Shdr *s; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - /* look for .altinstructions to patch */ - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - void *seg; - if (strcmp(".altinstructions", secstrings + s->sh_name)) - continue; - seg = (void *)s->sh_addr; - apply_alternatives(seg, seg + s->sh_size); - } + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks= s; + } + + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } return 0; } void module_arch_cleanup(struct module *mod) { + alternatives_smp_module_del(mod); } diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 083da7e606b1..b8d53dfa9931 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -16,7 +16,6 @@ #include <linux/mm.h> #include <linux/init.h> #include <linux/delay.h> -#include <linux/config.h> #include <linux/bootmem.h> #include <linux/smp_lock.h> #include <linux/kernel_stat.h> @@ -42,8 +41,7 @@ int acpi_found_madt; * Various Linux-internal data structures created from the * MP-table. */ -unsigned char apic_version [MAX_APICS]; -unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; static int mp_current_pci_id = 0; @@ -57,7 +55,6 @@ struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; int mp_irq_entries; int nr_ioapics; -int pic_mode; unsigned long mp_lapic_addr = 0; @@ -72,19 +69,6 @@ unsigned disabled_cpus __initdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; -/* ACPI MADT entry parsing functions */ -#ifdef CONFIG_ACPI -extern struct acpi_boot_flags acpi_boot; -#ifdef CONFIG_X86_LOCAL_APIC -extern int acpi_parse_lapic (acpi_table_entry_header *header); -extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); -extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); -#endif /*CONFIG_X86_LOCAL_APIC*/ -#ifdef CONFIG_X86_IO_APIC -extern int acpi_parse_ioapic (acpi_table_entry_header *header); -#endif /*CONFIG_X86_IO_APIC*/ -#endif /*CONFIG_ACPI*/ - u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; @@ -109,24 +93,20 @@ static int __init mpf_checksum(unsigned char *mp, int len) static void __cpuinit MP_processor_info (struct mpc_config_processor *m) { int cpu; - unsigned char ver; cpumask_t tmp_map; + char *bootup_cpu = ""; if (!(m->mpc_cpuflag & CPU_ENABLED)) { disabled_cpus++; return; } - - printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, - (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, - m->mpc_apicver); - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - Dprintk(" Bootup CPU\n"); + bootup_cpu = " (Bootup-CPU)"; boot_cpu_id = m->mpc_apicid; } + + printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu); + if (num_processors >= NR_CPUS) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." " Processor ignored.\n", NR_CPUS); @@ -137,24 +117,7 @@ static void __cpuinit MP_processor_info (struct mpc_config_processor *m) cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); -#if MAX_APICS < 255 - if ((int)m->mpc_apicid > MAX_APICS) { - printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->mpc_apicid, MAX_APICS); - return; - } -#endif - ver = m->mpc_apicver; - physid_set(m->mpc_apicid, phys_cpu_present_map); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); - ver = 0x10; - } - apic_version[m->mpc_apicid] = ver; if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* * bios_cpu_apicid is required to have processors listed @@ -179,37 +142,42 @@ static void __init MP_bus_info (struct mpc_config_bus *m) Dprintk("Bus #%d is %s\n", m->mpc_busid, str); if (strncmp(str, "ISA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; - } else if (strncmp(str, "EISA", 4) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + set_bit(m->mpc_busid, mp_bus_not_pci); } else if (strncmp(str, "PCI", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + clear_bit(m->mpc_busid, mp_bus_not_pci); mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; mp_current_pci_id++; - } else if (strncmp(str, "MCA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; } else { printk(KERN_ERR "Unknown bustype %s\n", str); } } +static int bad_ioapic(unsigned long address) +{ + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in table, skipping!\n"); + return 1; + } + return 0; +} + static void __init MP_ioapic_info (struct mpc_config_ioapic *m) { if (!(m->mpc_flags & MPC_APIC_USABLE)) return; - printk("I/O APIC #%d Version %d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", - MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); - } - if (!m->mpc_apicaddr) { - printk(KERN_ERR "WARNING: bogus zero I/O APIC address" - " found in MP table, skipping!\n"); + printk("I/O APIC #%d at 0x%X.\n", + m->mpc_apicid, m->mpc_apicaddr); + + if (bad_ioapic(m->mpc_apicaddr)) return; - } + mp_ioapics[nr_ioapics] = *m; nr_ioapics++; } @@ -233,19 +201,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. - */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); } /* @@ -259,7 +214,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) unsigned char *mpt=((unsigned char *)mpc)+count; if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk("SMP mptable: bad signature [%c%c%c%c]!\n", + printk("MPTABLE: bad signature [%c%c%c%c]!\n", mpc->mpc_signature[0], mpc->mpc_signature[1], mpc->mpc_signature[2], @@ -267,31 +222,31 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) return 0; } if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk("SMP mptable: checksum error!\n"); + printk("MPTABLE: checksum error!\n"); return 0; } if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n", mpc->mpc_spec); return 0; } if (!mpc->mpc_lapic) { - printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + printk(KERN_ERR "MPTABLE: null local APIC address!\n"); return 0; } memcpy(str,mpc->mpc_oem,8); - str[8]=0; - printk(KERN_INFO "OEM ID: %s ",str); + str[8] = 0; + printk(KERN_INFO "MPTABLE: OEM ID: %s ",str); memcpy(str,mpc->mpc_productid,12); - str[12]=0; - printk("Product ID: %s ",str); + str[12] = 0; + printk("MPTABLE: Product ID: %s ",str); - printk("APIC at: 0x%X\n",mpc->mpc_lapic); + printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic); /* save the local APIC address, it might be non-default */ if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; + mp_lapic_addr = mpc->mpc_lapic; /* * Now process the configuration blocks. @@ -303,7 +258,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_processor *m= (struct mpc_config_processor *)mpt; if (!acpi_lapic) - MP_processor_info(m); + MP_processor_info(m); mpt += sizeof(*m); count += sizeof(*m); break; @@ -322,8 +277,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_ioapic *m= (struct mpc_config_ioapic *)mpt; MP_ioapic_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } case MP_INTSRC: @@ -332,8 +287,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) (struct mpc_config_intsrc *)mpt; MP_intsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } case MP_LINTSRC: @@ -341,15 +296,15 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) struct mpc_config_lintsrc *m= (struct mpc_config_lintsrc *)mpt; MP_lintsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); + mpt += sizeof(*m); + count += sizeof(*m); break; } } } clustered_apic_check(); if (!num_processors) - printk(KERN_ERR "SMP mptable: no processors registered!\n"); + printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; } @@ -445,13 +400,10 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) * 2 CPUs, numbered 0 & 1. */ processor.mpc_type = MP_PROCESSOR; - /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_apicver = 0; processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | - boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; for (i = 0; i < 2; i++) { @@ -470,14 +422,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) case 5: memcpy(bus.mpc_bustype, "ISA ", 6); break; - case 2: - case 6: - case 3: - memcpy(bus.mpc_bustype, "EISA ", 6); - break; - case 4: - case 7: - memcpy(bus.mpc_bustype, "MCA ", 6); } MP_bus_info(&bus); if (mpc_default_type > 4) { @@ -488,7 +432,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) ioapic.mpc_type = MP_IOAPIC; ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_apicver = 0; ioapic.mpc_flags = MPC_APIC_USABLE; ioapic.mpc_apicaddr = 0xFEC00000; MP_ioapic_info(&ioapic); @@ -531,13 +475,6 @@ void __init get_smp_config (void) printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - if (mpf->mpf_feature2 & (1<<7)) { - printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); - pic_mode = 1; - } else { - printk(KERN_INFO " Virtual Wire compatibility mode.\n"); - pic_mode = 0; - } /* * Now see if we need to read further. @@ -617,7 +554,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) return 0; } -void __init find_intel_smp (void) +void __init find_smp_config(void) { unsigned int address; @@ -634,9 +571,7 @@ void __init find_intel_smp (void) smp_scan_config(0xF0000,0x10000)) return; /* - * If it is an SMP machine we should know now, unless the - * configuration is in an EISA/MCA bus machine with an - * extended bios data area. + * If it is an SMP machine we should know now. * * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E, calculate and scan it here. @@ -657,69 +592,41 @@ void __init find_intel_smp (void) printk(KERN_INFO "No mptable found.\n"); } -/* - * - Intel MP Configuration Table - */ -void __init find_smp_config (void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - find_intel_smp(); -#endif -} - - /* -------------------------------------------------------------------------- ACPI-based MP Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI -void __init mp_register_lapic_address ( - u64 address) +void __init mp_register_lapic_address(u64 address) { mp_lapic_addr = (unsigned long) address; - set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); - if (boot_cpu_id == -1U) boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } - -void __cpuinit mp_register_lapic ( - u8 id, - u8 enabled) +void __cpuinit mp_register_lapic (u8 id, u8 enabled) { struct mpc_config_processor processor; int boot_cpu = 0; - if (id >= MAX_APICS) { - printk(KERN_WARNING "Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - if (id == boot_cpu_physical_apicid) + if (id == boot_cpu_id) boot_cpu = 1; processor.mpc_type = MP_PROCESSOR; processor.mpc_apicid = id; - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_apicver = 0; processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_cpufeature = 0; + processor.mpc_featureflag = 0; processor.mpc_reserved[0] = 0; processor.mpc_reserved[1] = 0; MP_processor_info(&processor); } -#ifdef CONFIG_X86_IO_APIC - #define MP_ISA_BUS 0 #define MP_MAX_IOAPIC_PIN 127 @@ -730,11 +637,9 @@ static struct mp_ioapic_routing { u32 pin_programmed[4]; } mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic ( - int gsi) +static int mp_find_ioapic(int gsi) { - int i = 0; + int i = 0; /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { @@ -744,28 +649,15 @@ static int mp_find_ioapic ( } printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - return -1; } - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { - int idx = 0; + int idx = 0; - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in MADT table, skipping!\n"); + if (bad_ioapic(address)) return; - } idx = nr_ioapics++; @@ -775,7 +667,7 @@ void __init mp_register_ioapic ( set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); mp_ioapics[idx].mpc_apicid = id; - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + mp_ioapics[idx].mpc_apicver = 0; /* * Build basic IRQ lookup table to facilitate gsi->io_apic lookups @@ -786,21 +678,15 @@ void __init mp_register_ioapic ( mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, " "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); - - return; } - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { struct mpc_config_intsrc intsrc; int ioapic = -1; @@ -838,22 +724,18 @@ void __init mp_override_legacy_irq ( mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); - - return; } - -void __init mp_config_acpi_legacy_irqs (void) +void __init mp_config_acpi_legacy_irqs(void) { struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; + int i = 0; + int ioapic = -1; /* * Fabricate the legacy ISA bus (bus #31). */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + set_bit(MP_ISA_BUS, mp_bus_not_pci); /* * Locate the IOAPIC that manages the ISA IRQs (0-15). @@ -906,24 +788,22 @@ void __init mp_config_acpi_legacy_irqs (void) if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); } - - return; } #define MAX_GSI_NUM 4096 int mp_register_gsi(u32 gsi, int triggering, int polarity) { - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = 16; + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; /* * Mapping between Global System Interrupts, which * represent all possible interrupts, to the IRQs * assigned to actual devices. */ - static int gsi_to_irq[MAX_GSI_NUM]; + static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -997,6 +877,4 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) polarity == ACPI_ACTIVE_HIGH ? 0 : 1); return gsi; } - -#endif /*CONFIG_X86_IO_APIC*/ #endif /*CONFIG_ACPI*/ diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 4e6357fe0ec3..7af9cb3e2d99 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -12,14 +12,9 @@ * Mikael Pettersson : PM converted to driver model. Disable/enable API. */ -#include <linux/config.h> #include <linux/mm.h> #include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> #include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> #include <linux/module.h> #include <linux/sysdev.h> #include <linux/nmi.h> @@ -27,77 +22,148 @@ #include <linux/kprobes.h> #include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> #include <asm/nmi.h> -#include <asm/msr.h> #include <asm/proto.h> #include <asm/kdebug.h> -#include <asm/local.h> #include <asm/mce.h> +#include <asm/intel_arch_perfmon.h> -/* - * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: - * - it may be reserved by some other driver, or not - * - when not reserved by some other driver, it may be used for - * the NMI watchdog, or not - * - * This is maintained separately from nmi_active because the NMI - * watchdog may also be driven from the I/O APIC timer. +int unknown_nmi_panic; +int nmi_watchdog_enabled; +int panic_on_unrecovered_nmi; + +/* perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little + */ +static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner); +static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]); + +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); -static unsigned int lapic_nmi_owner; -#define LAPIC_NMI_WATCHDOG (1<<0) -#define LAPIC_NMI_RESERVED (1<<1) +#define NMI_MAX_COUNTER_BITS 66 /* nmi_active: - * +1: the lapic NMI watchdog is active, but can be disabled - * 0: the lapic NMI watchdog has not been set up, and cannot + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot * be enabled - * -1: the lapic NMI watchdog is disabled, but can be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled */ -int nmi_active; /* oprofile uses this */ +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ -static unsigned int nmi_p4_cccr_val; -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ +struct nmi_watchdog_ctlblk { + int enabled; + u64 check_bit; + unsigned int cccr_msr; + unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ + unsigned int evntsel_msr; /* the MSR to select the events to handle */ +}; +static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING +/* local prototypes */ +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); -#define MSR_P4_MISC_ENABLE 0x1A0 -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) -#define MSR_P4_PERFCTR0 0x300 -#define MSR_P4_CCCR0 0x360 -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ -#define MSR_P4_IQ_COUNTER0 0x30C -#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) -#define P4_NMI_IQ_CCCR0 \ - (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ - P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the performance counter register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_PERFCTR0); + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_PERFCTR0); + else + return (msr - MSR_P4_BPU_PERFCTR0); + } + return 0; +} + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the event selection register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_EVNTSEL0); + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + else + return (msr - MSR_P4_BSU_ESCR0); + } + return 0; +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +/* checks the an msr for availability */ +int avail_to_resrv_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner))) + return 1; + return 0; +} + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner)); +} + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner))) + return 1; + return 0; +} + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)); +} static __cpuinit inline int nmi_known_cpu(void) { @@ -105,13 +171,16 @@ static __cpuinit inline int nmi_known_cpu(void) case X86_VENDOR_AMD: return boot_cpu_data.x86 == 15; case X86_VENDOR_INTEL: - return boot_cpu_data.x86 == 15; + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return 1; + else + return (boot_cpu_data.x86 == 15); } return 0; } /* Run after command line and cpu_init init, but before all other checks */ -void __cpuinit nmi_watchdog_default(void) +void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; @@ -129,7 +198,7 @@ void __cpuinit nmi_watchdog_default(void) static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; - local_irq_enable(); + local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -147,6 +216,12 @@ int __init check_nmi_watchdog (void) int *counts; int cpu; + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) + return 0; + + if (!atomic_read(&nmi_active)) + return 0; + counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); if (!counts) return -1; @@ -164,26 +239,43 @@ int __init check_nmi_watchdog (void) mdelay((10*1000)/nmi_hz); // wait 10 ticks for_each_online_cpu(cpu) { + if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + continue; if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { - endflag = 1; printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, counts[cpu], cpu_pda(cpu)->__nmi_count); - nmi_active = 0; - lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; - nmi_perfctr_msr = 0; - kfree(counts); - return -1; + per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + atomic_dec(&nmi_active); } } + if (!atomic_read(&nmi_active)) { + kfree(counts); + atomic_set(&nmi_active, -1); + return -1; + } endflag = 1; printk("OK.\n"); /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) + if (nmi_watchdog == NMI_LOCAL_APIC) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + nmi_hz = 1; + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && + ((u64)cpu_khz * 1000) > 0x7fffffffULL) { + nmi_hz = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; + } + } kfree(counts); return 0; @@ -203,8 +295,11 @@ int __init setup_nmi_watchdog(char *str) get_option(&str, &nmi); - if (nmi >= NMI_INVALID) + if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) return 0; + + if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0)) + return 0; /* no lapic support */ nmi_watchdog = nmi; return 1; } @@ -213,77 +308,52 @@ __setup("nmi_watchdog=", setup_nmi_watchdog); static void disable_lapic_nmi_watchdog(void) { - if (nmi_active <= 0) + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + if (atomic_read(&nmi_active) <= 0) return; - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - wrmsr(MSR_K7_EVNTSEL0, 0, 0); - break; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 15) { - wrmsr(MSR_P4_IQ_CCCR0, 0, 0); - wrmsr(MSR_P4_CRU_ESCR0, 0, 0); - } - break; - } - nmi_active = -1; - /* tell do_nmi() and others that we're not active any more */ - nmi_watchdog = 0; -} -static void enable_lapic_nmi_watchdog(void) -{ - if (nmi_active < 0) { - nmi_watchdog = NMI_LOCAL_APIC; - touch_nmi_watchdog(); - setup_apic_nmi_watchdog(); - } + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + + BUG_ON(atomic_read(&nmi_active) != 0); } -int reserve_lapic_nmi(void) +static void enable_lapic_nmi_watchdog(void) { - unsigned int old_owner; + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - spin_lock(&lapic_nmi_owner_lock); - old_owner = lapic_nmi_owner; - lapic_nmi_owner |= LAPIC_NMI_RESERVED; - spin_unlock(&lapic_nmi_owner_lock); - if (old_owner & LAPIC_NMI_RESERVED) - return -EBUSY; - if (old_owner & LAPIC_NMI_WATCHDOG) - disable_lapic_nmi_watchdog(); - return 0; -} + /* are we already enabled */ + if (atomic_read(&nmi_active) != 0) + return; -void release_lapic_nmi(void) -{ - unsigned int new_owner; + /* are we lapic aware */ + if (nmi_known_cpu() <= 0) + return; - spin_lock(&lapic_nmi_owner_lock); - new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; - lapic_nmi_owner = new_owner; - spin_unlock(&lapic_nmi_owner_lock); - if (new_owner & LAPIC_NMI_WATCHDOG) - enable_lapic_nmi_watchdog(); + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + touch_nmi_watchdog(); } void disable_timer_nmi_watchdog(void) { - if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) <= 0) return; disable_irq(0); - unset_nmi_callback(); - nmi_active = -1; - nmi_watchdog = NMI_NONE; + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + + BUG_ON(atomic_read(&nmi_active) != 0); } void enable_timer_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_IO_APIC; + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) == 0) { touch_nmi_watchdog(); - nmi_active = 1; + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); enable_irq(0); } } @@ -294,15 +364,20 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { - nmi_pm_active = nmi_active; - disable_lapic_nmi_watchdog(); + /* only CPU0 goes here, other CPUs should be offline */ + nmi_pm_active = atomic_read(&nmi_active); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); return 0; } static int lapic_nmi_resume(struct sys_device *dev) { - if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } return 0; } @@ -321,7 +396,13 @@ static int __init init_lapic_nmi_sysfs(void) { int error; - if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) + /* should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if ( atomic_read(&nmi_active) < 0 ) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -339,106 +420,332 @@ late_initcall(init_lapic_nmi_sysfs); * Original code written by Keith Owens. */ -static void clear_msr_range(unsigned int base, unsigned int n) -{ - unsigned int i; +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ - for(i = 0; i < n; ++i) - wrmsr(base+i, 0, 0); -} +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING -static void setup_k7_watchdog(void) +static int setup_k7_watchdog(void) { - int i; + unsigned int perfctr_msr, evntsel_msr; unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - nmi_perfctr_msr = MSR_K7_PERFCTR0; + perfctr_msr = MSR_K7_PERFCTR0; + evntsel_msr = MSR_K7_EVNTSEL0; + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; - for(i = 0; i < 4; ++i) { - /* Simulator may not support it */ - if (checking_wrmsrl(MSR_K7_EVNTSEL0+i, 0UL)) { - nmi_perfctr_msr = 0; - return; - } - wrmsrl(MSR_K7_PERFCTR0+i, 0UL); - } + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + /* Simulator may not support it */ + if (checking_wrmsrl(evntsel_msr, 0UL)) + goto fail2; + wrmsrl(perfctr_msr, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS | K7_EVNTSEL_USR | K7_NMI_EVENT; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - wrmsrl(MSR_K7_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL<<63; + return 1; +fail2: + release_evntsel_nmi(evntsel_msr); +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; } +static void stop_k7_watchdog(void) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +#define P4_CCCR_OVF (1<<31) +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. [IA32-Vol3, Section 14.9.9] */ static int setup_p4_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr, cccr_msr; + unsigned int evntsel, cccr_val; unsigned int misc_enable, dummy; + unsigned int ht_num; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); + rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) return 0; - nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; - nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; #ifdef CONFIG_SMP - if (smp_num_siblings == 2) - nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; + /* detect which hyperthread we are on */ + if (smp_num_siblings == 2) { + unsigned int ebx, apicid; + + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; + } else #endif - - if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) - clear_msr_range(0x3F1, 2); - /* MSR 0x3F0 seems to have a default value of 0xFC00, but current - docs doesn't fully define it, so leave it alone for now. */ - if (boot_cpu_data.x86_model >= 0x3) { - /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ - clear_msr_range(0x3A0, 26); - clear_msr_range(0x3BC, 3); + ht_num = 0; + + /* performance counters are shared resources + * assign each hyperthread its own set + * (re-use the ESCR0 register, seems safe + * and keeps the cccr_val the same) + */ + if (!ht_num) { + /* logical cpu 0 */ + perfctr_msr = MSR_P4_IQ_PERFCTR0; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR0; + cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); } else { - clear_msr_range(0x3A0, 31); + /* logical cpu 1 */ + perfctr_msr = MSR_P4_IQ_PERFCTR1; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR1; + cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); } - clear_msr_range(0x3C0, 6); - clear_msr_range(0x3C8, 6); - clear_msr_range(0x3E0, 2); - clear_msr_range(MSR_P4_CCCR0, 18); - clear_msr_range(MSR_P4_PERFCTR0, 18); - - wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); - wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz * 1000UL / nmi_hz)); - wrmsrl(MSR_P4_IQ_COUNTER0, -((u64)cpu_khz * 1000 / nmi_hz)); + + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + evntsel = P4_ESCR_EVENT_SELECT(0x3F) + | P4_ESCR_OS + | P4_ESCR_USR; + + cccr_val |= P4_CCCR_THRESHOLD(15) + | P4_CCCR_COMPLEMENT + | P4_CCCR_COMPARE + | P4_CCCR_REQUIRED; + + wrmsr(evntsel_msr, evntsel, 0); + wrmsr(cccr_msr, cccr_val, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); apic_write(APIC_LVTPC, APIC_DM_NMI); - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = cccr_msr; + wd->check_bit = 1ULL<<39; return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; } -void setup_apic_nmi_watchdog(void) +static void stop_p4_watchdog(void) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 15) - return; - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - setup_k7_watchdog(); - break; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 != 15) - return; - if (!setup_p4_watchdog()) + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->cccr_msr, 0, 0); + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + goto fail; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL << (eax.split.bit_width - 1); + return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; +} + +static void stop_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return; + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +void setup_apic_nmi_watchdog(void *unused) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (wd->enabled == 1) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) + return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) + return; + if (!setup_k7_watchdog()) + return; + break; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + break; + } + if (!setup_p4_watchdog()) + return; + break; + default: return; - break; + } + } + wd->enabled = 1; + atomic_inc(&nmi_active); +} + +void stop_apic_nmi_watchdog(void *unused) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; - default: + if (wd->enabled == 0) return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) + return; + stop_k7_watchdog(); + break; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + stop_intel_arch_watchdog(); + break; + } + stop_p4_watchdog(); + break; + default: + return; + } } - lapic_nmi_owner = LAPIC_NMI_WATCHDOG; - nmi_active = 1; + wd->enabled = 0; + atomic_dec(&nmi_active); } /* @@ -471,83 +778,108 @@ void touch_nmi_watchdog (void) touch_softlockup_watchdog(); } -void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + u64 dummy; + int rc=0; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + rc = 1; + touched = 1; + } sum = read_pda(apic_timer_irqs); if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; } + #ifdef CONFIG_X86_MCE /* Could check oops_in_progress here too, but it's safer not too */ if (atomic_read(&mce_entry) > 0) touched = 1; #endif + /* if the apic timer isn't firing, this cpu isn't doing much */ if (!touched && __get_cpu_var(last_irq_sum) == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - local_set(&__get_cpu_var(alert_counter), 0); - return; - } - die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs); - } + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) + die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, + panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); } - if (nmi_perfctr_msr) { - if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); - apic_write(APIC_LVTPC, APIC_DM_NMI); - } - wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + + /* see if the nmi watchdog went off */ + if (wd->enabled) { + if (nmi_watchdog == NMI_LOCAL_APIC) { + rdmsrl(wd->perfctr_msr, dummy); + if (dummy & wd->check_bit){ + /* this wasn't a watchdog timer interrupt */ + goto done; + } + + /* only Intel uses the cccr msr */ + if (wd->cccr_msr != 0) { + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + rdmsrl(wd->cccr_msr, dummy); + dummy &= ~P4_CCCR_OVF; + wrmsrl(wd->cccr_msr, dummy); + apic_write(APIC_LVTPC, APIC_DM_NMI); + } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + /* + * ArchPerfom/Core Duo needs to re-unmask + * the apic vector + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + } + /* start the cycle over again */ + wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); + rc = 1; + } else if (nmi_watchdog == NMI_IO_APIC) { + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + } else + printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); } +done: + return rc; } -static __kprobes int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { - int cpu = safe_smp_processor_id(); - nmi_enter(); add_pda(__nmi_count,1); - if (!rcu_dereference(nmi_callback)(regs, cpu)) - default_do_nmi(regs); + default_do_nmi(regs); nmi_exit(); } -void set_nmi_callback(nmi_callback_t callback) +int do_nmi_callback(struct pt_regs * regs, int cpu) { - vmalloc_sync_all(); - rcu_assign_pointer(nmi_callback, callback); -} - -void unset_nmi_callback(void) -{ - nmi_callback = dummy_nmi_callback; +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; } #ifdef CONFIG_SYSCTL @@ -557,36 +889,42 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) unsigned char reason = get_nmi_reason(); char buf[64]; - if (!(reason & 0xc0)) { - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf,regs); - } + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf, regs, 1); /* Always panic here */ return 0; } /* - * proc handler for /proc/sys/kernel/unknown_nmi_panic + * proc handler for /proc/sys/kernel/nmi */ -int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file, +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { int old_state; - old_state = unknown_nmi_panic; + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!unknown_nmi_panic) + if (!!old_state == !!nmi_watchdog_enabled) return 0; - if (unknown_nmi_panic) { - if (reserve_lapic_nmi() < 0) { - unknown_nmi_panic = 0; - return -EBUSY; - } else { - set_nmi_callback(unknown_nmi_panic_callback); - } + if (atomic_read(&nmi_active) < 0) { + printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EIO; + } + + /* if nmi_watchdog is not set yet, then set it */ + nmi_watchdog_default(); + + if (nmi_watchdog == NMI_LOCAL_APIC) { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); } else { - release_lapic_nmi(); - unset_nmi_callback(); + printk( KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; } return 0; } @@ -595,8 +933,12 @@ int proc_unknown_nmi_panic(struct ctl_table *table, int write, struct file *file EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(reserve_lapic_nmi); -EXPORT_SYMBOL(release_lapic_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); +EXPORT_SYMBOL(reserve_perfctr_nmi); +EXPORT_SYMBOL(release_perfctr_nmi); +EXPORT_SYMBOL(reserve_evntsel_nmi); +EXPORT_SYMBOL(release_evntsel_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); EXPORT_SYMBOL(enable_timer_nmi_watchdog); EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c new file mode 100644 index 000000000000..cfb09b07ae99 --- /dev/null +++ b/arch/x86_64/kernel/pci-calgary.c @@ -0,0 +1,1069 @@ +/* + * Derived from arch/powerpc/kernel/iommu.c + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Jon Mason <jdmason@us.ibm.com> + * Author: Muli Ben-Yehuda <muli@il.ibm.com> + + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/dma-mapping.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/pci_ids.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <asm/proto.h> +#include <asm/calgary.h> +#include <asm/tce.h> +#include <asm/pci-direct.h> +#include <asm/system.h> +#include <asm/dma.h> + +#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 +#define PCI_VENDOR_DEVICE_ID_CALGARY \ + (PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16) + +/* we need these for register space address calculation */ +#define START_ADDRESS 0xfe000000 +#define CHASSIS_BASE 0 +#define ONE_BASED_CHASSIS_NUM 1 + +/* register offsets inside the host bridge space */ +#define PHB_CSR_OFFSET 0x0110 +#define PHB_PLSSR_OFFSET 0x0120 +#define PHB_CONFIG_RW_OFFSET 0x0160 +#define PHB_IOBASE_BAR_LOW 0x0170 +#define PHB_IOBASE_BAR_HIGH 0x0180 +#define PHB_MEM_1_LOW 0x0190 +#define PHB_MEM_1_HIGH 0x01A0 +#define PHB_IO_ADDR_SIZE 0x01B0 +#define PHB_MEM_1_SIZE 0x01C0 +#define PHB_MEM_ST_OFFSET 0x01D0 +#define PHB_AER_OFFSET 0x0200 +#define PHB_CONFIG_0_HIGH 0x0220 +#define PHB_CONFIG_0_LOW 0x0230 +#define PHB_CONFIG_0_END 0x0240 +#define PHB_MEM_2_LOW 0x02B0 +#define PHB_MEM_2_HIGH 0x02C0 +#define PHB_MEM_2_SIZE_HIGH 0x02D0 +#define PHB_MEM_2_SIZE_LOW 0x02E0 +#define PHB_DOSHOLE_OFFSET 0x08E0 + +/* PHB_CONFIG_RW */ +#define PHB_TCE_ENABLE 0x20000000 +#define PHB_SLOT_DISABLE 0x1C000000 +#define PHB_DAC_DISABLE 0x01000000 +#define PHB_MEM2_ENABLE 0x00400000 +#define PHB_MCSR_ENABLE 0x00100000 +/* TAR (Table Address Register) */ +#define TAR_SW_BITS 0x0000ffffffff800fUL +#define TAR_VALID 0x0000000000000008UL +/* CSR (Channel/DMA Status Register) */ +#define CSR_AGENT_MASK 0xffe0ffff + +#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ +#define MAX_NUM_CHASSIS 8 /* max number of chassis */ +/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ +#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) +#define PHBS_PER_CALGARY 4 + +/* register offsets in Calgary's internal register space */ +static const unsigned long tar_offsets[] = { + 0x0580 /* TAR0 */, + 0x0588 /* TAR1 */, + 0x0590 /* TAR2 */, + 0x0598 /* TAR3 */ +}; + +static const unsigned long split_queue_offsets[] = { + 0x4870 /* SPLIT QUEUE 0 */, + 0x5870 /* SPLIT QUEUE 1 */, + 0x6870 /* SPLIT QUEUE 2 */, + 0x7870 /* SPLIT QUEUE 3 */ +}; + +static const unsigned long phb_offsets[] = { + 0x8000 /* PHB0 */, + 0x9000 /* PHB1 */, + 0xA000 /* PHB2 */, + 0xB000 /* PHB3 */ +}; + +unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; +static int translate_empty_slots __read_mostly = 0; +static int calgary_detected __read_mostly = 0; + +struct calgary_bus_info { + void *tce_space; + unsigned char translation_disabled; + signed char phbid; +}; + +static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; + +static void tce_cache_blast(struct iommu_table *tbl); + +/* enable this to stress test the chip's TCE cache */ +#ifdef CONFIG_IOMMU_DEBUG +int debugging __read_mostly = 1; + +static inline unsigned long verify_bit_range(unsigned long* bitmap, + int expected, unsigned long start, unsigned long end) +{ + unsigned long idx = start; + + BUG_ON(start >= end); + + while (idx < end) { + if (!!test_bit(idx, bitmap) != expected) + return idx; + ++idx; + } + + /* all bits have the expected value */ + return ~0UL; +} +#else /* debugging is disabled */ +int debugging __read_mostly = 0; + +static inline unsigned long verify_bit_range(unsigned long* bitmap, + int expected, unsigned long start, unsigned long end) +{ + return ~0UL; +} +#endif /* CONFIG_IOMMU_DEBUG */ + +static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) +{ + unsigned int npages; + + npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); + npages >>= PAGE_SHIFT; + + return npages; +} + +static inline int translate_phb(struct pci_dev* dev) +{ + int disabled = bus_info[dev->bus->number].translation_disabled; + return !disabled; +} + +static void iommu_range_reserve(struct iommu_table *tbl, + unsigned long start_addr, unsigned int npages) +{ + unsigned long index; + unsigned long end; + unsigned long badbit; + + index = start_addr >> PAGE_SHIFT; + + /* bail out if we're asked to reserve a region we don't cover */ + if (index >= tbl->it_size) + return; + + end = index + npages; + if (end > tbl->it_size) /* don't go off the table */ + end = tbl->it_size; + + badbit = verify_bit_range(tbl->it_map, 0, index, end); + if (badbit != ~0UL) { + if (printk_ratelimit()) + printk(KERN_ERR "Calgary: entry already allocated at " + "0x%lx tbl %p dma 0x%lx npages %u\n", + badbit, tbl, start_addr, npages); + } + + set_bit_string(tbl->it_map, index, npages); +} + +static unsigned long iommu_range_alloc(struct iommu_table *tbl, + unsigned int npages) +{ + unsigned long offset; + + BUG_ON(npages == 0); + + offset = find_next_zero_string(tbl->it_map, tbl->it_hint, + tbl->it_size, npages); + if (offset == ~0UL) { + tce_cache_blast(tbl); + offset = find_next_zero_string(tbl->it_map, 0, + tbl->it_size, npages); + if (offset == ~0UL) { + printk(KERN_WARNING "Calgary: IOMMU full.\n"); + if (panic_on_overflow) + panic("Calgary: fix the allocator.\n"); + else + return bad_dma_address; + } + } + + set_bit_string(tbl->it_map, offset, npages); + tbl->it_hint = offset + npages; + BUG_ON(tbl->it_hint > tbl->it_size); + + return offset; +} + +static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, + unsigned int npages, int direction) +{ + unsigned long entry, flags; + dma_addr_t ret = bad_dma_address; + + spin_lock_irqsave(&tbl->it_lock, flags); + + entry = iommu_range_alloc(tbl, npages); + + if (unlikely(entry == bad_dma_address)) + goto error; + + /* set the return dma address */ + ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); + + /* put the TCEs in the HW table */ + tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, + direction); + + spin_unlock_irqrestore(&tbl->it_lock, flags); + + return ret; + +error: + spin_unlock_irqrestore(&tbl->it_lock, flags); + printk(KERN_WARNING "Calgary: failed to allocate %u pages in " + "iommu %p\n", npages, tbl); + return bad_dma_address; +} + +static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry; + unsigned long badbit; + + entry = dma_addr >> PAGE_SHIFT; + + BUG_ON(entry + npages > tbl->it_size); + + tce_free(tbl, entry, npages); + + badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); + if (badbit != ~0UL) { + if (printk_ratelimit()) + printk(KERN_ERR "Calgary: bit is off at 0x%lx " + "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", + badbit, tbl, dma_addr, entry, npages); + } + + __clear_bit_string(tbl->it_map, entry, npages); +} + +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long flags; + + spin_lock_irqsave(&tbl->it_lock, flags); + + __iommu_free(tbl, dma_addr, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); +} + +static void __calgary_unmap_sg(struct iommu_table *tbl, + struct scatterlist *sglist, int nelems, int direction) +{ + while (nelems--) { + unsigned int npages; + dma_addr_t dma = sglist->dma_address; + unsigned int dmalen = sglist->dma_length; + + if (dmalen == 0) + break; + + npages = num_dma_pages(dma, dmalen); + __iommu_free(tbl, dma, npages); + sglist++; + } +} + +void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int direction) +{ + unsigned long flags; + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + + if (!translate_phb(to_pci_dev(dev))) + return; + + spin_lock_irqsave(&tbl->it_lock, flags); + + __calgary_unmap_sg(tbl, sglist, nelems, direction); + + spin_unlock_irqrestore(&tbl->it_lock, flags); +} + +static int calgary_nontranslate_map_sg(struct device* dev, + struct scatterlist *sg, int nelems, int direction) +{ + int i; + + for (i = 0; i < nelems; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + s->dma_address = virt_to_bus(page_address(s->page) +s->offset); + s->dma_length = s->length; + } + return nelems; +} + +int calgary_map_sg(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + unsigned long flags; + unsigned long vaddr; + unsigned int npages; + unsigned long entry; + int i; + + if (!translate_phb(to_pci_dev(dev))) + return calgary_nontranslate_map_sg(dev, sg, nelems, direction); + + spin_lock_irqsave(&tbl->it_lock, flags); + + for (i = 0; i < nelems; i++ ) { + struct scatterlist *s = &sg[i]; + BUG_ON(!s->page); + + vaddr = (unsigned long)page_address(s->page) + s->offset; + npages = num_dma_pages(vaddr, s->length); + + entry = iommu_range_alloc(tbl, npages); + if (entry == bad_dma_address) { + /* makes sure unmap knows to stop */ + s->dma_length = 0; + goto error; + } + + s->dma_address = (entry << PAGE_SHIFT) | s->offset; + + /* insert into HW table */ + tce_build(tbl, entry, npages, vaddr & PAGE_MASK, + direction); + + s->dma_length = s->length; + } + + spin_unlock_irqrestore(&tbl->it_lock, flags); + + return nelems; +error: + __calgary_unmap_sg(tbl, sg, nelems, direction); + for (i = 0; i < nelems; i++) { + sg[i].dma_address = bad_dma_address; + sg[i].dma_length = 0; + } + spin_unlock_irqrestore(&tbl->it_lock, flags); + return 0; +} + +dma_addr_t calgary_map_single(struct device *dev, void *vaddr, + size_t size, int direction) +{ + dma_addr_t dma_handle = bad_dma_address; + unsigned long uaddr; + unsigned int npages; + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + + uaddr = (unsigned long)vaddr; + npages = num_dma_pages(uaddr, size); + + if (translate_phb(to_pci_dev(dev))) + dma_handle = iommu_alloc(tbl, vaddr, npages, direction); + else + dma_handle = virt_to_bus(vaddr); + + return dma_handle; +} + +void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, + size_t size, int direction) +{ + struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + unsigned int npages; + + if (!translate_phb(to_pci_dev(dev))) + return; + + npages = num_dma_pages(dma_handle, size); + iommu_free(tbl, dma_handle, npages); +} + +void* calgary_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + void *ret = NULL; + dma_addr_t mapping; + unsigned int npages, order; + struct iommu_table *tbl; + + tbl = to_pci_dev(dev)->bus->self->sysdata; + + size = PAGE_ALIGN(size); /* size rounded up to full pages */ + npages = size >> PAGE_SHIFT; + order = get_order(size); + + /* alloc enough pages (and possibly more) */ + ret = (void *)__get_free_pages(flag, order); + if (!ret) + goto error; + memset(ret, 0, size); + + if (translate_phb(to_pci_dev(dev))) { + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == bad_dma_address) + goto free; + + *dma_handle = mapping; + } else /* non translated slot */ + *dma_handle = virt_to_bus(ret); + + return ret; + +free: + free_pages((unsigned long)ret, get_order(size)); + ret = NULL; +error: + return ret; +} + +static struct dma_mapping_ops calgary_dma_ops = { + .alloc_coherent = calgary_alloc_coherent, + .map_single = calgary_map_single, + .unmap_single = calgary_unmap_single, + .map_sg = calgary_map_sg, + .unmap_sg = calgary_unmap_sg, +}; + +static inline int busno_to_phbid(unsigned char num) +{ + return bus_info[num].phbid; +} + +static inline unsigned long split_queue_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return split_queue_offsets[idx]; +} + +static inline unsigned long tar_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return tar_offsets[idx]; +} + +static inline unsigned long phb_offset(unsigned char num) +{ + size_t idx = busno_to_phbid(num); + + return phb_offsets[idx]; +} + +static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) +{ + unsigned long target = ((unsigned long)bar) | offset; + return (void __iomem*)target; +} + +static void tce_cache_blast(struct iommu_table *tbl) +{ + u64 val; + u32 aer; + int i = 0; + void __iomem *bbar = tbl->bbar; + void __iomem *target; + + /* disable arbitration on the bus */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); + aer = readl(target); + writel(0, target); + + /* read plssr to ensure it got there */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); + val = readl(target); + + /* poll split queues until all DMA activity is done */ + target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); + do { + val = readq(target); + i++; + } while ((val & 0xff) != 0xff && i < 100); + if (i == 100) + printk(KERN_WARNING "Calgary: PCI bus not quiesced, " + "continuing anyway\n"); + + /* invalidate TCE cache */ + target = calgary_reg(bbar, tar_offset(tbl->it_busno)); + writeq(tbl->tar_val, target); + + /* enable arbitration */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); + writel(aer, target); + (void)readl(target); /* flush */ +} + +static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, + u64 limit) +{ + unsigned int numpages; + + limit = limit | 0xfffff; + limit++; + + numpages = ((limit - start) >> PAGE_SHIFT); + iommu_range_reserve(dev->sysdata, start, numpages); +} + +static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) +{ + void __iomem *target; + u64 low, high, sizelow; + u64 start, limit; + struct iommu_table *tbl = dev->sysdata; + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + + /* peripheral MEM_1 region */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); + low = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); + high = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); + sizelow = be32_to_cpu(readl(target)); + + start = (high << 32) | low; + limit = sizelow; + + calgary_reserve_mem_region(dev, start, limit); +} + +static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) +{ + void __iomem *target; + u32 val32; + u64 low, high, sizelow, sizehigh; + u64 start, limit; + struct iommu_table *tbl = dev->sysdata; + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + + /* is it enabled? */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + if (!(val32 & PHB_MEM2_ENABLE)) + return; + + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); + low = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); + high = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); + sizelow = be32_to_cpu(readl(target)); + target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); + sizehigh = be32_to_cpu(readl(target)); + + start = (high << 32) | low; + limit = (sizehigh << 32) | sizelow; + + calgary_reserve_mem_region(dev, start, limit); +} + +/* + * some regions of the IO address space do not get translated, so we + * must not give devices IO addresses in those regions. The regions + * are the 640KB-1MB region and the two PCI peripheral memory holes. + * Reserve all of them in the IOMMU bitmap to avoid giving them out + * later. + */ +static void __init calgary_reserve_regions(struct pci_dev *dev) +{ + unsigned int npages; + void __iomem *bbar; + unsigned char busnum; + u64 start; + struct iommu_table *tbl = dev->sysdata; + + bbar = tbl->bbar; + busnum = dev->bus->number; + + /* reserve bad_dma_address in case it's a legal address */ + iommu_range_reserve(tbl, bad_dma_address, 1); + + /* avoid the BIOS/VGA first 640KB-1MB region */ + start = (640 * 1024); + npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + iommu_range_reserve(tbl, start, npages); + + /* reserve the two PCI peripheral memory regions in IO space */ + calgary_reserve_peripheral_mem_1(dev); + calgary_reserve_peripheral_mem_2(dev); +} + +static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) +{ + u64 val64; + u64 table_phys; + void __iomem *target; + int ret; + struct iommu_table *tbl; + + /* build TCE tables for each PHB */ + ret = build_tce_table(dev, bbar); + if (ret) + return ret; + + tbl = dev->sysdata; + tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; + tce_free(tbl, 0, tbl->it_size); + + calgary_reserve_regions(dev); + + /* set TARs for each PHB */ + target = calgary_reg(bbar, tar_offset(dev->bus->number)); + val64 = be64_to_cpu(readq(target)); + + /* zero out all TAR bits under sw control */ + val64 &= ~TAR_SW_BITS; + + tbl = dev->sysdata; + table_phys = (u64)__pa(tbl->it_base); + val64 |= table_phys; + + BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); + val64 |= (u64) specified_table_size; + + tbl->tar_val = cpu_to_be64(val64); + writeq(tbl->tar_val, target); + readq(target); /* flush */ + + return 0; +} + +static void __init calgary_free_bus(struct pci_dev *dev) +{ + u64 val64; + struct iommu_table *tbl = dev->sysdata; + void __iomem *target; + unsigned int bitmapsz; + + target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); + val64 = be64_to_cpu(readq(target)); + val64 &= ~TAR_SW_BITS; + writeq(cpu_to_be64(val64), target); + readq(target); /* flush */ + + bitmapsz = tbl->it_size / BITS_PER_BYTE; + free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); + tbl->it_map = NULL; + + kfree(tbl); + dev->sysdata = NULL; + + /* Can't free bootmem allocated memory after system is up :-( */ + bus_info[dev->bus->number].tce_space = NULL; +} + +static void calgary_watchdog(unsigned long data) +{ + struct pci_dev *dev = (struct pci_dev *)data; + struct iommu_table *tbl = dev->sysdata; + void __iomem *bbar = tbl->bbar; + u32 val32; + void __iomem *target; + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); + val32 = be32_to_cpu(readl(target)); + + /* If no error, the agent ID in the CSR is not valid */ + if (val32 & CSR_AGENT_MASK) { + printk(KERN_EMERG "calgary_watchdog: DMA error on bus %d, " + "CSR = %#x\n", dev->bus->number, val32); + writel(0, target); + + /* Disable bus that caused the error */ + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | + PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 |= PHB_SLOT_DISABLE; + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + } else { + /* Reset the timer */ + mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); + } +} + +static void __init calgary_enable_translation(struct pci_dev *dev) +{ + u32 val32; + unsigned char busnum; + void __iomem *target; + void __iomem *bbar; + struct iommu_table *tbl; + + busnum = dev->bus->number; + tbl = dev->sysdata; + bbar = tbl->bbar; + + /* enable TCE in PHB Config Register */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; + + printk(KERN_INFO "Calgary: enabling translation on PHB %d\n", busnum); + printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " + "bus.\n"); + + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + + init_timer(&tbl->watchdog_timer); + tbl->watchdog_timer.function = &calgary_watchdog; + tbl->watchdog_timer.data = (unsigned long)dev; + mod_timer(&tbl->watchdog_timer, jiffies); +} + +static void __init calgary_disable_translation(struct pci_dev *dev) +{ + u32 val32; + unsigned char busnum; + void __iomem *target; + void __iomem *bbar; + struct iommu_table *tbl; + + busnum = dev->bus->number; + tbl = dev->sysdata; + bbar = tbl->bbar; + + /* disable TCE in PHB Config Register */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); + val32 = be32_to_cpu(readl(target)); + val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); + + printk(KERN_INFO "Calgary: disabling translation on PHB %d!\n", busnum); + writel(cpu_to_be32(val32), target); + readl(target); /* flush */ + + del_timer_sync(&tbl->watchdog_timer); +} + +static inline unsigned int __init locate_register_space(struct pci_dev *dev) +{ + int rionodeid; + u32 address; + + rionodeid = (dev->bus->number % 15 > 4) ? 3 : 2; + /* + * register space address calculation as follows: + * FE0MB-8MB*OneBasedChassisNumber+1MB*(RioNodeId-ChassisBase) + * ChassisBase is always zero for x366/x260/x460 + * RioNodeId is 2 for first Calgary, 3 for second Calgary + */ + address = START_ADDRESS - + (0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 15)) + + (0x100000) * (rionodeid - CHASSIS_BASE); + return address; +} + +static void __init calgary_init_one_nontraslated(struct pci_dev *dev) +{ + pci_dev_get(dev); + dev->sysdata = NULL; + dev->bus->self = dev; +} + +static int __init calgary_init_one(struct pci_dev *dev) +{ + u32 address; + void __iomem *bbar; + int ret; + + address = locate_register_space(dev); + /* map entire 1MB of Calgary config space */ + bbar = ioremap_nocache(address, 1024 * 1024); + if (!bbar) { + ret = -ENODATA; + goto done; + } + + ret = calgary_setup_tar(dev, bbar); + if (ret) + goto iounmap; + + pci_dev_get(dev); + dev->bus->self = dev; + calgary_enable_translation(dev); + + return 0; + +iounmap: + iounmap(bbar); +done: + return ret; +} + +static int __init calgary_init(void) +{ + int i, ret = -ENODEV; + struct pci_dev *dev = NULL; + + for (i = 0; i < MAX_PHB_BUS_NUM; i++) { + dev = pci_get_device(PCI_VENDOR_ID_IBM, + PCI_DEVICE_ID_IBM_CALGARY, + dev); + if (!dev) + break; + if (!translate_phb(dev)) { + calgary_init_one_nontraslated(dev); + continue; + } + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) + continue; + + ret = calgary_init_one(dev); + if (ret) + goto error; + } + + return ret; + +error: + for (i--; i >= 0; i--) { + dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM, + PCI_DEVICE_ID_IBM_CALGARY, + dev); + if (!dev) + break; + if (!translate_phb(dev)) { + pci_dev_put(dev); + continue; + } + if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) + continue; + + calgary_disable_translation(dev); + calgary_free_bus(dev); + pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ + } + + return ret; +} + +static inline int __init determine_tce_table_size(u64 ram) +{ + int ret; + + if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) + return specified_table_size; + + /* + * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to + * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each + * larger table size has twice as many entries, so shift the + * max ram address by 13 to divide by 8K and then look at the + * order of the result to choose between 0-7. + */ + ret = get_order(ram >> 13); + if (ret > TCE_TABLE_SIZE_8M) + ret = TCE_TABLE_SIZE_8M; + + return ret; +} + +void __init detect_calgary(void) +{ + u32 val; + int bus; + void *tbl; + int calgary_found = 0; + int phb = -1; + + /* + * if the user specified iommu=off or iommu=soft or we found + * another HW IOMMU already, bail out. + */ + if (swiotlb || no_iommu || iommu_detected) + return; + + if (!early_pci_allowed()) + return; + + specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); + + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + int dev; + struct calgary_bus_info *info = &bus_info[bus]; + info->phbid = -1; + + if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) + continue; + + /* + * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3) + * it is connected to releative to the clagary chip. + */ + phb = (phb + 1) % PHBS_PER_CALGARY; + + if (info->translation_disabled) + continue; + + /* + * Scan the slots of the PCI bus to see if there is a device present. + * The parent bus will be the zero-ith device, so start at 1. + */ + for (dev = 1; dev < 8; dev++) { + val = read_pci_config(bus, dev, 0, 0); + if (val != 0xffffffff || translate_empty_slots) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + info->phbid = phb; + calgary_found = 1; + break; + } + } + } + + if (calgary_found) { + iommu_detected = 1; + calgary_detected = 1; + printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " + "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, + debugging ? "enabled" : "disabled"); + } + return; + +cleanup: + for (--bus; bus >= 0; --bus) { + struct calgary_bus_info *info = &bus_info[bus]; + + if (info->tce_space) + free_tce_table(info->tce_space); + } +} + +int __init calgary_iommu_init(void) +{ + int ret; + + if (no_iommu || swiotlb) + return -ENODEV; + + if (!calgary_detected) + return -ENODEV; + + /* ok, we're trying to use Calgary - let's roll */ + printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + + ret = calgary_init(); + if (ret) { + printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " + "falling back to no_iommu\n", ret); + if (end_pfn > MAX_DMA32_PFN) + printk(KERN_ERR "WARNING more than 4GB of memory, " + "32bit PCI may malfunction.\n"); + return ret; + } + + force_iommu = 1; + dma_ops = &calgary_dma_ops; + + return 0; +} + +static int __init calgary_parse_options(char *p) +{ + unsigned int bridge; + size_t len; + char* endp; + + while (*p) { + if (!strncmp(p, "64k", 3)) + specified_table_size = TCE_TABLE_SIZE_64K; + else if (!strncmp(p, "128k", 4)) + specified_table_size = TCE_TABLE_SIZE_128K; + else if (!strncmp(p, "256k", 4)) + specified_table_size = TCE_TABLE_SIZE_256K; + else if (!strncmp(p, "512k", 4)) + specified_table_size = TCE_TABLE_SIZE_512K; + else if (!strncmp(p, "1M", 2)) + specified_table_size = TCE_TABLE_SIZE_1M; + else if (!strncmp(p, "2M", 2)) + specified_table_size = TCE_TABLE_SIZE_2M; + else if (!strncmp(p, "4M", 2)) + specified_table_size = TCE_TABLE_SIZE_4M; + else if (!strncmp(p, "8M", 2)) + specified_table_size = TCE_TABLE_SIZE_8M; + + len = strlen("translate_empty_slots"); + if (!strncmp(p, "translate_empty_slots", len)) + translate_empty_slots = 1; + + len = strlen("disable"); + if (!strncmp(p, "disable", len)) { + p += len; + if (*p == '=') + ++p; + if (*p == '\0') + break; + bridge = simple_strtol(p, &endp, 0); + if (p == endp) + break; + + if (bridge < MAX_PHB_BUS_NUM) { + printk(KERN_INFO "Calgary: disabling " + "translation for PHB 0x%x\n", bridge); + bus_info[bridge].translation_disabled = 1; + } + } + + p = strpbrk(p, ","); + if (!p) + break; + + p++; /* skip ',' */ + } + return 1; +} +__setup("calgary=", calgary_parse_options); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index a9275c9557cf..f8d857453f8a 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -9,6 +9,7 @@ #include <linux/module.h> #include <asm/io.h> #include <asm/proto.h> +#include <asm/calgary.h> int iommu_merge __read_mostly = 0; EXPORT_SYMBOL(iommu_merge); @@ -33,12 +34,15 @@ int panic_on_overflow __read_mostly = 0; int force_iommu __read_mostly= 0; #endif +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; + /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible to i386. */ struct device fallback_dev = { .bus_id = "fallback device", - .coherent_dma_mask = 0xffffffff, + .coherent_dma_mask = DMA_32BIT_MASK, .dma_mask = &fallback_dev.coherent_dma_mask, }; @@ -77,7 +81,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, dev = &fallback_dev; dma_mask = dev->coherent_dma_mask; if (dma_mask == 0) - dma_mask = 0xffffffff; + dma_mask = DMA_32BIT_MASK; /* Don't invoke OOM killer */ gfp |= __GFP_NORETRY; @@ -90,7 +94,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, larger than 16MB and in this case we have a chance of finding fitting memory in the next higher zone first. If not retry with true GFP_DMA. -AK */ - if (dma_mask <= 0xffffffff) + if (dma_mask <= DMA_32BIT_MASK) gfp |= GFP_DMA32; again: @@ -111,7 +115,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* Don't use the 16MB ZONE_DMA unless absolutely needed. It's better to use remapping first. */ - if (dma_mask < 0xffffffff && !(gfp & GFP_DMA)) { + if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } @@ -166,15 +170,27 @@ void dma_free_coherent(struct device *dev, size_t size, } EXPORT_SYMBOL(dma_free_coherent); +static int forbid_dac __read_mostly; + int dma_supported(struct device *dev, u64 mask) { +#ifdef CONFIG_PCI + if (mask > 0xffffffff && forbid_dac > 0) { + + + + printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id); + return 0; + } +#endif + if (dma_ops->dma_supported) return dma_ops->dma_supported(dev, mask); /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. The caller just has to use GFP_DMA in this case. */ - if (mask < 0x00ffffff) + if (mask < DMA_24BIT_MASK) return 0; /* Tell the device to use SAC when IOMMU force is on. This @@ -189,7 +205,7 @@ int dma_supported(struct device *dev, u64 mask) SAC for these. Assume all masks <= 40 bits are of this type. Normally this doesn't make any difference, but gives more gentle handling of IOMMU overflow. */ - if (iommu_sac_force && (mask >= 0xffffffffffULL)) { + if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask); return 0; } @@ -227,52 +243,99 @@ EXPORT_SYMBOL(dma_set_mask); allowed overwrite iommu off workarounds for specific chipsets. soft Use software bounce buffering (default for Intel machines) noaperture Don't touch the aperture for AGP. + allowdac Allow DMA >4GB + nodac Forbid DMA >4GB + panic Force panic when IOMMU overflows */ __init int iommu_setup(char *p) { - iommu_merge = 1; - - while (*p) { - if (!strncmp(p,"off",3)) - no_iommu = 1; - /* gart_parse_options has more force support */ - if (!strncmp(p,"force",5)) - force_iommu = 1; - if (!strncmp(p,"noforce",7)) { - iommu_merge = 0; - force_iommu = 0; - } - - if (!strncmp(p, "biomerge",8)) { - iommu_bio_merge = 4096; - iommu_merge = 1; - force_iommu = 1; - } - if (!strncmp(p, "panic",5)) - panic_on_overflow = 1; - if (!strncmp(p, "nopanic",7)) - panic_on_overflow = 0; - if (!strncmp(p, "merge",5)) { - iommu_merge = 1; - force_iommu = 1; - } - if (!strncmp(p, "nomerge",7)) - iommu_merge = 0; - if (!strncmp(p, "forcesac",8)) - iommu_sac_force = 1; + iommu_merge = 1; + + if (!p) + return -EINVAL; + + while (*p) { + if (!strncmp(p,"off",3)) + no_iommu = 1; + /* gart_parse_options has more force support */ + if (!strncmp(p,"force",5)) + force_iommu = 1; + if (!strncmp(p,"noforce",7)) { + iommu_merge = 0; + force_iommu = 0; + } + + if (!strncmp(p, "biomerge",8)) { + iommu_bio_merge = 4096; + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "panic",5)) + panic_on_overflow = 1; + if (!strncmp(p, "nopanic",7)) + panic_on_overflow = 0; + if (!strncmp(p, "merge",5)) { + iommu_merge = 1; + force_iommu = 1; + } + if (!strncmp(p, "nomerge",7)) + iommu_merge = 0; + if (!strncmp(p, "forcesac",8)) + iommu_sac_force = 1; + if (!strncmp(p, "allowdac", 8)) + forbid_dac = 0; + if (!strncmp(p, "nodac", 5)) + forbid_dac = -1; + +#ifdef CONFIG_SWIOTLB + if (!strncmp(p, "soft",4)) + swiotlb = 1; +#endif + +#ifdef CONFIG_IOMMU + gart_parse_options(p); +#endif + + p += strcspn(p, ","); + if (*p == ',') + ++p; + } + return 0; +} +early_param("iommu", iommu_setup); + +void __init pci_iommu_alloc(void) +{ + /* + * The order of these functions is important for + * fall-back/fail-over reasons + */ +#ifdef CONFIG_IOMMU + iommu_hole_init(); +#endif + +#ifdef CONFIG_CALGARY_IOMMU + detect_calgary(); +#endif #ifdef CONFIG_SWIOTLB - if (!strncmp(p, "soft",4)) - swiotlb = 1; + pci_swiotlb_init(); +#endif +} + +static int __init pci_iommu_init(void) +{ +#ifdef CONFIG_CALGARY_IOMMU + calgary_iommu_init(); #endif -#ifdef CONFIG_GART_IOMMU - gart_parse_options(p); +#ifdef CONFIG_IOMMU + gart_iommu_init(); #endif - p += strcspn(p, ","); - if (*p == ',') - ++p; - } - return 1; + no_iommu_init(); + return 0; } + +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 82a7c9bfdfa0..16261a8a3303 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -10,7 +10,6 @@ * Copyright 2002 Andi Kleen, SuSE Labs. */ -#include <linux/config.h> #include <linux/types.h> #include <linux/ctype.h> #include <linux/agp_backend.h> @@ -32,6 +31,7 @@ #include <asm/kdebug.h> #include <asm/swiotlb.h> #include <asm/dma.h> +#include <asm/k8.h> unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -46,8 +46,6 @@ u32 *iommu_gatt_base; /* Remapping table */ also seen with Qlogic at least). */ int iommu_fullflush = 1; -#define MAX_NB 8 - /* Allocation bitmap for the remapping area */ static DEFINE_SPINLOCK(iommu_bitmap_lock); static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ @@ -63,13 +61,6 @@ static u32 gart_unmapped_entry; #define to_pages(addr,size) \ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) -#define for_all_nb(dev) \ - dev = NULL; \ - while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL) - -static struct pci_dev *northbridges[MAX_NB]; -static u32 northbridge_flush_word[MAX_NB]; - #define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP @@ -93,7 +84,7 @@ static unsigned long alloc_iommu(int size) offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); if (offset == -1) { need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size); + offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); } if (offset != -1) { set_bit_string(iommu_gart_bitmap, offset, size); @@ -120,44 +111,17 @@ static void free_iommu(unsigned long offset, int size) /* * Use global flush state to avoid races with multiple flushers. */ -static void flush_gart(struct device *dev) +static void flush_gart(void) { unsigned long flags; - int flushed = 0; - int i, max; - spin_lock_irqsave(&iommu_bitmap_lock, flags); - if (need_flush) { - max = 0; - for (i = 0; i < MAX_NB; i++) { - if (!northbridges[i]) - continue; - pci_write_config_dword(northbridges[i], 0x9c, - northbridge_flush_word[i] | 1); - flushed++; - max = i; - } - for (i = 0; i <= max; i++) { - u32 w; - if (!northbridges[i]) - continue; - /* Make sure the hardware actually executed the flush. */ - for (;;) { - pci_read_config_dword(northbridges[i], 0x9c, &w); - if (!(w & 1)) - break; - cpu_relax(); - } - } - if (!flushed) - printk("nothing to flush?\n"); + if (need_flush) { + k8_flush_garts(); need_flush = 0; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } - - #ifdef CONFIG_IOMMU_LEAK #define SET_LEAK(x) if (iommu_leak_tab) \ @@ -266,7 +230,7 @@ static dma_addr_t gart_map_simple(struct device *dev, char *buf, size_t size, int dir) { dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); - flush_gart(dev); + flush_gart(); return map; } @@ -275,8 +239,6 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; - BUG_ON(dir == DMA_NONE); - if (!dev) dev = &fallback_dev; @@ -289,6 +251,28 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) } /* + * Free a DMA mapping. + */ +void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + unsigned long iommu_page; + int npages; + int i; + + if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || + dma_addr >= iommu_bus_base + iommu_size) + return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = to_pages(dma_addr, size); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + CLEAR_LEAK(iommu_page + i); + } + free_iommu(iommu_page, npages); +} + +/* * Wrapper for pci_unmap_single working with scatterlists. */ void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) @@ -299,7 +283,7 @@ void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int di struct scatterlist *s = &sg[i]; if (!s->dma_length || !s->length) break; - dma_unmap_single(dev, s->dma_address, s->dma_length, dir); + gart_unmap_single(dev, s->dma_address, s->dma_length, dir); } } @@ -329,7 +313,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, s->dma_address = addr; s->dma_length = s->length; } - flush_gart(dev); + flush_gart(); return nents; } @@ -397,7 +381,6 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) unsigned long pages = 0; int need = 0, nextneed; - BUG_ON(dir == DMA_NONE); if (nents == 0) return 0; @@ -436,13 +419,13 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0) goto error; out++; - flush_gart(dev); + flush_gart(); if (out < nents) sg[out].dma_length = 0; return out; error: - flush_gart(NULL); + flush_gart(); gart_unmap_sg(dev, sg, nents, dir); /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { @@ -458,28 +441,6 @@ error: return 0; } -/* - * Free a DMA mapping. - */ -void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - unsigned long iommu_page; - int npages; - int i; - - if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || - dma_addr >= iommu_bus_base + iommu_size) - return; - iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = to_pages(dma_addr, size); - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); - } - free_iommu(iommu_page, npages); -} - static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -532,10 +493,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) void *gatt; unsigned aper_base, new_aper_base; unsigned aper_size, gatt_size, new_aper_size; - + int i; + printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; - for_all_nb(dev) { + dev = NULL; + for (i = 0; i < num_k8_northbridges; i++) { + dev = k8_northbridges[i]; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -558,11 +522,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info) panic("Cannot allocate GATT table"); memset(gatt, 0, gatt_size); agp_gatt_table = gatt; - - for_all_nb(dev) { + + for (i = 0; i < num_k8_northbridges; i++) { u32 ctl; u32 gatt_reg; + dev = k8_northbridges[i]; gatt_reg = __pa(gatt) >> 12; gatt_reg <<= 4; pci_write_config_dword(dev, 0x98, gatt_reg); @@ -573,7 +538,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) pci_write_config_dword(dev, 0x90, ctl); } - flush_gart(NULL); + flush_gart(); printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); return 0; @@ -602,15 +567,19 @@ static struct dma_mapping_ops gart_dma_ops = { .unmap_sg = gart_unmap_sg, }; -static int __init pci_iommu_init(void) +void __init gart_iommu_init(void) { struct agp_kern_info info; unsigned long aper_size; unsigned long iommu_start; - struct pci_dev *dev; unsigned long scratch; long i; + if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { + printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); + return; + } + #ifndef CONFIG_AGP_AMD64 no_agp = 1; #else @@ -622,7 +591,11 @@ static int __init pci_iommu_init(void) #endif if (swiotlb) - return -1; + return; + + /* Did we detect a different HW IOMMU? */ + if (iommu_detected && !iommu_aperture) + return; if (no_iommu || (!force_iommu && end_pfn <= MAX_DMA32_PFN) || @@ -634,15 +607,7 @@ static int __init pci_iommu_init(void) "but IOMMU not available.\n" KERN_ERR "WARNING 32bit PCI may malfunction.\n"); } - return -1; - } - - i = 0; - for_all_nb(dev) - i++; - if (i > MAX_NB) { - printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). Disabled\n", i); - return -1; + return; } printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); @@ -707,26 +672,10 @@ static int __init pci_iommu_init(void) for (i = EMERGENCY_PAGES; i < iommu_pages; i++) iommu_gatt_base[i] = gart_unmapped_entry; - for_all_nb(dev) { - u32 flag; - int cpu = PCI_SLOT(dev->devfn) - 24; - if (cpu >= MAX_NB) - continue; - northbridges[cpu] = dev; - pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */ - northbridge_flush_word[cpu] = flag; - } - - flush_gart(NULL); - + flush_gart(); dma_ops = &gart_dma_ops; - - return 0; } -/* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); - void gart_parse_options(char *p) { int arg; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 1f6ecc62061d..df09ab05a1bd 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -4,6 +4,8 @@ #include <linux/init.h> #include <linux/pci.h> #include <linux/string.h> +#include <linux/dma-mapping.h> + #include <asm/proto.h> #include <asm/processor.h> #include <asm/dma.h> @@ -12,10 +14,11 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { if (hwdev && bus + size > *hwdev->dma_mask) { - if (*hwdev->dma_mask >= 0xffffffffULL) + if (*hwdev->dma_mask >= DMA_32BIT_MASK) printk(KERN_ERR - "nommu_%s: overflow %Lx+%lu of device mask %Lx\n", - name, (long long)bus, size, (long long)*hwdev->dma_mask); + "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", + name, (long long)bus, size, + (long long)*hwdev->dma_mask); return 0; } return 1; @@ -56,7 +59,6 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, { int i; - BUG_ON(direction == DMA_NONE); for (i = 0; i < nents; i++ ) { struct scatterlist *s = &sg[i]; BUG_ON(!s->page); @@ -89,5 +91,7 @@ void __init no_iommu_init(void) { if (dma_ops) return; + + force_iommu = 0; /* no HW IOMMU */ dma_ops = &nommu_dma_ops; } diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index 990ed67896f2..697f0aa794b9 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -3,7 +3,8 @@ #include <linux/pci.h> #include <linux/cache.h> #include <linux/module.h> -#include <asm/dma-mapping.h> +#include <linux/dma-mapping.h> + #include <asm/proto.h> #include <asm/swiotlb.h> #include <asm/dma.h> @@ -31,9 +32,10 @@ struct dma_mapping_ops swiotlb_dma_ops = { void pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!iommu_aperture && !no_iommu && - (end_pfn > MAX_DMA32_PFN || force_iommu)) + if (!iommu_detected && !no_iommu && end_pfn > MAX_DMA32_PFN) swiotlb = 1; + if (swiotlb_force) + swiotlb = 1; if (swiotlb) { printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); swiotlb_init(); diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c index bf421ed26808..7554458dc9cb 100644 --- a/arch/x86_64/kernel/pmtimer.c +++ b/arch/x86_64/kernel/pmtimer.c @@ -27,7 +27,7 @@ /* The I/O port the PMTMR resides at. * The location is detected during setup_arch(), * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport; +u32 pmtmr_ioport __read_mostly; /* value of the Power timer at last timer interrupt */ static u32 offset_delay; diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index fb903e65e079..458006ae19f3 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -10,7 +10,6 @@ * Andi Kleen. * * CPU hotplug support - ashok.raj@intel.com - * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ */ /* @@ -64,6 +63,7 @@ EXPORT_SYMBOL(boot_option_idle_override); * Powermanagement idle function, if any.. */ void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); static DEFINE_PER_CPU(unsigned int, cpu_idle_state); static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -80,25 +80,25 @@ void idle_notifier_unregister(struct notifier_block *n) } EXPORT_SYMBOL(idle_notifier_unregister); -enum idle_state { CPU_IDLE, CPU_NOT_IDLE }; -static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE; - void enter_idle(void) { - __get_cpu_var(idle_state) = CPU_IDLE; + write_pda(isidle, 1); atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); } static void __exit_idle(void) { - __get_cpu_var(idle_state) = CPU_NOT_IDLE; + if (read_pda(isidle) == 0) + return; + write_pda(isidle, 0); atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); } /* Called from interrupts to signify idle end */ void exit_idle(void) { - if (current->pid | read_pda(irqcount)) + /* idle loop has pid 0 */ + if (current->pid) return; __exit_idle(); } @@ -111,7 +111,7 @@ static void default_idle(void) { local_irq_enable(); - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); while (!need_resched()) { local_irq_disable(); @@ -120,7 +120,7 @@ static void default_idle(void) else local_irq_enable(); } - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } /* @@ -203,8 +203,7 @@ static inline void play_dead(void) */ void cpu_idle (void) { - set_thread_flag(TIF_POLLING_NRFLAG); - + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { @@ -221,6 +220,9 @@ void cpu_idle (void) play_dead(); enter_idle(); idle(); + /* In many cases the interrupt that ended idle + has already called exit_idle. But some idle + loops can be woken up without interrupt. */ __exit_idle(); } @@ -297,7 +299,7 @@ void __show_regs(struct pt_regs * regs) system_utsname.version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->rax, regs->rbx, regs->rcx); @@ -335,7 +337,7 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(®s->rsp); + show_trace(NULL, regs, (void *)(regs + 1)); } /* @@ -351,6 +353,7 @@ void exit_thread(void) kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ @@ -365,8 +368,12 @@ void flush_thread(void) struct task_struct *tsk = current; struct thread_info *t = current_thread_info(); - if (t->flags & _TIF_ABI_PENDING) + if (t->flags & _TIF_ABI_PENDING) { t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); + if (t->flags & _TIF_IA32) + current_thread_info()->status |= TS_COMPAT; + } + t->flags &= ~_TIF_DEBUG; tsk->thread.debugreg0 = 0; tsk->thread.debugreg1 = 0; @@ -459,7 +466,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, asm("mov %%es,%0" : "=m" (p->thread.es)); asm("mov %%ds,%0" : "=m" (p->thread.ds)); - if (unlikely(me->thread.io_bitmap_ptr != NULL)) { + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; @@ -467,6 +474,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, } memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + set_tsk_thread_flag(p, TIF_IO_BITMAP); } /* @@ -496,6 +504,40 @@ out: */ #define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) +static inline void __switch_to_xtra(struct task_struct *prev_p, + struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread, + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + loaddebug(next, 0); + loaddebug(next, 1); + loaddebug(next, 2); + loaddebug(next, 3); + /* no 4 and 5 */ + loaddebug(next, 6); + loaddebug(next, 7); + } + + if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + /* + * Copy the relevant range of the IO bitmap. + * Normally this is 128 bytes or less: + */ + memcpy(tss->io_bitmap, next->io_bitmap_ptr, + max(prev->io_bitmap_max, next->io_bitmap_max)); + } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + /* + * Clear any possible leftover bits: + */ + memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } +} + /* * switch_to(x,y) should switch tasks from x to y. * @@ -513,6 +555,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter>5) + prefetch(&next->i387.fxsave); + /* * Reload esp0, LDT and the page table pointer: */ @@ -581,41 +627,29 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) And the AMD workaround requires it to be after DS reload. */ unlazy_fpu(prev_p); write_pda(kernelstack, - task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); - + (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); +#ifdef CONFIG_CC_STACKPROTECTOR + write_pda(stack_canary, next_p->stack_canary); /* - * Now maybe reload the debug registers + * Build time only check to make sure the stack_canary is at + * offset 40 in the pda; this is a gcc ABI requirement */ - if (unlikely(next->debugreg7)) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); - /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); - } + BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40); +#endif + /* + * Now maybe reload the debug registers and handle I/O bitmaps + */ + if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) + || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + __switch_to_xtra(prev_p, next_p, tss); - /* - * Handle the IO bitmap - */ - if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { - if (next->io_bitmap_ptr) - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); - else { - /* - * Clear any possible leftover bits: - */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); - } - } - + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter>5) + math_state_restore(); return prev_p; } @@ -832,7 +866,7 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) unsigned long arch_align_stack(unsigned long sp) { - if (randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 2d50024c9f30..addc14af0c56 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -116,17 +116,17 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r return addr; } -static int is_at_popf(struct task_struct *child, struct pt_regs *regs) +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) { int i, copied; - unsigned char opcode[16]; + unsigned char opcode[15]; unsigned long addr = convert_rip_to_linear(child, regs); copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); for (i = 0; i < copied; i++) { switch (opcode[i]) { - /* popf */ - case 0x9d: + /* popf and iret */ + case 0x9d: case 0xcf: return 1; /* CHECKME: 64 65 */ @@ -138,14 +138,17 @@ static int is_at_popf(struct task_struct *child, struct pt_regs *regs) case 0x26: case 0x2e: case 0x36: case 0x3e: case 0x64: case 0x65: - case 0xf0: case 0xf2: case 0xf3: + case 0xf2: case 0xf3: continue; - /* REX prefixes */ case 0x40 ... 0x4f: + if (regs->cs != __USER_CS) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ continue; - /* CHECKME: f0, f2, f3 */ + /* CHECKME: f2, f3 */ /* * pushf: NOTE! We should probably not let @@ -186,10 +189,8 @@ static void set_singlestep(struct task_struct *child) * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. - * - * AK: this is not enough, LAHF and IRET can change TF in user space too. */ - if (is_at_popf(child, regs)) + if (is_setting_trap_flag(child, regs)) return; child->ptrace |= PT_DTRACE; @@ -420,9 +421,13 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) break; if (i == 4) { - child->thread.debugreg7 = data; + child->thread.debugreg7 = data; + if (data) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); ret = 0; - } + } break; } break; diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 57117b8beb2b..2d6769847456 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -20,6 +20,7 @@ * Power off function, if any */ void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); static long no_idt[3]; static enum { diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S index d24fa9b72a2b..14e95872c6a3 100644 --- a/arch/x86_64/kernel/relocate_kernel.S +++ b/arch/x86_64/kernel/relocate_kernel.S @@ -7,31 +7,169 @@ */ #include <linux/linkage.h> +#include <asm/page.h> +#include <asm/kexec.h> - /* - * Must be relocatable PIC code callable as a C function, that once - * it starts can not use the previous processes stack. - */ - .globl relocate_new_kernel +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 3) +#define PAGE_ALIGNED (1 << PAGE_SHIFT) +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ + + .text + .align PAGE_ALIGNED .code64 + .globl relocate_kernel +relocate_kernel: + /* %rdi indirection_page + * %rsi page_list + * %rdx start address + */ + + /* map the control page at its virtual address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + /* identity map the control page at its physical address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + relocate_new_kernel: - /* %rdi page_list - * %rsi reboot_code_buffer + /* %rdi indirection_page + * %rsi page_list * %rdx start address - * %rcx page_table - * %r8 arg5 - * %r9 arg6 */ /* zero out flags, and disable interrupts */ pushq $0 popfq - /* set a new stack at the bottom of our page... */ - lea 4096(%rsi), %rsp + /* get physical address of control page now */ + /* this is impossible after page table switch */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + + /* get physical address of page table now too */ + movq PTR(PA_TABLE_PAGE)(%rsi), %rcx - /* store the parameters back on the stack */ - pushq %rdx /* store the start address */ + /* switch to new set of page tables */ + movq PTR(PA_PGD)(%rsi), %r9 + movq %r9, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea 4096(%r8), %rsp + + /* jump to identity mapped page */ + addq $(identity_mapped - relocate_kernel), %r8 + pushq %r8 + ret + +identity_mapped: + /* store the start address on the stack */ + pushq %rdx /* Set cr0 to a known state: * 31 1 == Paging enabled @@ -136,8 +274,3 @@ relocate_new_kernel: xorq %r15, %r15 ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .quad relocate_new_kernel_end - relocate_new_kernel diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index b8d5116d7371..fc944b5e8f4a 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -5,8 +5,6 @@ * * Nov 2001 Dave Jones <davej@suse.de> * Forked from i386 setup code. - * - * $Id$ */ /* @@ -23,10 +21,9 @@ #include <linux/slab.h> #include <linux/user.h> #include <linux/a.out.h> -#include <linux/tty.h> +#include <linux/screen_info.h> #include <linux/ioport.h> #include <linux/delay.h> -#include <linux/config.h> #include <linux/init.h> #include <linux/initrd.h> #include <linux/highmem.h> @@ -47,7 +44,6 @@ #include <linux/dmi.h> #include <linux/dma-mapping.h> #include <linux/ctype.h> -#include <linux/suspend.h> #include <asm/mtrr.h> #include <asm/uaccess.h> @@ -66,9 +62,7 @@ #include <asm/setup.h> #include <asm/mach_apic.h> #include <asm/numa.h> -#include <asm/swiotlb.h> #include <asm/sections.h> -#include <asm/gart-mapping.h> #include <asm/dmi.h> /* @@ -76,19 +70,10 @@ */ struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -int acpi_disabled; -EXPORT_SYMBOL(acpi_disabled); -#ifdef CONFIG_ACPI -extern int __initdata acpi_ht; -extern acpi_interrupt_flags acpi_sci_flags; -int __initdata acpi_force = 0; -#endif - -int acpi_numa __initdata; - /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -104,13 +89,14 @@ char dmi_alloc_data[DMI_MAX_DATA]; * Setup options */ struct screen_info screen_info; +EXPORT_SYMBOL(screen_info); struct sys_desc_table_struct { unsigned short length; unsigned char table[0]; }; struct edid_info edid_info; -struct e820map e820; +EXPORT_SYMBOL_GPL(edid_info); extern int root_mountflags; @@ -137,9 +123,6 @@ struct resource standard_io_resources[] = { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -#define STANDARD_IO_RESOURCES \ - (sizeof standard_io_resources / sizeof standard_io_resources[0]) - #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) struct resource data_resource = { @@ -186,9 +169,6 @@ static struct resource adapter_rom_resources[] = { .flags = IORESOURCE_ROM } }; -#define ADAPTER_ROM_RESOURCES \ - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) - static struct resource video_rom_resource = { .name = "Video ROM", .start = 0xc0000, @@ -259,7 +239,8 @@ static void __init probe_roms(void) } /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; + start += 2048) { rom = isa_bus_to_virt(start); if (!romsignature(rom)) continue; @@ -279,185 +260,22 @@ static void __init probe_roms(void) } } -/* Check for full argument with no trailing characters */ -static int fullarg(char *p, char *arg) +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. This option will be passed + * by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) { - int l = strlen(arg); - return !memcmp(p, arg, l) && (p[l] == 0 || isspace(p[l])); + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; } - -static __init void parse_cmdline_early (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = COMMAND_LINE; - int len = 0; - int userdef = 0; - - for (;;) { - if (c != ' ') - goto next_char; - -#ifdef CONFIG_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } -#endif -#ifdef CONFIG_ACPI - /* "acpi=off" disables both ACPI table parsing and interpreter init */ - if (fullarg(from,"acpi=off")) - disable_acpi(); - - if (fullarg(from, "acpi=force")) { - /* add later when we do DMI horrors: */ - acpi_force = 1; - acpi_disabled = 0; - } - - /* acpi=ht just means: do ACPI MADT parsing - at bootup, but don't enable the full ACPI interpreter */ - if (fullarg(from, "acpi=ht")) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - else if (fullarg(from, "pci=noacpi")) - acpi_disable_pci(); - else if (fullarg(from, "acpi=noirq")) - acpi_noirq_set(); - - else if (fullarg(from, "acpi_sci=edge")) - acpi_sci_flags.trigger = 1; - else if (fullarg(from, "acpi_sci=level")) - acpi_sci_flags.trigger = 3; - else if (fullarg(from, "acpi_sci=high")) - acpi_sci_flags.polarity = 1; - else if (fullarg(from, "acpi_sci=low")) - acpi_sci_flags.polarity = 3; - - /* acpi=strict disables out-of-spec workarounds */ - else if (fullarg(from, "acpi=strict")) { - acpi_strict = 1; - } -#ifdef CONFIG_X86_IO_APIC - else if (fullarg(from, "acpi_skip_timer_override")) - acpi_skip_timer_override = 1; -#endif -#endif - - if (fullarg(from, "disable_timer_pin_1")) - disable_timer_pin_1 = 1; - if (fullarg(from, "enable_timer_pin_1")) - disable_timer_pin_1 = -1; - - if (fullarg(from, "nolapic") || fullarg(from, "disableapic")) { - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - disable_apic = 1; - } - - if (fullarg(from, "noapic")) - skip_ioapic_setup = 1; - - if (fullarg(from,"apic")) { - skip_ioapic_setup = 0; - ioapic_force = 1; - } - - if (!memcmp(from, "mem=", 4)) - parse_memopt(from+4, &from); - - if (!memcmp(from, "memmap=", 7)) { - /* exactmap option is for used defined memory */ - if (!memcmp(from+7, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - saved_max_pfn = e820_end_of_ram(); -#endif - from += 8+7; - end_pfn_map = 0; - e820.nr_map = 0; - userdef = 1; - } - else { - parse_memmapopt(from+7, &from); - userdef = 1; - } - } - -#ifdef CONFIG_NUMA - if (!memcmp(from, "numa=", 5)) - numa_setup(from+5); -#endif - - if (!memcmp(from,"iommu=",6)) { - iommu_setup(from+6); - } - - if (fullarg(from,"oops=panic")) - panic_on_oops = 1; - - if (!memcmp(from, "noexec=", 7)) - nonx_setup(from + 7); - -#ifdef CONFIG_KEXEC - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ - else if (!memcmp(from, "crashkernel=", 12)) { - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - } +early_param("elfcorehdr", setup_elfcorehdr); #endif -#ifdef CONFIG_PROC_VMCORE - /* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ - else if(!memcmp(from, "elfcorehdr=", 11)) - elfcorehdr_addr = memparse(from+11, &from); -#endif - -#ifdef CONFIG_HOTPLUG_CPU - else if (!memcmp(from, "additional_cpus=", 16)) - setup_additional_cpus(from+16); -#endif - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - e820_print_map("user"); - } - *to = '\0'; - *cmdline_p = command_line; -} - #ifndef CONFIG_NUMA static void __init contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) @@ -469,85 +287,12 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n",bootmap_size); bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); - e820_bootmem_free(NODE_DATA(0), 0, end_pfn << PAGE_SHIFT); + e820_register_active_regions(0, start_pfn, end_pfn); + free_bootmem_with_active_regions(0, end_pfn); reserve_bootmem(bootmap, bootmap_size); } #endif -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. */ -asm("\t.data\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); - -extern unsigned char k8nops[]; -static unsigned char *k8_nops[ASM_NOP_MAX+1] = { - NULL, - k8nops, - k8nops + 1, - k8nops + 1 + 2, - k8nops + 1 + 2 + 3, - k8nops + 1 + 2 + 3 + 4, - k8nops + 1 + 2 + 3 + 4 + 5, - k8nops + 1 + 2 + 3 + 4 + 5 + 6, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; - -extern char __vsyscall_0; - -/* Replace instructions with better alternatives for this CPU type. - - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that assymetric systems where - APs have less capabilities than the boot processor are not handled. - In this case boot with "noreplacement". */ -void apply_alternatives(void *start, void *end) -{ - struct alt_instr *a; - int diff, i, k; - for (a = start; (void *)a < end; a++) { - u8 *instr; - - if (!boot_cpu_has(a->cpuid)) - continue; - - BUG_ON(a->replacementlen > a->instrlen); - instr = a->instr; - /* vsyscall code is not mapped yet. resolve it manually. */ - if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) - instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); - __inline_memcpy(instr, a->replacement, a->replacementlen); - diff = a->instrlen - a->replacementlen; - - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - __inline_memcpy(instr + i, k8_nops[k], k); - } - } -} - -static int no_replacement __initdata = 0; - -void __init alternative_instructions(void) -{ - extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; - if (no_replacement) - return; - apply_alternatives(__alt_instructions, __alt_instructions_end); -} - -static int __init noreplacement_setup(char *s) -{ - no_replacement = 1; - return 1; -} - -__setup("noreplacement", noreplacement_setup); - #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; #ifdef CONFIG_EDD_MODULE @@ -596,103 +341,9 @@ static void discover_ebda(void) ebda_size = 64*1024; } -#ifdef CONFIG_SOFTWARE_SUSPEND -static void __init mark_nosave_page_range(unsigned long start, unsigned long end) -{ - struct page *page; - while (start <= end) { - page = pfn_to_page(start); - SetPageNosave(page); - start++; - } -} - -static void __init e820_nosave_reserved_pages(void) -{ - int i; - unsigned long r_start = 0, r_end = 0; - - /* Assume e820 map is sorted */ - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long start, end; - - start = round_down(ei->addr, PAGE_SIZE); - end = round_up(ei->addr + ei->size, PAGE_SIZE); - if (start >= end) - continue; - if (ei->type == E820_RESERVED) - continue; - r_end = start>>PAGE_SHIFT; - /* swsusp ignores invalid pfn, ignore these pages here */ - if (r_end > end_pfn) - r_end = end_pfn; - if (r_end > r_start) - mark_nosave_page_range(r_start, r_end-1); - if (r_end >= end_pfn) - break; - r_start = end>>PAGE_SHIFT; - } -} - -static void __init e820_save_acpi_pages(void) -{ - int i; - - /* Assume e820 map is sorted */ - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long start, end; - - start = ei->addr, PAGE_SIZE; - end = ei->addr + ei->size; - if (start >= end) - continue; - if (ei->type != E820_ACPI && ei->type != E820_NVS) - continue; - /* - * If the region is below end_pfn, it will be - * saved/restored by swsusp follow 'RAM' type. - */ - if (start < (end_pfn << PAGE_SHIFT)) - start = end_pfn << PAGE_SHIFT; - if (end > start) - swsusp_add_arch_pages(start, end); - } -} - -extern char __start_rodata, __end_rodata; -/* - * BIOS reserved region/hole - no save/restore - * ACPI NVS - save/restore - * ACPI Data - this is a little tricky, the mem could be used by OS after OS - * reads tables from the region, but anyway save/restore the memory hasn't any - * side effect and Linux runtime module load/unload might use it. - * kernel rodata - no save/restore (kernel rodata isn't changed) - */ -static int __init mark_nosave_pages(void) -{ - unsigned long pfn_start, pfn_end; - - /* BIOS reserved regions & holes */ - e820_nosave_reserved_pages(); - - /* kernel rodata */ - pfn_start = round_up(__pa_symbol(&__start_rodata), PAGE_SIZE) >> PAGE_SHIFT; - pfn_end = round_down(__pa_symbol(&__end_rodata), PAGE_SIZE) >> PAGE_SHIFT; - mark_nosave_page_range(pfn_start, pfn_end-1); - - /* record ACPI Data/NVS as saveable */ - e820_save_acpi_pages(); - - return 0; -} -core_initcall(mark_nosave_pages); -#endif - void __init setup_arch(char **cmdline_p) { - unsigned long kernel_end; + printk(KERN_INFO "Command line: %s\n", saved_command_line); ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); screen_info = SCREEN_INFO; @@ -720,16 +371,22 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(&_etext); data_resource.end = virt_to_phys(&_edata)-1; - parse_cmdline_early(cmdline_p); - early_identify_cpu(&boot_cpu_data); + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + parse_early_param(); + + finish_e820_parsing(); + + e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus * we are rounding upwards: */ end_pfn = e820_end_of_ram(); - num_physpages = end_pfn; /* for pfn_valid */ + num_physpages = end_pfn; check_efer(); @@ -749,6 +406,14 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif + /* How many end-of-memory variables you have, grandma! */ + max_low_pfn = end_pfn; + max_pfn = end_pfn; + high_memory = (void *)__va(end_pfn * PAGE_SIZE - 1) + 1; + + /* Remove active ranges so rediscovery with NUMA-awareness happens */ + remove_all_active_ranges(); + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. @@ -767,8 +432,8 @@ void __init setup_arch(char **cmdline_p) (table_end - table_start) << PAGE_SHIFT); /* reserve kernel */ - kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE); - reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY); + reserve_bootmem_generic(__pa_symbol(&_text), + __pa_symbol(&_end) - __pa_symbol(&_text)); /* * reserve physical page 0 - it's a special BIOS page on many boxes, @@ -798,12 +463,10 @@ void __init setup_arch(char **cmdline_p) */ acpi_reserve_bootmem(); #endif -#ifdef CONFIG_X86_LOCAL_APIC /* * Find and reserve possible boot-time SMP configuration: */ find_smp_config(); -#endif #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { @@ -823,14 +486,16 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) { - reserve_bootmem(crashk_res.start, + reserve_bootmem_generic(crashk_res.start, crashk_res.end - crashk_res.start + 1); } #endif paging_init(); - check_ioapic(); +#ifdef CONFIG_PCI + early_quirks(); +#endif /* * set this early, so we dont allocate cpu0 @@ -847,14 +512,12 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); -#ifdef CONFIG_X86_LOCAL_APIC /* * get boot-time SMP configuration: */ if (smp_found_config) get_smp_config(); init_apic_mappings(); -#endif /* * Request address space for all standard RAM and ROM resources @@ -862,22 +525,19 @@ void __init setup_arch(char **cmdline_p) */ probe_roms(); e820_reserve_resources(); + e820_mark_nosave_regions(); request_resource(&iomem_resource, &video_ram_resource); { unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < STANDARD_IO_RESOURCES; i++) + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) request_resource(&ioport_resource, &standard_io_resources[i]); } e820_setup_gap(); -#ifdef CONFIG_GART_IOMMU - iommu_hole_init(); -#endif - #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; @@ -962,24 +622,32 @@ static int nearby_node(int apicid) static void __init amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - int cpu = smp_processor_id(); unsigned bits; #ifdef CONFIG_NUMA + int cpu = smp_processor_id(); int node = 0; unsigned apicid = hard_smp_processor_id(); #endif + unsigned ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; - bits = 0; - while ((1 << bits) < c->x86_max_cores) - bits++; + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } /* Low order bits define the core id (index of core in socket) */ - cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1); + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); /* Convert the APIC ID into the socket ID */ - phys_proc_id[cpu] = phys_pkg_id(bits); + c->phys_proc_id = phys_pkg_id(bits); #ifdef CONFIG_NUMA - node = phys_proc_id[cpu]; + node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) node = apicid_to_node[apicid]; if (!node_online(node)) { @@ -992,7 +660,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) but in the same order as the HT nodeids. If that doesn't result in a usable node fall back to the path for the previous case. */ - int ht_nodeid = apicid - (phys_proc_id[0] << bits); + int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits); if (ht_nodeid >= 0 && apicid_to_node[ht_nodeid] != NUMA_NO_NODE) node = apicid_to_node[ht_nodeid]; @@ -1002,15 +670,13 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) } numa_set_node(cpu, node); - printk(KERN_INFO "CPU %d/%x(%d) -> Node %d -> Core %d\n", - cpu, apicid, c->x86_max_cores, node, cpu_core_id[cpu]); + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif #endif } -static int __init init_amd(struct cpuinfo_x86 *c) +static void __cpuinit init_amd(struct cpuinfo_x86 *c) { - int r; unsigned level; #ifdef CONFIG_SMP @@ -1043,8 +709,8 @@ static int __init init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); - r = get_model_name(c); - if (!r) { + level = get_model_name(c); + if (!level) { switch (c->x86) { case 15: /* Should distinguish Models here, but this is only @@ -1059,13 +725,18 @@ static int __init init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1<<8)) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - if (c->extended_cpuid_level >= 0x80000008) { - c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - + /* Multi core CPU? */ + if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); - } - return r; + /* Fix cpuid4 emulation for more */ + num_cache_leaves = 3; + + /* When there is only one core no need to synchronize RDTSC */ + if (num_possible_cpus() == 1) + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + else + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -1073,13 +744,14 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP u32 eax, ebx, ecx, edx; int index_msb, core_bits; - int cpu = smp_processor_id(); cpuid(1, &eax, &ebx, &ecx, &edx); - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) + if (!cpu_has(c, X86_FEATURE_HT)) return; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; smp_num_siblings = (ebx & 0xff0000) >> 16; @@ -1094,10 +766,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) } index_msb = get_count_order(smp_num_siblings); - phys_proc_id[cpu] = phys_pkg_id(index_msb); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - phys_proc_id[cpu]); + c->phys_proc_id = phys_pkg_id(index_msb); smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -1105,13 +774,15 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) core_bits = get_count_order(c->x86_max_cores); - cpu_core_id[cpu] = phys_pkg_id(index_msb) & + c->cpu_core_id = phys_pkg_id(index_msb) & ((1 << core_bits) - 1); - - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - cpu_core_id[cpu]); } +out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + } + #endif } @@ -1120,15 +791,12 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) */ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) { - unsigned int eax; + unsigned int eax, t; if (c->cpuid_level < 4) return 1; - __asm__("cpuid" - : "=a" (eax) - : "0" (4), "c" (0) - : "bx", "dx"); + cpuid_count(4, 0, &eax, &t, &t, &t); if (eax & 0x1f) return ((eax >> 26) + 1); @@ -1141,16 +809,16 @@ static void srat_detect_node(void) #ifdef CONFIG_NUMA unsigned node; int cpu = smp_processor_id(); + int apicid = hard_smp_processor_id(); /* Don't do the funky fallback heuristics the AMD version employs for now. */ - node = apicid_to_node[hard_smp_processor_id()]; + node = apicid_to_node[apicid]; if (node == NUMA_NO_NODE) node = first_node(node_online_map); numa_set_node(cpu, node); - if (acpi_numa > 0) - printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node); + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif } @@ -1160,6 +828,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) unsigned n; init_intel_cacheinfo(c); + if (c->cpuid_level > 9 ) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); + } + n = c->extended_cpuid_level; if (n >= 0x80000008) { unsigned eax = cpuid_eax(0x80000008); @@ -1177,6 +852,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + if (c->x86 == 6) + set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); c->x86_max_cores = intel_num_cpu_cores(c); @@ -1251,7 +928,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) } #ifdef CONFIG_SMP - phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif } @@ -1378,7 +1055,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", + NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow", /* Transmeta-defined */ "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -1389,14 +1066,14 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Other (Linux-defined) */ "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, "constant_tsc", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ @@ -1459,9 +1136,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) #ifdef CONFIG_SMP if (smp_num_siblings * c->x86_max_cores > 1) { int cpu = c - cpu_data; - seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]); + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); - seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } #endif @@ -1535,7 +1212,7 @@ struct seq_operations cpuinfo_op = { .show = show_cpuinfo, }; -#ifdef CONFIG_INPUT_PCSPKR +#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE) #include <linux/platform_device.h> static __init int add_pcspkr(void) { diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 8a691fa6d393..8c4b80fe71a1 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -3,9 +3,7 @@ * Copyright (C) 1995 Linus Torvalds * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. * See setup.c for older changelog. - * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $ */ -#include <linux/config.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -26,11 +24,12 @@ #include <asm/proto.h> #include <asm/sections.h> -char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; +char x86_boot_params[BOOT_PARAM_SIZE] __initdata; cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(_cpu_pda); struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned; struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; @@ -38,6 +37,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL(__supported_pte_mask); static int do_not_nx __cpuinitdata = 0; /* noexec=on|off @@ -46,8 +46,10 @@ Control non executable mappings for 64bit processes. on Enable(default) off Disable */ -int __init nonx_setup(char *str) +static int __init nonx_setup(char *str) { + if (!str) + return -EINVAL; if (!strncmp(str, "on", 2)) { __supported_pte_mask |= _PAGE_NX; do_not_nx = 0; @@ -55,9 +57,9 @@ int __init nonx_setup(char *str) do_not_nx = 1; __supported_pte_mask &= ~_PAGE_NX; } - return 1; + return 0; } -__setup("noexec=", nonx_setup); /* parsed early actually */ +early_param("noexec", nonx_setup); int force_personality32 = 0; @@ -93,12 +95,9 @@ void __init setup_per_cpu_areas(void) #endif /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif + size = PERCPU_ENOUGH_ROOM; + printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size); for_each_cpu_mask (i, cpu_possible_map) { char *ptr; @@ -122,7 +121,10 @@ void pda_init(int cpu) /* Setup up data that may be needed in __get_free_pages early */ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); + /* Memory clobbers used to order PDA accessed */ + mb(); wrmsrl(MSR_GS_BASE, pda); + mb(); pda->cpunumber = cpu; pda->irqcount = -1; @@ -178,6 +180,8 @@ void __cpuinit check_efer(void) } } +unsigned long kernel_eflags; + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -189,6 +193,7 @@ void __cpuinit cpu_init (void) { int cpu = stack_smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); unsigned long v; char *estacks = NULL; struct task_struct *me; @@ -234,29 +239,18 @@ void __cpuinit cpu_init (void) * set up and load the per-CPU TSS */ for (v = 0; v < N_EXCEPTION_STACKS; v++) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; if (cpu) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); if (!estacks) panic("Cannot allocate exception stack %ld %d\n", v, cpu); } - switch (v + 1) { -#if DEBUG_STKSZ > EXCEPTION_STKSZ - case DEBUG_STACK: - cpu_pda(cpu)->debugstack = (unsigned long)estacks; - estacks += DEBUG_STKSZ; - break; -#endif - default: - estacks += EXCEPTION_STKSZ; - break; - } - t->ist[v] = (unsigned long)estacks; + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; } t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); @@ -289,4 +283,6 @@ void __cpuinit cpu_init (void) set_debugreg(0UL, 7); fpu_init(); + + raw_local_save_flags(kernel_eflags); } diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index e5f5ce7909a3..49ec324cd141 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -7,8 +7,6 @@ * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen - * - * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $ */ #include <linux/sched.h> @@ -40,37 +38,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct pt_regs * regs); asmlinkage long -sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) -{ - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); -#ifdef DEBUG_SIG - printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", - saveset, newset, regs, regs->rip); -#endif - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } -} - -asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { @@ -239,7 +206,6 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) rsp = regs->rsp - 128; /* This is the X/Open sanctioned signal stack switching. */ - /* RED-PEN: redzone on that stack? */ if (ka->sa.sa_flags & SA_ONSTACK) { if (sas_ss_flags(rsp) == 0) rsp = current->sas_ss_sp + current->sas_ss_size; @@ -311,11 +277,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, #endif /* Set up registers for signal handler */ - { - struct exec_domain *ed = current_thread_info()->exec_domain; - if (unlikely(ed && ed->signal_invmap && sig < 32)) - sig = ed->signal_invmap[sig]; - } regs->rdi = sig; /* In case the signal handler was declared without prototypes */ regs->rax = 0; @@ -344,11 +305,11 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif - return 1; + return 0; give_sigsegv: force_sigsegv(sig, current); - return 0; + return -EFAULT; } /* @@ -411,7 +372,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, #endif ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret) { + if (ret == 0) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) @@ -428,11 +389,12 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -int do_signal(struct pt_regs *regs, sigset_t *oldset) +static void do_signal(struct pt_regs *regs) { struct k_sigaction ka; siginfo_t info; int signr; + sigset_t *oldset; /* * We want the common case to go fast, which @@ -441,9 +403,11 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) * if so. */ if (!user_mode(regs)) - return 1; + return; - if (!oldset) + if (test_thread_flag(TIF_RESTORE_SIGMASK)) + oldset = ¤t->saved_sigmask; + else oldset = ¤t->blocked; signr = get_signal_to_deliver(&info, &ka, regs, NULL); @@ -457,30 +421,46 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset) set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. */ - return handle_signal(signr, &info, &ka, oldset, regs); + if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { + /* a signal was successfully delivered; the saved + * sigmask will have been stored in the signal frame, + * and will be restored by sigreturn, so we can simply + * clear the TIF_RESTORE_SIGMASK flag */ + clear_thread_flag(TIF_RESTORE_SIGMASK); + } + return; } /* Did we come from a system call? */ if ((long)regs->orig_rax >= 0) { /* Restart the system call - no handlers present */ long res = regs->rax; - if (res == -ERESTARTNOHAND || - res == -ERESTARTSYS || - res == -ERESTARTNOINTR) { + switch (res) { + case -ERESTARTNOHAND: + case -ERESTARTSYS: + case -ERESTARTNOINTR: regs->rax = regs->orig_rax; regs->rip -= 2; - } - if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { + break; + case -ERESTART_RESTARTBLOCK: regs->rax = test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall; regs->rip -= 2; + break; } } - return 0; + + /* if there's no signal to deliver, we just put the saved sigmask + back. */ + if (test_thread_flag(TIF_RESTORE_SIGMASK)) { + clear_thread_flag(TIF_RESTORE_SIGMASK); + sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); + } } -void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) +void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", @@ -494,8 +474,8 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_ } /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs,oldset); + if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) + do_signal(regs); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 4a6628b14d99..4f67697f5036 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -135,10 +135,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) cpu = smp_processor_id(); /* - * orig_rax contains the interrupt vector - 256. + * orig_rax contains the negated interrupt vector. * Use that to determine where the sender put the data. */ - sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START; + sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; f = &per_cpu(flush_state, sender); if (!cpu_isset(cpu, f->flush_cpumask)) @@ -203,7 +203,7 @@ int __cpuinit init_smp_flush(void) { int i; for_each_cpu_mask(i, cpu_possible_map) { - spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i)); + spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); } return 0; } @@ -224,6 +224,7 @@ void flush_tlb_current_task(void) flush_tlb_others(cpu_mask, mm, FLUSH_ALL); preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_current_task); void flush_tlb_mm (struct mm_struct * mm) { @@ -244,6 +245,7 @@ void flush_tlb_mm (struct mm_struct * mm) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_mm); void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { @@ -266,6 +268,7 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void* info) { @@ -443,6 +446,7 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, spin_unlock(&call_lock); return 0; } +EXPORT_SYMBOL(smp_call_function); void smp_stop_cpu(void) { @@ -460,7 +464,7 @@ static void smp_really_stop_cpu(void *dummy) { smp_stop_cpu(); for (;;) - asm("hlt"); + halt(); } void smp_send_stop(void) @@ -470,7 +474,7 @@ void smp_send_stop(void) return; /* Don't deadlock on the call lock in panic */ if (!spin_trylock(&call_lock)) { - /* ignore locking because we have paniced anyways */ + /* ignore locking because we have panicked anyways */ nolock = 1; } __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); @@ -518,26 +522,3 @@ asmlinkage void smp_call_function_interrupt(void) } } -int safe_smp_processor_id(void) -{ - int apicid, i; - - if (disable_apic) - return 0; - - apicid = hard_smp_processor_id(); - if (x86_cpu_to_apicid[apicid] == apicid) - return apicid; - - for (i = 0; i < NR_CPUS; ++i) { - if (x86_cpu_to_apicid[i] == apicid) - return i; - } - - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, - * or called too early. Either way, we must be CPU 0. */ - if (x86_cpu_to_apicid[0] == BAD_APICID) - return 0; - - return 0; /* Should not happen */ -} diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 71a7222cf9ce..7b7a6870288a 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -38,7 +38,6 @@ */ -#include <linux/config.h> #include <linux/init.h> #include <linux/mm.h> @@ -47,9 +46,10 @@ #include <linux/bootmem.h> #include <linux/thread_info.h> #include <linux/module.h> - #include <linux/delay.h> #include <linux/mc146818rtc.h> +#include <linux/smp.h> + #include <asm/mtrr.h> #include <asm/pgalloc.h> #include <asm/desc.h> @@ -63,13 +63,11 @@ /* Number of siblings per CPU package */ int smp_num_siblings = 1; -/* Package ID of each logical CPU */ -u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; -/* core ID of each logical CPU */ -u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; +EXPORT_SYMBOL(cpu_llc_id); /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -82,18 +80,21 @@ EXPORT_SYMBOL(cpu_online_map); */ cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; +EXPORT_SYMBOL(cpu_callout_map); cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_data); /* Set when the idlers are all forked */ int smp_threads_ready; /* representing HT siblings of each logical CPU */ cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(cpu_sibling_map); /* representing HT and core siblings of each logical CPU */ cpumask_t cpu_core_map[NR_CPUS] __read_mostly; @@ -454,10 +455,12 @@ cpumask_t cpu_coregroup_map(int cpu) struct cpuinfo_x86 *c = cpu_data + cpu; /* * For perf, we return last level cache shared map. - * TBD: when power saving sched policy is added, we will return - * cpu_core_map when power saving policy is enabled + * And for power savings, we return cpu_core_map */ - return c->llc_shared_map; + if (sched_mc_power_savings || sched_smt_power_savings) + return cpu_core_map[cpu]; + else + return c->llc_shared_map; } /* representing cpus for which sibling maps can be computed */ @@ -472,8 +475,8 @@ static inline void set_cpu_sibling_map(int cpu) if (smp_num_siblings > 1) { for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (phys_proc_id[cpu] == phys_proc_id[i] && - cpu_core_id[cpu] == cpu_core_id[i]) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id && + c[cpu].cpu_core_id == c[i].cpu_core_id) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); @@ -500,7 +503,7 @@ static inline void set_cpu_sibling_map(int cpu) cpu_set(i, c[cpu].llc_shared_map); cpu_set(cpu, c[i].llc_shared_map); } - if (phys_proc_id[cpu] == phys_proc_id[i]) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); /* @@ -769,7 +772,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) unsigned long start_rip; struct create_idle c_idle = { .cpu = cpu, - .done = COMPLETION_INITIALIZER(c_idle.done), + .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; DECLARE_WORK(work, do_fork_idle, &c_idle); @@ -797,6 +800,8 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) } + alternatives_smp_switch(1); + c_idle.idle = get_idle_for_cpu(cpu); if (c_idle.idle) { @@ -1086,7 +1091,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) /* * Switch from PIC to APIC mode. */ - connect_bsp_APIC(); setup_local_APIC(); if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { @@ -1171,12 +1175,9 @@ int __cpuinit __cpu_up(unsigned int cpu) void __init smp_cpus_done(unsigned int max_cpus) { smp_cleanup_boot(); - -#ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); -#endif - check_nmi_watchdog(); + time_init_gtod(); } #ifdef CONFIG_HOTPLUG_CPU @@ -1199,8 +1200,8 @@ static void remove_siblinginfo(int cpu) cpu_clear(cpu, cpu_sibling_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); - phys_proc_id[cpu] = BAD_APICID; - cpu_core_id[cpu] = BAD_APICID; + c[cpu].phys_proc_id = 0; + c[cpu].cpu_core_id = 0; cpu_clear(cpu, cpu_sibling_setup_map); } @@ -1229,6 +1230,8 @@ int __cpu_disable(void) if (cpu == 0) return -EBUSY; + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); clear_local_APIC(); /* @@ -1259,6 +1262,8 @@ void __cpu_die(unsigned int cpu) /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { printk ("CPU %d is now offline\n", cpu); + if (1 == num_online_cpus()) + alternatives_smp_switch(0); return; } msleep(100); @@ -1266,11 +1271,11 @@ void __cpu_die(unsigned int cpu) printk(KERN_ERR "CPU %u didn't die...\n", cpu); } -__init int setup_additional_cpus(char *s) +static __init int setup_additional_cpus(char *s) { - return get_option(&s, &additional_cpus); + return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; } -__setup("additional_cpus=", setup_additional_cpus); +early_param("additional_cpus", setup_additional_cpus); #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86_64/kernel/stacktrace.c b/arch/x86_64/kernel/stacktrace.c new file mode 100644 index 000000000000..6026b31d037e --- /dev/null +++ b/arch/x86_64/kernel/stacktrace.c @@ -0,0 +1,55 @@ +/* + * arch/x86_64/kernel/stacktrace.c + * + * Stack trace management functions + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + */ +#include <linux/sched.h> +#include <linux/stacktrace.h> +#include <linux/module.h> +#include <asm/stacktrace.h> + +static void save_stack_warning(void *data, char *msg) +{ +} + +static void +save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) +{ +} + +static int save_stack_stack(void *data, char *name) +{ + struct stack_trace *trace = (struct stack_trace *)data; + return trace->all_contexts ? 0 : -1; +} + +static void save_stack_address(void *data, unsigned long addr) +{ + struct stack_trace *trace = (struct stack_trace *)data; + if (trace->skip > 0) { + trace->skip--; + return; + } + if (trace->nr_entries < trace->max_entries - 1) + trace->entries[trace->nr_entries++] = addr; +} + +static struct stacktrace_ops save_stack_ops = { + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address, +}; + +/* + * Save stack-backtrace addresses into a stack_trace buffer. + */ +void save_stack_trace(struct stack_trace *trace, struct task_struct *task) +{ + dump_trace(task, NULL, NULL, &save_stack_ops, trace); + trace->entries[trace->nr_entries++] = ULONG_MAX; +} +EXPORT_SYMBOL(save_stack_trace); + diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index ecbd34c1093d..91f7e678bae7 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -7,7 +7,6 @@ * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> */ -#include <linux/config.h> #include <linux/smp.h> #include <linux/suspend.h> #include <asm/proto.h> diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S index 320b6fb00cca..bfbe00763c68 100644 --- a/arch/x86_64/kernel/suspend_asm.S +++ b/arch/x86_64/kernel/suspend_asm.S @@ -54,7 +54,7 @@ ENTRY(restore_image) movq %rcx, %cr3; movq %rax, %cr4; # turn PGE back on - movq pagedir_nosave(%rip), %rdx + movq restore_pblist(%rip), %rdx loop: testq %rdx, %rdx jz done diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c index 7c176b3edde0..213fd6ab789d 100644 --- a/arch/x86_64/kernel/syscall.c +++ b/arch/x86_64/kernel/syscall.c @@ -3,7 +3,6 @@ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> -#include <linux/config.h> #define __NO_STUBS diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c new file mode 100644 index 000000000000..cbabfdf78e06 --- /dev/null +++ b/arch/x86_64/kernel/tce.c @@ -0,0 +1,194 @@ +/* + * This file manages the translation entries for the IBM Calgary IOMMU. + * + * Derived from arch/powerpc/platforms/pseries/iommu.c + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Jon Mason <jdmason@us.ibm.com> + * Author: Muli Ben-Yehuda <muli@il.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/bootmem.h> +#include <asm/tce.h> +#include <asm/calgary.h> +#include <asm/proto.h> + +/* flush a tce at 'tceaddr' to main memory */ +static inline void flush_tce(void* tceaddr) +{ + /* a single tce can't cross a cache line */ + if (cpu_has_clflush) + asm volatile("clflush (%0)" :: "r" (tceaddr)); + else + asm volatile("wbinvd":::"memory"); +} + +void tce_build(struct iommu_table *tbl, unsigned long index, + unsigned int npages, unsigned long uaddr, int direction) +{ + u64* tp; + u64 t; + u64 rpn; + + t = (1 << TCE_READ_SHIFT); + if (direction != DMA_TO_DEVICE) + t |= (1 << TCE_WRITE_SHIFT); + + tp = ((u64*)tbl->it_base) + index; + + while (npages--) { + rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; + t &= ~TCE_RPN_MASK; + t |= (rpn << TCE_RPN_SHIFT); + + *tp = cpu_to_be64(t); + flush_tce(tp); + + uaddr += PAGE_SIZE; + tp++; + } +} + +void tce_free(struct iommu_table *tbl, long index, unsigned int npages) +{ + u64* tp; + + tp = ((u64*)tbl->it_base) + index; + + while (npages--) { + *tp = cpu_to_be64(0); + flush_tce(tp); + tp++; + } +} + +static inline unsigned int table_size_to_number_of_entries(unsigned char size) +{ + /* + * size is the order of the table, 0-7 + * smallest table is 8K entries, so shift result by 13 to + * multiply by 8K + */ + return (1 << size) << 13; +} + +static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) +{ + unsigned int bitmapsz; + unsigned long bmppages; + int ret; + + tbl->it_busno = dev->bus->number; + + /* set the tce table size - measured in entries */ + tbl->it_size = table_size_to_number_of_entries(specified_table_size); + + /* + * number of bytes needed for the bitmap size in number of + * entries; we need one bit per entry + */ + bitmapsz = tbl->it_size / BITS_PER_BYTE; + bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz)); + if (!bmppages) { + printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); + ret = -ENOMEM; + goto done; + } + + tbl->it_map = (unsigned long*)bmppages; + + memset(tbl->it_map, 0, bitmapsz); + + tbl->it_hint = 0; + + spin_lock_init(&tbl->it_lock); + + return 0; + +done: + return ret; +} + +int build_tce_table(struct pci_dev *dev, void __iomem *bbar) +{ + struct iommu_table *tbl; + int ret; + + if (dev->sysdata) { + printk(KERN_ERR "Calgary: dev %p has sysdata %p\n", + dev, dev->sysdata); + BUG(); + } + + tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); + if (!tbl) { + printk(KERN_ERR "Calgary: error allocating iommu_table\n"); + ret = -ENOMEM; + goto done; + } + + ret = tce_table_setparms(dev, tbl); + if (ret) + goto free_tbl; + + tbl->bbar = bbar; + + /* + * NUMA is already using the bus's sysdata pointer, so we use + * the bus's pci_dev's sysdata instead. + */ + dev->sysdata = tbl; + + return 0; + +free_tbl: + kfree(tbl); +done: + return ret; +} + +void* alloc_tce_table(void) +{ + unsigned int size; + + size = table_size_to_number_of_entries(specified_table_size); + size *= TCE_ENTRY_SIZE; + + return __alloc_bootmem_low(size, size, 0); +} + +void free_tce_table(void *tbl) +{ + unsigned int size; + + if (!tbl) + return; + + size = table_size_to_number_of_entries(specified_table_size); + size *= TCE_ENTRY_SIZE; + + free_bootmem(__pa(tbl), size); +} diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7392570f975d..7ea3bf2a858c 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -8,7 +8,7 @@ * Copyright (c) 1995 Markus Kuhn * Copyright (c) 1996 Ingo Molnar * Copyright (c) 1998 Andrea Arcangeli - * Copyright (c) 2002 Vojtech Pavlik + * Copyright (c) 2002,2006 Vojtech Pavlik * Copyright (c) 2003 Andi Kleen * RTC support code taken from arch/i386/kernel/timers/time_hpet.c */ @@ -24,10 +24,13 @@ #include <linux/device.h> #include <linux/sysdev.h> #include <linux/bcd.h> +#include <linux/notifier.h> +#include <linux/cpu.h> #include <linux/kallsyms.h> #include <linux/acpi.h> #ifdef CONFIG_ACPI #include <acpi/achware.h> /* for PM timer frequency */ +#include <acpi/acpi_bus.h> #endif #include <asm/8253pit.h> #include <asm/pgtable.h> @@ -38,9 +41,7 @@ #include <asm/sections.h> #include <linux/cpufreq.h> #include <linux/hpet.h> -#ifdef CONFIG_X86_LOCAL_APIC #include <asm/apic.h> -#endif #ifdef CONFIG_CPU_FREQ static void cpufreq_delayed_get(void); @@ -48,17 +49,24 @@ static void cpufreq_delayed_get(void); extern void i8254_timer_resume(void); extern int using_apic_timer; -static char *time_init_gtod(void); +static char *timename = NULL; DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); int nohpet __initdata = 0; static int notsc __initdata = 0; -#undef HPET_HACK_ENABLE_DANGEROUS +#define USEC_PER_TICK (USEC_PER_SEC / HZ) +#define NSEC_PER_TICK (NSEC_PER_SEC / HZ) +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) + +#define NS_SCALE 10 /* 2^10, carefully chosen */ +#define US_SCALE 32 /* 2^32, arbitralrily chosen */ unsigned int cpu_khz; /* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz); static unsigned long hpet_period; /* fsecs / HPET clock */ unsigned long hpet_tick; /* HPET clocks / interrupt */ int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ @@ -90,7 +98,7 @@ static inline unsigned int do_gettimeoffset_tsc(void) t = get_cycles_sync(); if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; + x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; return x; } @@ -98,7 +106,7 @@ static inline unsigned int do_gettimeoffset_hpet(void) { /* cap counter read to one tick to avoid inconsistencies */ unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> 32; + return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; } unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; @@ -118,7 +126,7 @@ void do_gettimeofday(struct timeval *tv) seq = read_seqbegin(&xtime_lock); sec = xtime.tv_sec; - usec = xtime.tv_nsec / 1000; + usec = xtime.tv_nsec / NSEC_PER_USEC; /* i386 does some correction here to keep the clock monotonous even when ntpd is fixing drift. @@ -129,14 +137,14 @@ void do_gettimeofday(struct timeval *tv) in arch/x86_64/kernel/vsyscall.c and export all needed variables in vmlinux.lds. -AK */ - t = (jiffies - wall_jiffies) * (1000000L / HZ) + + t = (jiffies - wall_jiffies) * USEC_PER_TICK + do_gettimeoffset(); usec += t; } while (read_seqretry(&xtime_lock, seq)); - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + tv->tv_sec = sec + usec / USEC_PER_SEC; + tv->tv_usec = usec % USEC_PER_SEC; } EXPORT_SYMBOL(do_gettimeofday); @@ -157,8 +165,8 @@ int do_settimeofday(struct timespec *tv) write_seqlock_irq(&xtime_lock); - nsec -= do_gettimeoffset() * 1000 + - (jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); + nsec -= do_gettimeoffset() * NSEC_PER_USEC + + (jiffies - wall_jiffies) * NSEC_PER_TICK; wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); @@ -179,20 +187,15 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - /* Assume the lock function has either no stack frame or only a single - word. This checks if the address on the stack looks like a kernel - text address. - There is a small window for false hits, but in that case the tick - is just accounted to the spinlock function. - Better would be to write these functions in assembler again - and check exactly. */ - if (in_lock_functions(pc)) { - char *v = *(char **)regs->rsp; - if ((v >= _stext && v <= _etext) || - (v >= _sinittext && v <= _einittext) || - (v >= (char *)MODULES_VADDR && v <= (char *)MODULES_END)) - return (unsigned long)v; - return ((unsigned long *)regs->rsp)[1]; + /* Assume the lock function has either no stack frame or a copy + of eflags from PUSHF + Eflags always has bits 22 and up cleared unlike kernel addresses. */ + if (!user_mode(regs) && in_lock_functions(pc)) { + unsigned long *sp = (unsigned long *)regs->rsp; + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; } return pc; } @@ -273,6 +276,7 @@ static void set_rtc_mmss(unsigned long nowtime) * Note: This function is required to return accurate * time even in the absence of multiple timer ticks. */ +static inline unsigned long long cycles_2_ns(unsigned long long cyc); unsigned long long monotonic_clock(void) { unsigned long seq; @@ -288,7 +292,7 @@ unsigned long long monotonic_clock(void) this_offset = hpet_readl(HPET_COUNTER); } while (read_seqretry(&xtime_lock, seq)); offset = (this_offset - last_offset); - offset *= (NSEC_PER_SEC/HZ) / hpet_tick; + offset *= NSEC_PER_TICK / hpet_tick; } else { do { seq = read_seqbegin(&xtime_lock); @@ -297,7 +301,7 @@ unsigned long long monotonic_clock(void) base = monotonic_base; } while (read_seqretry(&xtime_lock, seq)); this_offset = get_cycles_sync(); - offset = (this_offset - last_offset)*1000 / cpu_khz; + offset = cycles_2_ns(this_offset - last_offset); } return base + offset; } @@ -382,7 +386,7 @@ void main_timer_handler(struct pt_regs *regs) } monotonic_base += - (offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; + (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; vxtime.last = offset; #ifdef CONFIG_X86_PM_TIMER @@ -391,36 +395,36 @@ void main_timer_handler(struct pt_regs *regs) #endif } else { offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); + vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; if (offset < 0) offset = 0; - if (offset > (USEC_PER_SEC / HZ)) { - lost = offset / (USEC_PER_SEC / HZ); - offset %= (USEC_PER_SEC / HZ); + if (offset > USEC_PER_TICK) { + lost = offset / USEC_PER_TICK; + offset %= USEC_PER_TICK; } - monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; + monotonic_base += cycles_2_ns(tsc - vxtime.last_tsc); vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> 32) < offset) + vxtime.tsc_quot) >> US_SCALE) < offset) vxtime.last_tsc = tsc - - (((long) offset << 32) / vxtime.tsc_quot) - 1; + (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; } - if (lost > 0) { + if (lost > 0) handle_lost_ticks(lost, regs); - jiffies += lost; - } + else + lost = 0; /* * Do the timer stuff. */ - do_timer(regs); + do_timer(lost + 1); #ifndef CONFIG_SMP update_process_times(user_mode(regs)); #endif @@ -431,12 +435,8 @@ void main_timer_handler(struct pt_regs *regs) * have to call the local interrupt handler. */ -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else if (!using_apic_timer) smp_local_timer_interrupt(regs); -#endif /* * If we have an externally synchronized Linux clock, then update CMOS clock @@ -460,24 +460,21 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) if (apic_runs_main_timer > 1) return IRQ_HANDLED; main_timer_handler(regs); -#ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) smp_send_timer_broadcast_ipi(); -#endif return IRQ_HANDLED; } static unsigned int cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) { - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; + cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; } static inline unsigned long long cycles_2_ns(unsigned long long cyc) { - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; + return (cyc * cyc2ns_scale) >> NS_SCALE; } unsigned long long sched_clock(void) @@ -490,7 +487,7 @@ unsigned long long sched_clock(void) Disadvantage is a small drift between CPUs in some configurations, but that should be tolerable. */ if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; + return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; #endif /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, @@ -633,7 +630,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; } set_cyc2ns_scale(cpu_khz_ref); @@ -789,8 +786,8 @@ static int hpet_timer_stop_set_go(unsigned long tick) if (hpet_use_timer) { hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); - hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ cfg |= HPET_CFG_LEGACY; } /* @@ -825,8 +822,7 @@ static int hpet_init(void) if (hpet_period < 100000 || hpet_period > 100000000) return -1; - hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / - hpet_period; + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; hpet_use_timer = (id & HPET_ID_LEGSUP); @@ -882,26 +878,20 @@ int __init time_setup(char *str) } static struct irqaction irq0 = { - timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; +static int __cpuinit +time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned cpu = (unsigned long) hcpu; + if (action == CPU_ONLINE) + vsyscall_set_cpu(cpu); + return NOTIFY_DONE; +} + void __init time_init(void) { - char *timename; - char *gtod; - -#ifdef HPET_HACK_ENABLE_DANGEROUS - if (!vxtime.hpet_address) { - printk(KERN_WARNING "time.c: WARNING: Enabling HPET base " - "manually!\n"); - outl(0x800038a0, 0xcf8); - outl(0xff000001, 0xcfc); - outl(0x800038a0, 0xcf8); - vxtime.hpet_address = inl(0xcfc) & 0xfffffffe; - printk(KERN_WARNING "time.c: WARNING: Enabled HPET " - "at %#lx.\n", vxtime.hpet_address); - } -#endif if (nohpet) vxtime.hpet_address = 0; @@ -912,7 +902,7 @@ void __init time_init(void) -xtime.tv_sec, -xtime.tv_nsec); if (!hpet_init()) - vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period; + vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; else vxtime.hpet_address = 0; @@ -935,18 +925,17 @@ void __init time_init(void) } vxtime.mode = VXTIME_TSC; - gtod = time_init_gtod(); - - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); - printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (1000000L << 32) / vxtime_hz; - vxtime.tsc_quot = (1000L << 32) / cpu_khz; + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; vxtime.last_tsc = get_cycles_sync(); + set_cyc2ns_scale(cpu_khz); setup_irq(0, &irq0); + hotcpu_notifier(time_cpu_notifier, 0); + time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id()); - set_cyc2ns_scale(cpu_khz); +#ifndef CONFIG_SMP + time_init_gtod(); +#endif } /* @@ -956,13 +945,20 @@ void __init time_init(void) __cpuinit int unsynchronized_tsc(void) { #ifdef CONFIG_SMP - if (oem_force_hpet_timer()) + if (apic_is_clustered_box()) return 1; - /* Intel systems are normally all synchronized. Exceptions - are handled in the OEM check above. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - return 0; #endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) + return 1; +#endif + return 0; + } + /* Assume multi socket systems are not synchronized */ return num_present_cpus() > 1; } @@ -970,12 +966,18 @@ __cpuinit int unsynchronized_tsc(void) /* * Decide what mode gettimeofday should use. */ -__init static char *time_init_gtod(void) +void time_init_gtod(void) { char *timetype; if (unsynchronized_tsc()) notsc = 1; + + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; + if (vxtime.hpet_address && notsc) { timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; if (hpet_use_timer) @@ -998,7 +1000,16 @@ __init static char *time_init_gtod(void) timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; vxtime.mode = VXTIME_TSC; } - return timetype; + + printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", + vxtime_hz / 1000000, vxtime_hz % 1000000, timename, timetype); + printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); + vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; + vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; + vxtime.last_tsc = get_cycles_sync(); + + set_cyc2ns_scale(cpu_khz); } __setup("report_lost_ticks", time_setup); @@ -1028,8 +1039,16 @@ static int timer_resume(struct sys_device *dev) unsigned long flags; unsigned long sec; unsigned long ctime = get_cmos_time(); - unsigned long sleep_length = (ctime - sleep_start) * HZ; + long sleep_length = (ctime - sleep_start) * HZ; + if (sleep_length < 0) { + printk(KERN_WARNING "Time skew detected in timer resume!\n"); + /* The time after the resume must not be earlier than the time + * before the suspend or some nasty things will happen + */ + sleep_length = 0; + ctime = sleep_start; + } if (vxtime.hpet_address) hpet_reenable(); else @@ -1145,23 +1164,25 @@ int hpet_rtc_timer_init(void) hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; local_irq_save(flags); + cnt = hpet_readl(HPET_COUNTER); cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); hpet_writel(cnt, HPET_T1_CMP); hpet_t1_cmp = cnt; - local_irq_restore(flags); cfg = hpet_readl(HPET_T1_CFG); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; hpet_writel(cfg, HPET_T1_CFG); + local_irq_restore(flags); + return 1; } static void hpet_rtc_timer_reinit(void) { - unsigned int cfg, cnt; + unsigned int cfg, cnt, ticks_per_int, lost_ints; if (unlikely(!(PIE_on | AIE_on | UIE_on))) { cfg = hpet_readl(HPET_T1_CFG); @@ -1176,10 +1197,33 @@ static void hpet_rtc_timer_reinit(void) hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; /* It is more accurate to use the comparator value than current count.*/ - cnt = hpet_t1_cmp; - cnt += hpet_tick*HZ/hpet_rtc_int_freq; - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; + ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; + hpet_t1_cmp += ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + /* + * If the interrupt handler was delayed too long, the write above tries + * to schedule the next interrupt in the past and the hardware would + * not interrupt until the counter had wrapped around. + * So we have to check that the comparator wasn't set to a past time. + */ + cnt = hpet_readl(HPET_COUNTER); + if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { + lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; + /* Make sure that, even with the time needed to execute + * this code, the next scheduled interrupt has been moved + * back to the future: */ + lost_ints++; + + hpet_t1_cmp += lost_ints * ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + if (PIE_on) + PIE_count += lost_ints; + + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); + } } /* diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S index 23a03eb91fc7..c79b99a9e2f6 100644 --- a/arch/x86_64/kernel/trampoline.S +++ b/arch/x86_64/kernel/trampoline.S @@ -64,7 +64,7 @@ idt_48: .word 0, 0 # idt base = 0L gdt_48: - .short __KERNEL32_CS + 7 # gdt limit + .short GDT_ENTRIES*8 - 1 # gdt limit .long cpu_gdt_table-__START_KERNEL_map .globl trampoline_end diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index cea335e8746c..01f2a8d254c2 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -6,15 +6,12 @@ * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $ */ /* * 'Traps.c' handles hardware traps and faults after we have saved some * state in 'entry.S'. */ -#include <linux/config.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/string.h> @@ -31,6 +28,7 @@ #include <linux/nmi.h> #include <linux/kprobes.h> #include <linux/kexec.h> +#include <linux/unwind.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -41,12 +39,13 @@ #include <asm/i387.h> #include <asm/kdebug.h> #include <asm/processor.h> - +#include <asm/unwind.h> #include <asm/smp.h> #include <asm/pgalloc.h> #include <asm/pda.h> #include <asm/proto.h> #include <asm/nmi.h> +#include <asm/stacktrace.h> asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -71,19 +70,20 @@ asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); ATOMIC_NOTIFIER_HEAD(die_chain); +EXPORT_SYMBOL(die_chain); int register_die_notifier(struct notifier_block *nb) { vmalloc_sync_all(); return atomic_notifier_chain_register(&die_chain, nb); } -EXPORT_SYMBOL(register_die_notifier); +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&die_chain, nb); } -EXPORT_SYMBOL(unregister_die_notifier); +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ static inline void conditional_sti(struct pt_regs *regs) { @@ -107,35 +107,43 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) preempt_enable_no_resched(); } -static int kstack_depth_to_print = 10; +static int kstack_depth_to_print = 12; +#ifdef CONFIG_STACK_UNWIND +static int call_trace = 1; +#else +#define call_trace (-1) +#endif #ifdef CONFIG_KALLSYMS -#include <linux/kallsyms.h> -int printk_address(unsigned long address) -{ +# include <linux/kallsyms.h> +void printk_address(unsigned long address) +{ unsigned long offset = 0, symsize; const char *symname; char *modname; - char *delim = ":"; + char *delim = ":"; char namebuf[128]; - symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); - if (!symname) - return printk("[<%016lx>]", address); - if (!modname) + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%016lx>]\n", address); + return; + } + if (!modname) modname = delim = ""; - return printk("<%016lx>{%s%s%s%s%+ld}", - address, delim, modname, delim, symname, offset); -} + printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", + address, delim, modname, delim, symname, offset, symsize); +} #else -int printk_address(unsigned long address) -{ - return printk("[<%016lx>]", address); -} +void printk_address(unsigned long address) +{ + printk(" [<%016lx>]\n", address); +} #endif static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, const char **idp) + unsigned *usedp, char **idp) { static char ids[][8] = { [DEBUG_STACK - 1] = "#DB", @@ -149,32 +157,49 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, }; unsigned k; + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end; - - switch (k + 1) { -#if DEBUG_STKSZ > EXCEPTION_STKSZ - case DEBUG_STACK: - end = cpu_pda(cpu)->debugstack + DEBUG_STKSZ; - break; -#endif - default: - end = per_cpu(init_tss, cpu).ist[k]; - break; - } + unsigned long end = per_cpu(orig_ist, cpu).ist[k]; + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ if (stack >= end) continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ if (stack >= end - EXCEPTION_STKSZ) { + /* + * Make sure we only iterate through an exception + * stack once. If it comes up for the second time + * then there's something wrong going on - just + * break out and return NULL: + */ if (*usedp & (1U << k)) break; *usedp |= 1U << k; *idp = ids[k]; return (unsigned long *)end; } + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ #if DEBUG_STKSZ > EXCEPTION_STKSZ if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { unsigned j = N_EXCEPTION_STACKS - 1; + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ do { ++j; end -= EXCEPTION_STKSZ; @@ -191,6 +216,25 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + +static int dump_trace_unwind(struct unwind_frame_info *info, void *context) +{ + struct ops_and_data *oad = (struct ops_and_data *)context; + int n = 0; + + while (unwind(info) == 0 && UNW_PC(info)) { + n++; + oad->ops->address(oad->data, UNW_PC(info)); + if (arch_unw_user_mode(info)) + break; + } + return n; +} + /* * x86-64 can have upto three kernel stacks: * process stack @@ -198,25 +242,66 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void show_trace(unsigned long *stack) +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, + struct stacktrace_ops *ops, void *data) { - const unsigned cpu = safe_smp_processor_id(); + const unsigned cpu = smp_processor_id(); unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; - int i; unsigned used = 0; - printk("\nCall Trace:"); + if (!tsk) + tsk = current; + + if (call_trace >= 0) { + int unw_ret = 0; + struct unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; + + if (regs) { + if (unwind_init_frame_info(&info, tsk, regs) == 0) + unw_ret = dump_trace_unwind(&info, &oad); + } else if (tsk == current) + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); + else { + if (unwind_init_blocked(&info, tsk) == 0) + unw_ret = dump_trace_unwind(&info, &oad); + } + if (unw_ret > 0) { + if (call_trace == 1 && !arch_unw_user_mode(&info)) { + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", + UNW_PC(&info)); + if ((long)UNW_SP(&info) < 0) { + ops->warning(data, "Leftover inexact backtrace:\n"); + stack = (unsigned long *)UNW_SP(&info); + if (!stack) + return; + } else + ops->warning(data, "Full inexact backtrace again:\n"); + } else if (call_trace >= 1) + return; + else + ops->warning(data, "Full inexact backtrace again:\n"); + } else + ops->warning(data, "Inexact backtrace:\n"); + } + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (tsk && tsk != current) + stack = (unsigned long *)tsk->thread.rsp; + } + /* + * Print function call entries within a stack. 'cond' is the + * "end of stackframe" condition, that the 'stack++' + * iteration will eventually trigger. + */ #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ - if (kernel_text_address(addr)) { \ - if (i > 50) { \ - printk("\n "); \ - i = 0; \ - } \ - else \ - i += printk(" "); \ + if (oops_in_progress ? \ + __kernel_text_address(addr) : \ + kernel_text_address(addr)) { \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed \ @@ -225,20 +310,31 @@ void show_trace(unsigned long *stack) * down the cause of the crash will be able to figure \ * out the call path that was taken. \ */ \ - i += printk_address(addr); \ + ops->address(data, addr); \ } \ } while (0) - for(i = 11; ; ) { - const char *id; + /* + * Print function call entries in all stacks, starting at the + * current stack address. If the stacks consist of nested + * exceptions + */ + for (;;) { + char *id; unsigned long *estack_end; estack_end = in_exception_stack(cpu, (unsigned long)stack, &used, &id); if (estack_end) { - i += printk(" <%s>", id); + if (ops->stack(data, id) < 0) + break; HANDLE_STACK (stack < estack_end); - i += printk(" <EOE>"); + ops->stack(data, "<EOE>"); + /* + * We link to the next stack via the + * second-to-last pointer (index -2 to end) in the + * exception stack: + */ stack = (unsigned long *) estack_end[-2]; continue; } @@ -248,27 +344,75 @@ void show_trace(unsigned long *stack) (IRQSTACKSIZE - 64) / sizeof(*irqstack); if (stack >= irqstack && stack < irqstack_end) { - i += printk(" <IRQ>"); + if (ops->stack(data, "IRQ") < 0) + break; HANDLE_STACK (stack < irqstack_end); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ stack = (unsigned long *) (irqstack_end[-1]); irqstack_end = NULL; - i += printk(" <EOI>"); + ops->stack(data, "EOI"); continue; } } break; } + /* + * This handles the process stack: + */ HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); #undef HANDLE_STACK +} +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + print_symbol(msg, symbol); printk("\n"); } -void show_stack(struct task_struct *tsk, unsigned long * rsp) +static void print_trace_warning(void *data, char *msg) +{ + printk("%s\n", msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +static void print_trace_address(void *data, unsigned long addr) +{ + printk_address(addr); +} + +static struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +void +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) +{ + printk("\nCall Trace:\n"); + dump_trace(tsk, regs, stack, &print_trace_ops, NULL); + printk("\n"); +} + +static void +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) { unsigned long *stack; int i; - const int cpu = safe_smp_processor_id(); + const int cpu = smp_processor_id(); unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); @@ -294,11 +438,16 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp) break; } if (i && ((i % 4) == 0)) - printk("\n "); - printk("%016lx ", *stack++); + printk("\n"); + printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace((unsigned long *)rsp); + show_trace(tsk, regs, rsp); +} + +void show_stack(struct task_struct *tsk, unsigned long * rsp) +{ + _show_stack(tsk, NULL, rsp); } /* @@ -307,7 +456,7 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp) void dump_stack(void) { unsigned long dummy; - show_trace(&dummy); + show_trace(NULL, NULL, &dummy); } EXPORT_SYMBOL(dump_stack); @@ -317,7 +466,7 @@ void show_registers(struct pt_regs *regs) int i; int in_kernel = !user_mode(regs); unsigned long rsp; - const int cpu = safe_smp_processor_id(); + const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; rsp = regs->rsp; @@ -334,7 +483,7 @@ void show_registers(struct pt_regs *regs) if (in_kernel) { printk("Stack: "); - show_stack(NULL, (unsigned long*)rsp); + _show_stack(NULL, regs, (unsigned long*)rsp); printk("\nCode: "); if (regs->rip < PAGE_OFFSET) @@ -383,6 +532,7 @@ void out_of_line_bug(void) { BUG(); } +EXPORT_SYMBOL(out_of_line_bug); #endif static DEFINE_SPINLOCK(die_lock); @@ -391,9 +541,11 @@ static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { - int cpu = safe_smp_processor_id(); + int cpu = smp_processor_id(); unsigned long flags; + oops_enter(); + /* racy, but better than risking deadlock. */ local_irq_save(flags); if (!spin_trylock(&die_lock)) { @@ -421,7 +573,8 @@ void __kprobes oops_end(unsigned long flags) /* Nest count reaches zero, release the lock. */ spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) - panic("Oops"); + panic("Fatal exception"); + oops_exit(); } void __kprobes __die(const char * str, struct pt_regs * regs, long err) @@ -458,7 +611,7 @@ void die(const char * str, struct pt_regs * regs, long err) do_exit(SIGSEGV); } -void __kprobes die_nmi(char *str, struct pt_regs *regs) +void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) { unsigned long flags = oops_begin(); @@ -466,13 +619,12 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs) * We are in trouble anyway, lets at least try * to get a message out. */ - printk(str, safe_smp_processor_id()); + printk(str, smp_processor_id()); show_registers(regs); if (kexec_should_crash(current)) crash_kexec(regs); - if (panic_on_timeout || panic_on_oops) - panic("nmi watchdog"); - printk("console shuts up ...\n"); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); oops_end(flags); nmi_exit(); local_irq_enable(); @@ -618,8 +770,15 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, static __kprobes void mem_parity_error(unsigned char reason, struct pt_regs * regs) { - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); - printk("You probably have a hardware problem with your RAM chips\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", + reason); + printk(KERN_EMERG "You probably have a hardware problem with your " + "RAM chips\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. */ reason = (reason & 0xf) | 4; @@ -642,9 +801,15 @@ io_check_error(unsigned char reason, struct pt_regs * regs) static __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); +{ + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", + reason); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } /* Runs on IST stack. This code must keep interrupts off all the time. @@ -664,17 +829,15 @@ asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; -#ifdef CONFIG_X86_LOCAL_APIC /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog > 0) { - nmi_watchdog_tick(regs,reason); + if (nmi_watchdog_tick(regs,reason)) return; - } -#endif - unknown_nmi_error(reason, regs); + if (!do_nmi_callback(regs,cpu)) + unknown_nmi_error(reason, regs); + return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -959,6 +1122,7 @@ asmlinkage void math_state_restore(void) init_fpu(me); restore_fpu_checking(&me->thread.i387.fxsave); task_thread_info(me)->status |= TS_USEDFPU; + me->fpu_counter++; } void __init trap_init(void) @@ -997,18 +1161,39 @@ void __init trap_init(void) } -/* Actual parsing is done early in setup.c. */ -static int __init oops_dummy(char *s) +static int __init oops_setup(char *s) { - panic_on_oops = 1; - return 1; + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; } -__setup("oops=", oops_dummy); +early_param("oops", oops_setup); static int __init kstack_setup(char *s) { + if (!s) + return -EINVAL; kstack_depth_to_print = simple_strtoul(s,NULL,0); - return 1; + return 0; } -__setup("kstack=", kstack_setup); +early_param("kstack", kstack_setup); +#ifdef CONFIG_STACK_UNWIND +static int __init call_trace_setup(char *s) +{ + if (!s) + return -EINVAL; + if (strcmp(s, "old") == 0) + call_trace = -1; + else if (strcmp(s, "both") == 0) + call_trace = 0; + else if (strcmp(s, "newfallback") == 0) + call_trace = 1; + else if (strcmp(s, "new") == 0) + call_trace = 2; + return 0; +} +early_param("call_trace", call_trace_setup); +#endif diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index b81f473c4a19..f8aeccf105fa 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -6,7 +6,6 @@ #include <asm-generic/vmlinux.lds.h> #include <asm/page.h> -#include <linux/config.h> #undef i386 /* in case the preprocessor is a 32bit one */ @@ -14,6 +13,12 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) jiffies_64 = jiffies; +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(4); /* R__ */ +} SECTIONS { . = __START_KERNEL; @@ -32,7 +37,7 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } = 0x9090 + } :text = 0x9090 /* out-of-line lock text */ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } @@ -45,21 +50,23 @@ SECTIONS RODATA +#ifdef CONFIG_STACK_UNWIND + . = ALIGN(8); + .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { + __start_unwind = .; + *(.eh_frame) + __end_unwind = .; + } +#endif + /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) CONSTRUCTORS - } + } :data _edata = .; /* End of data section */ - __bss_start = .; /* BSS */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - *(.bss.page_aligned) - *(.bss) - } - __bss_stop = .; - . = ALIGN(PAGE_SIZE); . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { @@ -81,7 +88,7 @@ SECTIONS #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user __vsyscall_0 = VSYSCALL_VIRT_ADDR; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); @@ -91,6 +98,9 @@ SECTIONS .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } vxtime = VVIRT(.vxtime); + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } wall_jiffies = VVIRT(.wall_jiffies); @@ -124,13 +134,33 @@ SECTIONS . = ALIGN(8192); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) - } + } :data . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { *(.data.page_aligned) } + /* might get freed after init */ + . = ALIGN(4096); + __smp_alt_begin = .; + __smp_alt_instructions = .; + .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { + *(.smp_altinstructions) + } + __smp_alt_instructions_end = .; + . = ALIGN(8); + __smp_locks = .; + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + *(.smp_locks) + } + __smp_locks_end = .; + .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { + *(.smp_altinstr_replacement) + } + . = ALIGN(4096); + __smp_alt_end = .; + . = ALIGN(4096); /* Init code and data */ __init_begin = .; .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { @@ -179,14 +209,12 @@ SECTIONS __initramfs_start = .; .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; - /* temporary here to work around NR_CPUS. If you see this comment in 2.6.17+ - complain */ - . = ALIGN(4096); - __init_end = .; - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); __per_cpu_start = .; .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; + . = ALIGN(4096); + __init_end = .; . = ALIGN(4096); __nosave_begin = .; @@ -194,6 +222,13 @@ SECTIONS . = ALIGN(4096); __nosave_end = .; + __bss_start = .; /* BSS */ + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + *(.bss.page_aligned) + *(.bss) + } + __bss_stop = .; + _end = . ; /* Sections to be discarded */ diff --git a/arch/x86_64/kernel/vsmp.c b/arch/x86_64/kernel/vsmp.c index 92f70c74965f..044e852bd25e 100644 --- a/arch/x86_64/kernel/vsmp.c +++ b/arch/x86_64/kernel/vsmp.c @@ -20,6 +20,9 @@ static int __init vsmp_init(void) void *address; unsigned int cap, ctl; + if (!early_pci_allowed()) + return 0; + /* Check if we are running on a ScaleMP vSMP box */ if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 9468fb20b0bc..07c086382059 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -26,6 +26,7 @@ #include <linux/seqlock.h> #include <linux/jiffies.h> #include <linux/sysctl.h> +#include <linux/getcpu.h> #include <asm/vsyscall.h> #include <asm/pgtable.h> @@ -33,11 +34,15 @@ #include <asm/fixmap.h> #include <asm/errno.h> #include <asm/io.h> +#include <asm/segment.h> +#include <asm/desc.h> +#include <asm/topology.h> #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) int __sysctl_vsyscall __section_sysctl_vsyscall = 1; seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +int __vgetcpu_mode __section_vgetcpu_mode; #include <asm/unistd.h> @@ -72,7 +77,8 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) __vxtime.tsc_quot) >> 32; /* See comment in x86_64 do_gettimeofday. */ } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - + usec += ((readl((void __iomem *) + fix_to_virt(VSYSCALL_HPET) + 0xf0) - __vxtime.last) * __vxtime.quot) >> 32; } } while (read_seqretry(&__xtime_lock, sequence)); @@ -107,7 +113,7 @@ static __always_inline long time_syscall(long *t) int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { - if (unlikely(!__sysctl_vsyscall)) + if (!__sysctl_vsyscall) return gettimeofday(tv,tz); if (tv) do_vgettimeofday(tv); @@ -120,16 +126,53 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - if (unlikely(!__sysctl_vsyscall)) + if (!__sysctl_vsyscall) return time_syscall(t); else if (t) *t = __xtime.tv_sec; return __xtime.tv_sec; } -long __vsyscall(2) venosys_0(void) +/* Fast way to get current CPU and node. + This helps to do per node and per CPU caches in user space. + The result is not guaranteed without CPU affinity, but usually + works out because the scheduler tries to keep a thread on the same + CPU. + + tcache must point to a two element sized long array. + All arguments can be NULL. */ +long __vsyscall(2) +vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) { - return -ENOSYS; + unsigned int dummy, p; + unsigned long j = 0; + + /* Fast cache - only recompute value once per jiffies and avoid + relatively costly rdtscp/cpuid otherwise. + This works because the scheduler usually keeps the process + on the same CPU and this syscall doesn't guarantee its + results anyways. + We do this here because otherwise user space would do it on + its own in a likely inferior way (no access to jiffies). + If you don't like it pass NULL. */ + if (tcache && tcache->blob[0] == (j = __jiffies)) { + p = tcache->blob[1]; + } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + /* Load per CPU data from RDTSCP */ + rdtscp(dummy, dummy, p); + } else { + /* Load per CPU data from GDT */ + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + } + if (tcache) { + tcache->blob[0] = j; + tcache->blob[1] = p; + } + if (cpu) + *cpu = p & 0xfff; + if (node) + *node = p >> 12; + return 0; } long __vsyscall(3) venosys_1(void) @@ -149,7 +192,8 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { extern u16 vsysc1, vsysc2; - u16 *map1, *map2; + u16 __iomem *map1; + u16 __iomem *map2; int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); if (!write) return ret; @@ -164,11 +208,11 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, goto out; } if (!sysctl_vsyscall) { - *map1 = SYSCALL; - *map2 = SYSCALL; + writew(SYSCALL, map1); + writew(SYSCALL, map2); } else { - *map1 = NOP2; - *map2 = NOP2; + writew(NOP2, map1); + writew(NOP2, map2); } iounmap(map2); out: @@ -200,6 +244,43 @@ static ctl_table kernel_root_table2[] = { #endif +static void __cpuinit write_rdtscp_cb(void *info) +{ + write_rdtscp_aux((unsigned long)info); +} + +void __cpuinit vsyscall_set_cpu(int cpu) +{ + unsigned long *d; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node[cpu]; +#endif + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { + void *info = (void *)((node << 12) | cpu); + /* Can happen on preemptive kernel */ + if (get_cpu() == cpu) + write_rdtscp_cb(info); +#ifdef CONFIG_SMP + else { + /* the notifier is unfortunately not executed on the + target CPU */ + smp_call_function_single(cpu,write_rdtscp_cb,info,0,1); + } +#endif + put_cpu(); + } + + /* Store cpu number in limit so that it can be loaded quickly + in user space in vgetcpu. + 12 bits for the CPU and 8 bits for the node. */ + d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); + *d = 0x0f40000000000ULL; + *d |= cpu; + *d |= (node & 0xf) << 12; + *d |= (node >> 4) << 48; +} + static void __init map_vsyscall(void) { extern char __vsyscall_0; @@ -214,6 +295,7 @@ static int __init vsyscall_init(void) VSYSCALL_ADDR(__NR_vgettimeofday))); BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); + BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); map_vsyscall(); #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 1def21c9f7cd..c3454af5e3a2 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -1,66 +1,21 @@ +/* Exports for assembly files. + All C exports should go in the respective C files. */ + #include <linux/config.h> #include <linux/module.h> #include <linux/smp.h> -#include <linux/user.h> -#include <linux/sched.h> -#include <linux/in6.h> -#include <linux/interrupt.h> -#include <linux/smp_lock.h> -#include <linux/pm.h> -#include <linux/pci.h> -#include <linux/apm_bios.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/syscalls.h> -#include <linux/tty.h> #include <asm/semaphore.h> #include <asm/processor.h> -#include <asm/i387.h> #include <asm/uaccess.h> -#include <asm/checksum.h> -#include <asm/io.h> -#include <asm/delay.h> -#include <asm/irq.h> -#include <asm/mmx.h> -#include <asm/desc.h> #include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/nmi.h> -#include <asm/kdebug.h> -#include <asm/unistd.h> -#include <asm/tlbflush.h> -#include <asm/kdebug.h> - -extern spinlock_t rtc_lock; -#ifdef CONFIG_SMP -extern void __write_lock_failed(rwlock_t *rw); -extern void __read_lock_failed(rwlock_t *rw); -#endif - -/* platform dependent support */ -EXPORT_SYMBOL(boot_cpu_data); -//EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(ioremap_nocache); -EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(pm_idle); -EXPORT_SYMBOL(pm_power_off); EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); EXPORT_SYMBOL(__down_failed_trylock); EXPORT_SYMBOL(__up_wakeup); -/* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_nocheck); -EXPORT_SYMBOL(ip_compute_csum); -/* Delay loops */ -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay); EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); @@ -71,42 +26,21 @@ EXPORT_SYMBOL(__put_user_2); EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); -EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(clear_user); -EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(copy_user_generic); EXPORT_SYMBOL(copy_from_user); EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(copy_in_user); -EXPORT_SYMBOL(strnlen_user); - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif +EXPORT_SYMBOL(__copy_from_user_inatomic); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); -EXPORT_SYMBOL(_cpu_pda); #ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_data); +extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); +extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); - -EXPORT_SYMBOL(smp_call_function); -EXPORT_SYMBOL(cpu_callout_map); -#endif - -#ifdef CONFIG_VT -EXPORT_SYMBOL(screen_info); #endif -EXPORT_SYMBOL(rtc_lock); - -EXPORT_SYMBOL_GPL(set_nmi_callback); -EXPORT_SYMBOL_GPL(unset_nmi_callback); - /* Export string functions. We normally rely on gcc builtin for most of these, but gcc sometimes decides not to inline them. */ #undef memcpy @@ -114,51 +48,14 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback); #undef memmove extern void * memset(void *,int,__kernel_size_t); -extern size_t strlen(const char *); -extern void * memmove(void * dest,const void *src,size_t count); extern void * memcpy(void *,const void *,__kernel_size_t); extern void * __memcpy(void *,const void *,__kernel_size_t); EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM -/* prototypes are wrong, these are assembly with custom calling functions */ -extern void rwsem_down_read_failed_thunk(void); -extern void rwsem_wake_thunk(void); -extern void rwsem_downgrade_thunk(void); -extern void rwsem_down_write_failed_thunk(void); -EXPORT_SYMBOL(rwsem_down_read_failed_thunk); -EXPORT_SYMBOL(rwsem_wake_thunk); -EXPORT_SYMBOL(rwsem_downgrade_thunk); -EXPORT_SYMBOL(rwsem_down_write_failed_thunk); -#endif - EXPORT_SYMBOL(empty_zero_page); - -EXPORT_SYMBOL(die_chain); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_sibling_map); -EXPORT_SYMBOL(smp_num_siblings); -#endif - -#ifdef CONFIG_BUG -EXPORT_SYMBOL(out_of_line_bug); -#endif - EXPORT_SYMBOL(init_level4_pgt); - -extern unsigned long __supported_pte_mask; -EXPORT_SYMBOL(__supported_pte_mask); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(flush_tlb_page); -#endif - -EXPORT_SYMBOL(cpu_khz); - EXPORT_SYMBOL(load_gs_index); diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index ccef6ae747a3..b78d4170fce2 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ usercopy.o getuser.o putuser.o \ thunk.o clear_page.o copy_page.o bitstr.o bitops.o -lib-y += memcpy.o memmove.o memset.o copy_user.o +lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S index 1f81b79b796c..9a10a78bb4a4 100644 --- a/arch/x86_64/lib/clear_page.S +++ b/arch/x86_64/lib/clear_page.S @@ -1,10 +1,22 @@ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + /* * Zero a page. * rdi page */ - .globl clear_page - .p2align 4 -clear_page: + ALIGN +clear_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + xorl %eax,%eax + rep stosq + ret + CFI_ENDPROC +ENDPROC(clear_page) + +ENTRY(clear_page) + CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -23,28 +35,25 @@ clear_page: jnz .Lloop nop ret -clear_page_end: + CFI_ENDPROC +.Lclear_page_end: +ENDPROC(clear_page) /* Some CPUs run faster using the string instructions. It is also a lot simpler. Use this when possible */ #include <asm/cpufeature.h> + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad clear_page - .quad clear_page_c - .byte X86_FEATURE_REP_GOOD - .byte clear_page_end-clear_page - .byte clear_page_c_end-clear_page_c - .previous - - .section .altinstr_replacement,"ax" -clear_page_c: - movl $4096/8,%ecx - xorl %eax,%eax - rep - stosq - ret -clear_page_c_end: + .quad clear_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lclear_page_end - clear_page + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S index 8fa19d96a7ee..0ebb03b60e79 100644 --- a/arch/x86_64/lib/copy_page.S +++ b/arch/x86_64/lib/copy_page.S @@ -1,17 +1,33 @@ /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */ +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> + + ALIGN +copy_page_c: + CFI_STARTPROC + movl $4096/8,%ecx + rep movsq + ret + CFI_ENDPROC +ENDPROC(copy_page_c) + /* Don't use streaming store because it's better when the target ends up in cache. */ /* Could vary the prefetch distance based on SMP/UP */ - .globl copy_page - .p2align 4 -copy_page: +ENTRY(copy_page) + CFI_STARTPROC subq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET 3*8 movq %rbx,(%rsp) + CFI_REL_OFFSET rbx, 0 movq %r12,1*8(%rsp) + CFI_REL_OFFSET r12, 1*8 movq %r13,2*8(%rsp) + CFI_REL_OFFSET r13, 2*8 movl $(4096/64)-5,%ecx .p2align 4 @@ -72,30 +88,33 @@ copy_page: jnz .Loop2 movq (%rsp),%rbx + CFI_RESTORE rbx movq 1*8(%rsp),%r12 + CFI_RESTORE r12 movq 2*8(%rsp),%r13 + CFI_RESTORE r13 addq $3*8,%rsp + CFI_ADJUST_CFA_OFFSET -3*8 ret +.Lcopy_page_end: + CFI_ENDPROC +ENDPROC(copy_page) /* Some CPUs run faster using the string copy instructions. It is also a lot simpler. Use this when possible */ #include <asm/cpufeature.h> + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad copy_page - .quad copy_page_c - .byte X86_FEATURE_REP_GOOD - .byte copy_page_c_end-copy_page_c - .byte copy_page_c_end-copy_page_c - .previous - - .section .altinstr_replacement,"ax" -copy_page_c: - movl $4096/8,%ecx - rep - movsq - ret -copy_page_c_end: + .quad copy_page + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lcopy_page_end - copy_page + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S index f64569b83b54..70bebd310408 100644 --- a/arch/x86_64/lib/copy_user.S +++ b/arch/x86_64/lib/copy_user.S @@ -4,56 +4,78 @@ * Functions to copy from and to user space. */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + #define FIX_ALIGNMENT 1 - #include <asm/current.h> - #include <asm/asm-offsets.h> - #include <asm/thread_info.h> - #include <asm/cpufeature.h> +#include <asm/current.h> +#include <asm/asm-offsets.h> +#include <asm/thread_info.h> +#include <asm/cpufeature.h> -/* Standard copy_to_user with segment limit checking */ - .globl copy_to_user - .p2align 4 -copy_to_user: - GET_THREAD_INFO(%rax) - movq %rdi,%rcx - addq %rdx,%rcx - jc bad_to_user - cmpq threadinfo_addr_limit(%rax),%rcx - jae bad_to_user -2: + .macro ALTERNATIVE_JUMP feature,orig,alt +0: .byte 0xe9 /* 32bit jump */ - .long .Lcug-1f + .long \orig-1f /* by default jump to orig */ 1: - .section .altinstr_replacement,"ax" -3: .byte 0xe9 /* replacement jmp with 8 bit immediate */ - .long copy_user_generic_c-1b /* offset */ +2: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt-1b /* offset */ /* or alternatively to alt */ .previous .section .altinstructions,"a" .align 8 + .quad 0b .quad 2b - .quad 3b - .byte X86_FEATURE_REP_GOOD + .byte \feature /* when feature is set */ .byte 5 .byte 5 .previous + .endm + +/* Standard copy_to_user with segment limit checking */ +ENTRY(copy_to_user) + CFI_STARTPROC + GET_THREAD_INFO(%rax) + movq %rdi,%rcx + addq %rdx,%rcx + jc bad_to_user + cmpq threadinfo_addr_limit(%rax),%rcx + jae bad_to_user + xorl %eax,%eax /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +ENTRY(copy_user_generic) + CFI_STARTPROC + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC + +ENTRY(__copy_from_user_inatomic) + CFI_STARTPROC + xorl %ecx,%ecx /* clear zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC /* Standard copy_from_user with segment limit checking */ - .globl copy_from_user - .p2align 4 -copy_from_user: +ENTRY(copy_from_user) + CFI_STARTPROC GET_THREAD_INFO(%rax) movq %rsi,%rcx addq %rdx,%rcx jc bad_from_user cmpq threadinfo_addr_limit(%rax),%rcx jae bad_from_user - /* FALL THROUGH to copy_user_generic */ + movl $1,%ecx /* set zero flag */ + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + CFI_ENDPROC +ENDPROC(copy_from_user) .section .fixup,"ax" /* must zero dest */ bad_from_user: + CFI_STARTPROC movl %edx,%ecx xorl %eax,%eax rep @@ -61,40 +83,32 @@ bad_from_user: bad_to_user: movl %edx,%eax ret + CFI_ENDPROC +END(bad_from_user) .previous /* - * copy_user_generic - memory copy with exception handling. + * copy_user_generic_unrolled - memory copy with exception handling. + * This version is for CPUs like P4 that don't have efficient micro code for rep movsq * * Input: * rdi destination * rsi source * rdx count + * ecx zero flag -- if true zero destination on error * * Output: * eax uncopied bytes or 0 if successful. */ - .globl copy_user_generic - .p2align 4 -copy_user_generic: - .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */ - .byte 0x66,0x90 -1: - .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long copy_user_generic_c-1b /* offset */ - .previous - .section .altinstructions,"a" - .align 8 - .quad copy_user_generic - .quad 2b - .byte X86_FEATURE_REP_GOOD - .byte 5 - .byte 5 - .previous -.Lcug: +ENTRY(copy_user_generic_unrolled) + CFI_STARTPROC pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rcx, 0 xorl %eax,%eax /*zero for the exception handler */ #ifdef FIX_ALIGNMENT @@ -168,9 +182,16 @@ copy_user_generic: decl %ecx jnz .Lloop_1 + CFI_REMEMBER_STATE .Lende: + popq %rcx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rcx popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx ret + CFI_RESTORE_STATE #ifdef FIX_ALIGNMENT /* align destination */ @@ -252,6 +273,8 @@ copy_user_generic: addl %ecx,%edx /* edx: bytes to zero, rdi: dest, eax:zero */ .Lzero_rest: + cmpl $0,(%rsp) + jz .Le_zero movq %rdx,%rcx .Le_byte: xorl %eax,%eax @@ -261,6 +284,9 @@ copy_user_generic: .Le_zero: movq %rdx,%rax jmp .Lende + CFI_ENDPROC +ENDPROC(copy_user_generic) + /* Some CPUs run faster using the string copy instructions. This is also a lot simpler. Use them when possible. @@ -270,6 +296,7 @@ copy_user_generic: /* rdi destination * rsi source * rdx count + * ecx zero flag * * Output: * eax uncopied bytes or 0 if successfull. @@ -280,22 +307,48 @@ copy_user_generic: * And more would be dangerous because both Intel and AMD have * errata with rep movsq > 4GB. If someone feels the need to fix * this please consider this. - */ -copy_user_generic_c: + */ +ENTRY(copy_user_generic_string) + CFI_STARTPROC + movl %ecx,%r8d /* save zero flag */ movl %edx,%ecx shrl $3,%ecx andl $7,%edx + jz 10f 1: rep movsq movl %edx,%ecx 2: rep movsb -4: movl %ecx,%eax +9: movl %ecx,%eax ret -3: lea (%rdx,%rcx,8),%rax + + /* multiple of 8 byte */ +10: rep + movsq + xor %eax,%eax ret + /* exception handling */ +3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */ + jmp 6f +5: movl %ecx,%eax /* exception on byte loop */ + /* eax: left over bytes */ +6: testl %r8d,%r8d /* zero flag set? */ + jz 7f + movl %eax,%ecx /* initialize x86 loop counter */ + push %rax + xorl %eax,%eax +8: rep + stosb /* zero the rest */ +11: pop %rax +7: ret + CFI_ENDPROC +END(copy_user_generic_c) + .section __ex_table,"a" .quad 1b,3b - .quad 2b,4b + .quad 2b,5b + .quad 8b,11b + .quad 10b,3b .previous diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S index 72fd55ee896e..f0dba36578ea 100644 --- a/arch/x86_64/lib/csum-copy.S +++ b/arch/x86_64/lib/csum-copy.S @@ -5,8 +5,9 @@ * License. See the file COPYING in the main directory of this archive * for more details. No warranty for anything given at all. */ - #include <linux/linkage.h> - #include <asm/errno.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/errno.h> /* * Checksum copy with exception handling. @@ -53,19 +54,24 @@ .endm - .globl csum_partial_copy_generic - .p2align 4 -csum_partial_copy_generic: +ENTRY(csum_partial_copy_generic) + CFI_STARTPROC cmpl $3*64,%edx jle .Lignore .Lignore: subq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET 7*8 movq %rbx,2*8(%rsp) + CFI_REL_OFFSET rbx, 2*8 movq %r12,3*8(%rsp) + CFI_REL_OFFSET r12, 3*8 movq %r14,4*8(%rsp) + CFI_REL_OFFSET r14, 4*8 movq %r13,5*8(%rsp) + CFI_REL_OFFSET r13, 5*8 movq %rbp,6*8(%rsp) + CFI_REL_OFFSET rbp, 6*8 movq %r8,(%rsp) movq %r9,1*8(%rsp) @@ -208,14 +214,22 @@ csum_partial_copy_generic: addl %ebx,%eax adcl %r9d,%eax /* carry */ + CFI_REMEMBER_STATE .Lende: movq 2*8(%rsp),%rbx + CFI_RESTORE rbx movq 3*8(%rsp),%r12 + CFI_RESTORE r12 movq 4*8(%rsp),%r14 + CFI_RESTORE r14 movq 5*8(%rsp),%r13 + CFI_RESTORE r13 movq 6*8(%rsp),%rbp + CFI_RESTORE rbp addq $7*8,%rsp + CFI_ADJUST_CFA_OFFSET -7*8 ret + CFI_RESTORE_STATE /* Exception handlers. Very simple, zeroing is done in the wrappers */ .Lbad_source: @@ -231,3 +245,5 @@ csum_partial_copy_generic: jz .Lende movl $-EFAULT,(%rax) jmp .Lende + CFI_ENDPROC +ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c index 5384e227cdf6..c493735218da 100644 --- a/arch/x86_64/lib/csum-partial.c +++ b/arch/x86_64/lib/csum-partial.c @@ -147,4 +147,5 @@ unsigned short ip_compute_csum(unsigned char * buff, int len) { return csum_fold(csum_partial(buff,len,0)); } +EXPORT_SYMBOL(ip_compute_csum); diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c index 94323f20816e..b1320ec58428 100644 --- a/arch/x86_64/lib/csum-wrappers.c +++ b/arch/x86_64/lib/csum-wrappers.c @@ -109,6 +109,7 @@ csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len, { return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); } +EXPORT_SYMBOL(csum_partial_copy_nocheck); unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr, __u32 len, unsigned short proto, unsigned int sum) diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index 03c460cbdd1c..b6cd3cca2f45 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -9,6 +9,7 @@ */ #include <linux/config.h> +#include <linux/module.h> #include <linux/sched.h> #include <linux/delay.h> #include <asm/delay.h> @@ -36,18 +37,22 @@ void __delay(unsigned long loops) } while((now-bclock) < loops); } +EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { __delay((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32); } +EXPORT_SYMBOL(__const_udelay); void __udelay(unsigned long usecs) { __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ } +EXPORT_SYMBOL(__udelay); void __ndelay(unsigned long nsecs) { __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ } +EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S index 3844d5e885a4..5448876261f8 100644 --- a/arch/x86_64/lib/getuser.S +++ b/arch/x86_64/lib/getuser.S @@ -27,25 +27,26 @@ */ #include <linux/linkage.h> +#include <asm/dwarf2.h> #include <asm/page.h> #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> .text - .p2align 4 -.globl __get_user_1 -__get_user_1: +ENTRY(__get_user_1) + CFI_STARTPROC GET_THREAD_INFO(%r8) cmpq threadinfo_addr_limit(%r8),%rcx jae bad_get_user 1: movzb (%rcx),%edx xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__get_user_1) - .p2align 4 -.globl __get_user_2 -__get_user_2: +ENTRY(__get_user_2) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $1,%rcx jc 20f @@ -57,10 +58,11 @@ __get_user_2: ret 20: decq %rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_2) - .p2align 4 -.globl __get_user_4 -__get_user_4: +ENTRY(__get_user_4) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $3,%rcx jc 30f @@ -72,10 +74,11 @@ __get_user_4: ret 30: subq $3,%rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_4) - .p2align 4 -.globl __get_user_8 -__get_user_8: +ENTRY(__get_user_8) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $7,%rcx jc 40f @@ -87,11 +90,16 @@ __get_user_8: ret 40: subq $7,%rcx jmp bad_get_user + CFI_ENDPROC +ENDPROC(__get_user_8) bad_get_user: + CFI_STARTPROC xorl %edx,%edx movq $(-EFAULT),%rax ret + CFI_ENDPROC +END(bad_get_user) .section __ex_table,"a" .quad 1b,bad_get_user diff --git a/arch/x86_64/lib/iomap_copy.S b/arch/x86_64/lib/iomap_copy.S index 8bbade5fea05..05a95e713da8 100644 --- a/arch/x86_64/lib/iomap_copy.S +++ b/arch/x86_64/lib/iomap_copy.S @@ -15,12 +15,16 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + /* * override generic version in lib/iomap_copy.c */ - .globl __iowrite32_copy - .p2align 4 -__iowrite32_copy: +ENTRY(__iowrite32_copy) + CFI_STARTPROC movl %edx,%ecx rep movsd ret + CFI_ENDPROC +ENDPROC(__iowrite32_copy) diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S index 5554948b5554..967b22fa7d07 100644 --- a/arch/x86_64/lib/memcpy.S +++ b/arch/x86_64/lib/memcpy.S @@ -1,6 +1,10 @@ /* Copyright 2002 Andi Kleen */ - #include <asm/cpufeature.h> +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> +#include <asm/cpufeature.h> + /* * memcpy - Copy a memory block. * @@ -13,12 +17,26 @@ * rax original destination */ - .globl __memcpy - .globl memcpy - .p2align 4 -__memcpy: -memcpy: + ALIGN +memcpy_c: + CFI_STARTPROC + movq %rdi,%rax + movl %edx,%ecx + shrl $3,%ecx + andl $7,%edx + rep movsq + movl %edx,%ecx + rep movsb + ret + CFI_ENDPROC +ENDPROC(memcpy_c) + +ENTRY(__memcpy) +ENTRY(memcpy) + CFI_STARTPROC pushq %rbx + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rbx, 0 movq %rdi,%rax movl %edx,%ecx @@ -86,36 +104,27 @@ memcpy: .Lende: popq %rbx + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE rbx ret .Lfinal: + CFI_ENDPROC +ENDPROC(memcpy) +ENDPROC(__memcpy) /* Some CPUs run faster using the string copy instructions. It is also a lot simpler. Use this when possible */ + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad memcpy - .quad memcpy_c - .byte X86_FEATURE_REP_GOOD - .byte .Lfinal-memcpy - .byte memcpy_c_end-memcpy_c - .previous - - .section .altinstr_replacement,"ax" - /* rdi destination - * rsi source - * rdx count - */ -memcpy_c: - movq %rdi,%rax - movl %edx,%ecx - shrl $3,%ecx - andl $7,%edx - rep - movsq - movl %edx,%ecx - rep - movsb - ret -memcpy_c_end: + .quad memcpy + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memcpy + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c index e93d5255fdc9..751ebae8ec42 100644 --- a/arch/x86_64/lib/memmove.c +++ b/arch/x86_64/lib/memmove.c @@ -3,12 +3,13 @@ */ #define _STRING_C #include <linux/string.h> +#include <linux/module.h> #undef memmove void *memmove(void * dest,const void *src,size_t count) { if (dest < src) { - __inline_memcpy(dest,src,count); + return memcpy(dest,src,count); } else { char *p = (char *) dest + count; char *s = (char *) src + count; @@ -17,3 +18,4 @@ void *memmove(void * dest,const void *src,size_t count) } return dest; } +EXPORT_SYMBOL(memmove); diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index ad397f2c7de8..09ed1f6b0eaa 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S @@ -1,4 +1,9 @@ /* Copyright 2002 Andi Kleen, SuSE Labs */ + +#include <linux/config.h> +#include <linux/linkage.h> +#include <asm/dwarf2.h> + /* * ISO C memset - set a memory block to a byte value. * @@ -8,11 +13,29 @@ * * rax original destination */ - .globl __memset - .globl memset - .p2align 4 -memset: -__memset: + ALIGN +memset_c: + CFI_STARTPROC + movq %rdi,%r9 + movl %edx,%r8d + andl $7,%r8d + movl %edx,%ecx + shrl $3,%ecx + /* expand byte value */ + movzbl %sil,%esi + movabs $0x0101010101010101,%rax + mulq %rsi /* with rax, clobbers rdx */ + rep stosq + movl %r8d,%ecx + rep stosb + movq %r9,%rax + ret + CFI_ENDPROC +ENDPROC(memset_c) + +ENTRY(memset) +ENTRY(__memset) + CFI_STARTPROC movq %rdi,%r10 movq %rdx,%r11 @@ -25,6 +48,7 @@ __memset: movl %edi,%r9d andl $7,%r9d jnz .Lbad_alignment + CFI_REMEMBER_STATE .Lafter_bad_alignment: movl %r11d,%ecx @@ -75,6 +99,7 @@ __memset: movq %r10,%rax ret + CFI_RESTORE_STATE .Lbad_alignment: cmpq $7,%r11 jbe .Lhandle_7 @@ -84,42 +109,26 @@ __memset: addq %r8,%rdi subq %r8,%r11 jmp .Lafter_bad_alignment +.Lfinal: + CFI_ENDPROC +ENDPROC(memset) +ENDPROC(__memset) /* Some CPUs run faster using the string instructions. It is also a lot simpler. Use this when possible */ #include <asm/cpufeature.h> + .section .altinstr_replacement,"ax" +1: .byte 0xeb /* jmp <disp8> */ + .byte (memset_c - memset) - (2f - 1b) /* offset */ +2: + .previous .section .altinstructions,"a" .align 8 - .quad memset - .quad memset_c - .byte X86_FEATURE_REP_GOOD - .byte memset_c_end-memset_c - .byte memset_c_end-memset_c - .previous - - .section .altinstr_replacement,"ax" - /* rdi destination - * rsi value - * rdx count - */ -memset_c: - movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx - /* expand byte value */ - movzbl %sil,%esi - movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ - rep - stosq - movl %r8d,%ecx - rep - stosb - movq %r9,%rax - ret -memset_c_end: + .quad memset + .quad 1b + .byte X86_FEATURE_REP_GOOD + .byte .Lfinal - memset + .byte 2b - 1b .previous diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S index 7f5593974e2d..4989f5a8fa9b 100644 --- a/arch/x86_64/lib/putuser.S +++ b/arch/x86_64/lib/putuser.S @@ -25,25 +25,26 @@ */ #include <linux/linkage.h> +#include <asm/dwarf2.h> #include <asm/page.h> #include <asm/errno.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> .text - .p2align 4 -.globl __put_user_1 -__put_user_1: +ENTRY(__put_user_1) + CFI_STARTPROC GET_THREAD_INFO(%r8) cmpq threadinfo_addr_limit(%r8),%rcx jae bad_put_user 1: movb %dl,(%rcx) xorl %eax,%eax ret + CFI_ENDPROC +ENDPROC(__put_user_1) - .p2align 4 -.globl __put_user_2 -__put_user_2: +ENTRY(__put_user_2) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $1,%rcx jc 20f @@ -55,10 +56,11 @@ __put_user_2: ret 20: decq %rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_2) - .p2align 4 -.globl __put_user_4 -__put_user_4: +ENTRY(__put_user_4) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $3,%rcx jc 30f @@ -70,10 +72,11 @@ __put_user_4: ret 30: subq $3,%rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_4) - .p2align 4 -.globl __put_user_8 -__put_user_8: +ENTRY(__put_user_8) + CFI_STARTPROC GET_THREAD_INFO(%r8) addq $7,%rcx jc 40f @@ -85,10 +88,15 @@ __put_user_8: ret 40: subq $7,%rcx jmp bad_put_user + CFI_ENDPROC +ENDPROC(__put_user_8) bad_put_user: + CFI_STARTPROC movq $(-EFAULT),%rax ret + CFI_ENDPROC +END(bad_put_user) .section __ex_table,"a" .quad 1b,bad_put_user diff --git a/arch/x86_64/lib/rwlock.S b/arch/x86_64/lib/rwlock.S new file mode 100644 index 000000000000..0cde1f807314 --- /dev/null +++ b/arch/x86_64/lib/rwlock.S @@ -0,0 +1,38 @@ +/* Slow paths of read/write spinlocks. */ + +#include <linux/linkage.h> +#include <asm/rwlock.h> +#include <asm/alternative-asm.i> +#include <asm/dwarf2.h> + +/* rdi: pointer to rwlock_t */ +ENTRY(__write_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + addl $RW_LOCK_BIAS,(%rdi) +1: rep + nop + cmpl $RW_LOCK_BIAS,(%rdi) + jne 1b + LOCK_PREFIX + subl $RW_LOCK_BIAS,(%rdi) + jnz __write_lock_failed + ret + CFI_ENDPROC +END(__write_lock_failed) + +/* rdi: pointer to rwlock_t */ +ENTRY(__read_lock_failed) + CFI_STARTPROC + LOCK_PREFIX + incl (%rdi) +1: rep + nop + cmpl $1,(%rdi) + js 1b + LOCK_PREFIX + decl (%rdi) + js __read_lock_failed + ret + CFI_ENDPROC +END(__read_lock_failed) diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index e49af0032e94..0025535cac8d 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -1,10 +1,9 @@ - /* - * Save registers before calling assembly functions. This avoids - * disturbance of register allocation in some inline assembly constructs. - * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Subject to the GNU public license, v.2. No warranty of any kind. - * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $ - */ +/* + * Save registers before calling assembly functions. This avoids + * disturbance of register allocation in some inline assembly constructs. + * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Subject to the GNU public license, v.2. No warranty of any kind. + */ #include <linux/config.h> #include <linux/linkage.h> @@ -47,6 +46,11 @@ thunk_retrax __down_failed_interruptible,__down_interruptible thunk_retrax __down_failed_trylock,__down_trylock thunk __up_wakeup,__up + +#ifdef CONFIG_TRACE_IRQFLAGS + thunk trace_hardirqs_on_thunk,trace_hardirqs_on + thunk trace_hardirqs_off_thunk,trace_hardirqs_off +#endif /* SAVE_ARGS below is used only for the .cfi directives it contains. */ CFI_STARTPROC @@ -62,33 +66,3 @@ restore_norax: RESTORE_ARGS 1 ret CFI_ENDPROC - -#ifdef CONFIG_SMP -/* Support for read/write spinlocks. */ - .text -/* rax: pointer to rwlock_t */ -ENTRY(__write_lock_failed) - lock - addl $RW_LOCK_BIAS,(%rax) -1: rep - nop - cmpl $RW_LOCK_BIAS,(%rax) - jne 1b - lock - subl $RW_LOCK_BIAS,(%rax) - jnz __write_lock_failed - ret - -/* rax: pointer to rwlock_t */ -ENTRY(__read_lock_failed) - lock - incl (%rax) -1: rep - nop - cmpl $1,(%rax) - js 1b - lock - decl (%rax) - js __read_lock_failed - ret -#endif diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c index 9bc2c295818e..893d43f838cc 100644 --- a/arch/x86_64/lib/usercopy.c +++ b/arch/x86_64/lib/usercopy.c @@ -5,6 +5,7 @@ * Copyright 1997 Linus Torvalds * Copyright 2002 Andi Kleen <ak@suse.de> */ +#include <linux/module.h> #include <asm/uaccess.h> /* @@ -47,15 +48,17 @@ __strncpy_from_user(char *dst, const char __user *src, long count) __do_strncpy_from_user(dst, src, count, res); return res; } +EXPORT_SYMBOL(__strncpy_from_user); long strncpy_from_user(char *dst, const char __user *src, long count) { long res = -EFAULT; if (access_ok(VERIFY_READ, src, 1)) - __do_strncpy_from_user(dst, src, count, res); + return __strncpy_from_user(dst, src, count); return res; } +EXPORT_SYMBOL(strncpy_from_user); /* * Zero Userspace @@ -94,7 +97,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size) [zero] "r" (0UL), [eight] "r" (8UL)); return size; } - +EXPORT_SYMBOL(__clear_user); unsigned long clear_user(void __user *to, unsigned long n) { @@ -102,6 +105,7 @@ unsigned long clear_user(void __user *to, unsigned long n) return __clear_user(to, n); return n; } +EXPORT_SYMBOL(clear_user); /* * Return the size of a string (including the ending 0) @@ -125,6 +129,7 @@ long __strnlen_user(const char __user *s, long n) s++; } } +EXPORT_SYMBOL(__strnlen_user); long strnlen_user(const char __user *s, long n) { @@ -132,6 +137,7 @@ long strnlen_user(const char __user *s, long n) return 0; return __strnlen_user(s, n); } +EXPORT_SYMBOL(strnlen_user); long strlen_user(const char __user *s) { @@ -147,6 +153,7 @@ long strlen_user(const char __user *s) s++; } } +EXPORT_SYMBOL(strlen_user); unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len) { @@ -155,3 +162,5 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le } return len; } +EXPORT_SYMBOL(copy_in_user); + diff --git a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c index 2d78f9fb4035..79ac6e7100af 100644 --- a/arch/x86_64/mm/extable.c +++ b/arch/x86_64/mm/extable.c @@ -2,7 +2,6 @@ * linux/arch/x86_64/mm/extable.c */ -#include <linux/config.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/init.h> diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 55250593d8c9..3751b4788e28 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -5,7 +5,6 @@ * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. */ -#include <linux/config.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -41,6 +40,35 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) +static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); +} +EXPORT_SYMBOL_GPL(register_page_fault_notifier); + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); + +static inline int notify_page_fault(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig + }; + return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); +} + void bust_spinlocks(int yes) { int loglevel_save = console_loglevel; @@ -68,7 +96,7 @@ void bust_spinlocks(int yes) static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned long error_code) { - unsigned char *instr; + unsigned char __user *instr; int scan_more = 1; int prefetch = 0; unsigned char *max_instr; @@ -77,7 +105,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, if (error_code & PF_INSTR) return 0; - instr = (unsigned char *)convert_rip_to_linear(current, regs); + instr = (unsigned char __user *)convert_rip_to_linear(current, regs); max_instr = instr + 15; if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) @@ -88,7 +116,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned char instr_hi; unsigned char instr_lo; - if (__get_user(opcode, instr)) + if (__get_user(opcode, (char __user *)instr)) break; instr_hi = opcode & 0xf0; @@ -126,7 +154,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ scan_more = 0; - if (__get_user(opcode, instr)) + if (__get_user(opcode, (char __user *)instr)) break; prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); @@ -142,7 +170,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, static int bad_address(void *p) { unsigned long dummy; - return __get_user(dummy, (unsigned long *)p); + return __get_user(dummy, (unsigned long __user *)p); } void dump_pagetable(unsigned long address) @@ -160,7 +188,7 @@ void dump_pagetable(unsigned long address) printk("PGD %lx ", pgd_val(*pgd)); if (!pgd_present(*pgd)) goto ret; - pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address); + pud = pud_offset(pgd, address); if (bad_address(pud)) goto bad; printk("PUD %lx ", pud_val(*pud)); if (!pud_present(*pud)) goto ret; @@ -216,7 +244,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) int unhandled_signal(struct task_struct *tsk, int sig) { - if (tsk->pid == 1) + if (is_init(tsk)) return 1; if (tsk->ptrace & PT_PTRACED) return 0; @@ -265,7 +293,7 @@ static int vmalloc_fault(unsigned long address) if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else - BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); /* Below here mismatches are bugs because these lower tables are shared */ @@ -274,7 +302,7 @@ static int vmalloc_fault(unsigned long address) pud_ref = pud_offset(pgd_ref, address); if (pud_none(*pud_ref)) return -1; - if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref)) + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) BUG(); pmd = pmd_offset(pud, address); pmd_ref = pmd_offset(pud_ref, address); @@ -348,7 +376,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (vmalloc_fault(address) >= 0) return; } - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; /* @@ -358,7 +386,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, goto bad_area_nosemaphore; } - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, + if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, SIGSEGV) == NOTIFY_STOP) return; @@ -383,7 +411,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunatly, in the case of an - * erroneous fault occuring in a code path which already holds mmap_sem + * erroneous fault occurring in a code path which already holds mmap_sem * we will deadlock attempting to validate the fault against the * address space. Luckily the kernel only validly references user * space from well defined areas of code, which are listed in the @@ -410,8 +438,10 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; if (error_code & 4) { - // XXX: align red zone size with ABI - if (address + 128 < regs->rsp) + /* Allow userspace just enough access below the stack pointer + * to let the 'enter' instruction work. + */ + if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) goto bad_area; } if (expand_stack(vma, address)) @@ -434,7 +464,7 @@ good_area: case PF_PROT: /* read, present */ goto bad_area; case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; } @@ -534,7 +564,6 @@ no_context: printk(KERN_ALERT "Unable to handle kernel paging request"); printk(" at %016lx RIP: \n" KERN_ALERT,address); printk_address(regs->rip); - printk("\n"); dump_pagetable(address); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; @@ -551,7 +580,7 @@ no_context: */ out_of_memory: up_read(&mm->mmap_sem); - if (current->pid == 1) { + if (is_init(current)) { yield(); goto again; } @@ -606,7 +635,7 @@ void vmalloc_sync_all(void) if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else - BUG_ON(pgd_page(*pgd) != pgd_page(*pgd_ref)); + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } spin_unlock(&pgd_lock); set_bit(pgd_index(address), insync); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 4ba34e95d835..3e16fe08150e 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -6,7 +6,6 @@ * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de> */ -#include <linux/config.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -23,6 +22,7 @@ #include <linux/bootmem.h> #include <linux/proc_fs.h> #include <linux/pci.h> +#include <linux/poison.h> #include <linux/dma-mapping.h> #include <linux/module.h> #include <linux/memory_hotplug.h> @@ -41,8 +41,6 @@ #include <asm/proto.h> #include <asm/smp.h> #include <asm/sections.h> -#include <asm/dma-mapping.h> -#include <asm/swiotlb.h> #ifndef Dprintk #define Dprintk(x...) @@ -90,8 +88,6 @@ void show_mem(void) printk(KERN_INFO "%lu pages swap cached\n",cached); } -/* References to section boundaries */ - int after_bootmem; static __init void *spp_getpage(void) @@ -233,7 +229,6 @@ __init void *early_ioremap(unsigned long addr, unsigned long size) /* actually usually some more */ if (size >= LARGE_PAGE_SIZE) { - printk("SMBIOS area too long %lu\n", size); return NULL; } set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); @@ -254,18 +249,24 @@ __init void early_iounmap(void *addr, unsigned long size) } static void __meminit -phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) +phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) { - int i; + int i = pmd_index(address); - for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) { + for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { unsigned long entry; + pmd_t *pmd = pmd_page + pmd_index(address); - if (address > end) { - for (; i < PTRS_PER_PMD; i++, pmd++) - set_pmd(pmd, __pmd(0)); + if (address >= end) { + if (!after_bootmem) + for (; i < PTRS_PER_PMD; i++, pmd++) + set_pmd(pmd, __pmd(0)); break; } + + if (pmd_val(*pmd)) + continue; + entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; entry &= __supported_pte_mask; set_pmd(pmd, __pmd(entry)); @@ -275,45 +276,41 @@ phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end) static void __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { - pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address)); - - if (pmd_none(*pmd)) { - spin_lock(&init_mm.page_table_lock); - phys_pmd_init(pmd, address, end); - spin_unlock(&init_mm.page_table_lock); - __flush_tlb_all(); - } + pmd_t *pmd = pmd_offset(pud,0); + spin_lock(&init_mm.page_table_lock); + phys_pmd_init(pmd, address, end); + spin_unlock(&init_mm.page_table_lock); + __flush_tlb_all(); } -static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) +static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) { - long i = pud_index(address); + int i = pud_index(addr); - pud = pud + i; - if (after_bootmem && pud_val(*pud)) { - phys_pmd_update(pud, address, end); - return; - } - - for (; i < PTRS_PER_PUD; pud++, i++) { + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { int map; - unsigned long paddr, pmd_phys; + unsigned long pmd_phys; + pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; - paddr = (address & PGDIR_MASK) + i*PUD_SIZE; - if (paddr >= end) + if (addr >= end) break; - if (!after_bootmem && !e820_any_mapped(paddr, paddr+PUD_SIZE, 0)) { + if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { set_pud(pud, __pud(0)); continue; } + if (pud_val(*pud)) { + phys_pmd_update(pud, addr, end); + continue; + } + pmd = alloc_low_page(&map, &pmd_phys); spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - phys_pmd_init(pmd, paddr, end); + phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); unmap_low_page(map); } @@ -341,7 +338,8 @@ static void __init find_early_table_space(unsigned long end) table_end = table_start; early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT); + end, table_start << PAGE_SHIFT, + (table_start << PAGE_SHIFT) + tables); } /* Setup the direct mapping of the physical memory at PAGE_OFFSET. @@ -372,7 +370,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) pud_t *pud; if (after_bootmem) - pud = pud_offset_k(pgd, start & PGDIR_MASK); + pud = pud_offset(pgd, start & PGDIR_MASK); else pud = alloc_low_page(&map, &pud_phys); @@ -405,69 +403,15 @@ void __cpuinit zap_low_mappings(int cpu) __flush_tlb_all(); } -/* Compute zone sizes for the DMA and DMA32 zones in a node. */ -__init void -size_zones(unsigned long *z, unsigned long *h, - unsigned long start_pfn, unsigned long end_pfn) -{ - int i; - unsigned long w; - - for (i = 0; i < MAX_NR_ZONES; i++) - z[i] = 0; - - if (start_pfn < MAX_DMA_PFN) - z[ZONE_DMA] = MAX_DMA_PFN - start_pfn; - if (start_pfn < MAX_DMA32_PFN) { - unsigned long dma32_pfn = MAX_DMA32_PFN; - if (dma32_pfn > end_pfn) - dma32_pfn = end_pfn; - z[ZONE_DMA32] = dma32_pfn - start_pfn; - } - z[ZONE_NORMAL] = end_pfn - start_pfn; - - /* Remove lower zones from higher ones. */ - w = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - if (z[i]) - z[i] -= w; - w += z[i]; - } - - /* Compute holes */ - w = start_pfn; - for (i = 0; i < MAX_NR_ZONES; i++) { - unsigned long s = w; - w += z[i]; - h[i] = e820_hole_size(s, w); - } - - /* Add the space pace needed for mem_map to the holes too. */ - for (i = 0; i < MAX_NR_ZONES; i++) - h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE; - - /* The 16MB DMA zone has the kernel and other misc mappings. - Account them too */ - if (h[ZONE_DMA]) { - h[ZONE_DMA] += dma_reserve; - if (h[ZONE_DMA] >= z[ZONE_DMA]) { - printk(KERN_WARNING - "Kernel too large and filling up ZONE_DMA?\n"); - h[ZONE_DMA] = z[ZONE_DMA]; - } - } -} - #ifndef CONFIG_NUMA void __init paging_init(void) { - unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES]; - + unsigned long max_zone_pfns[MAX_NR_ZONES] = {MAX_DMA_PFN, + MAX_DMA32_PFN, + end_pfn}; memory_present(0, 0, end_pfn); sparse_init(); - size_zones(zones, holes, 0, end_pfn); - free_area_init_node(0, NODE_DATA(0), zones, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes); + free_area_init_nodes(max_zone_pfns); } #endif @@ -508,8 +452,6 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) /* * Memory hotplug specific functions */ -#if defined(CONFIG_ACPI_HOTPLUG_MEMORY) || defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE) - void online_page(struct page *page) { ClearPageReserved(page); @@ -519,31 +461,17 @@ void online_page(struct page *page) num_physpages++; } -#ifndef CONFIG_MEMORY_HOTPLUG +#ifdef CONFIG_MEMORY_HOTPLUG /* - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, - * just online the pages. + * XXX: memory_add_physaddr_to_nid() is to find node id from physical address + * via probe interface of sysfs. If acpi notifies hot-add event, then it + * can tell node id by searching dsdt. But, probe interface doesn't have + * node id. So, return 0 as node id at this time. */ -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) +#ifdef CONFIG_NUMA +int memory_add_physaddr_to_nid(u64 start) { - int err = -EIO; - unsigned long pfn; - unsigned long total = 0, mem = 0; - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - if (pfn_valid(pfn)) { - online_page(pfn_to_page(pfn)); - err = 0; - mem++; - } - total++; - } - if (!err) { - z->spanned_pages += total; - z->present_pages += mem; - z->zone_pgdat->node_spanned_pages += total; - z->zone_pgdat->node_present_pages += mem; - } - return err; + return 0; } #endif @@ -551,10 +479,10 @@ int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ -int add_memory(u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size) { - struct pglist_data *pgdat = NODE_DATA(0); - struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2; + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = pgdat->node_zones + ZONE_NORMAL; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; @@ -570,7 +498,7 @@ error: printk("%s: Problem encountered in __add_pages!\n", __func__); return ret; } -EXPORT_SYMBOL_GPL(add_memory); +EXPORT_SYMBOL_GPL(arch_add_memory); int remove_memory(u64 start, u64 size) { @@ -578,7 +506,33 @@ int remove_memory(u64 start, u64 size) } EXPORT_SYMBOL_GPL(remove_memory); -#endif +#else /* CONFIG_MEMORY_HOTPLUG */ +/* + * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, + * just online the pages. + */ +int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) +{ + int err = -EIO; + unsigned long pfn; + unsigned long total = 0, mem = 0; + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { + if (pfn_valid(pfn)) { + online_page(pfn_to_page(pfn)); + err = 0; + mem++; + } + total++; + } + if (!err) { + z->spanned_pages += total; + z->present_pages += mem; + z->zone_pgdat->node_spanned_pages += total; + z->zone_pgdat->node_present_pages += mem; + } + return err; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; @@ -587,16 +541,7 @@ void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; -#ifdef CONFIG_SWIOTLB - pci_swiotlb_init(); -#endif - no_iommu_init(); - - /* How many end-of-memory variables you have, grandma! */ - max_low_pfn = end_pfn; - max_pfn = end_pfn; - num_physpages = end_pfn; - high_memory = (void *) __va(end_pfn * PAGE_SIZE); + pci_iommu_alloc(); /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); @@ -609,7 +554,8 @@ void __init mem_init(void) #else totalram_pages = free_all_bootmem(); #endif - reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn); + reservedpages = end_pfn - totalram_pages - + absent_pages_in_range(0, end_pfn); after_bootmem = 1; @@ -644,34 +590,44 @@ void __init mem_init(void) #endif } -void free_initmem(void) +void free_init_pages(char *what, unsigned long begin, unsigned long end) { unsigned long addr; - addr = (unsigned long)(&__init_begin); - for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { + if (begin >= end) + return; + + printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); + memset((void *)(addr & ~(PAGE_SIZE-1)), + POISON_FREE_INITMEM, PAGE_SIZE); free_page(addr); totalram_pages++; } - memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); - printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10); +} + +void free_initmem(void) +{ + memset(__initdata_begin, POISON_FREE_INITDATA, + __initdata_end - __initdata_begin); + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); } #ifdef CONFIG_DEBUG_RODATA -extern char __start_rodata, __end_rodata; void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)&__start_rodata; + unsigned long addr = (unsigned long)__start_rodata; - for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE) + for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); printk ("Write protecting the kernel read-only data: %luk\n", - (&__end_rodata - &__start_rodata) >> 10); + (__end_rodata - __start_rodata) >> 10); /* * change_page_attr_addr() requires a global_flush_tlb() call after it. @@ -686,15 +642,7 @@ void mark_rodata_ro(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - if (start >= end) - return; - printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - init_page_count(virt_to_page(start)); - free_page(start); - totalram_pages++; - } + free_init_pages("initrd memory", start, end); } #endif @@ -707,8 +655,10 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) #else reserve_bootmem(phys, len); #endif - if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; + set_dma_reserve(dma_reserve); + } } int kern_addr_valid(unsigned long addr) diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index ae207064201e..45d7d823c3b8 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -11,6 +11,7 @@ #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/module.h> #include <asm/io.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> @@ -219,6 +220,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l } return (__force void __iomem *) (offset + (char *)addr); } +EXPORT_SYMBOL(__ioremap); /** * ioremap_nocache - map bus memory into CPU space @@ -246,6 +248,7 @@ void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) { return __ioremap(phys_addr, size, _PAGE_PCD); } +EXPORT_SYMBOL(ioremap_nocache); /** * iounmap - Free a IO remapping @@ -291,3 +294,5 @@ void iounmap(volatile void __iomem *addr) BUG_ON(p != o || o == NULL); kfree(p); } +EXPORT_SYMBOL(iounmap); + diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index 7c45c2d2b8b2..b5b8dba28b4e 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -54,6 +54,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodes_clear(nodes_parsed); + if (!early_pci_allowed()) + return -1; + nb = find_northbridge(); if (nb < 0) return nb; @@ -146,6 +149,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) nodes[nodeid].start = base; nodes[nodeid].end = limit; + e820_register_active_regions(nodeid, + nodes[nodeid].start >> PAGE_SHIFT, + nodes[nodeid].end >> PAGE_SHIFT); prevbase = base; diff --git a/arch/x86_64/mm/mmap.c b/arch/x86_64/mm/mmap.c index 43e9b99bdf25..80bba0dc000e 100644 --- a/arch/x86_64/mm/mmap.c +++ b/arch/x86_64/mm/mmap.c @@ -1,7 +1,6 @@ /* Copyright 2005 Andi Kleen, SuSE Labs. * Licensed under GPL, v.2 */ -#include <linux/config.h> #include <linux/mm.h> #include <linux/sched.h> #include <linux/random.h> diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index b2fac14baac0..829a008bd39b 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -161,7 +161,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en bootmap_start >> PAGE_SHIFT, start_pfn, end_pfn); - e820_bootmem_free(NODE_DATA(nodeid), start, end); + free_bootmem_with_active_regions(nodeid, end); reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); @@ -175,13 +175,11 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en void __init setup_node_zones(int nodeid) { unsigned long start_pfn, end_pfn, memmapsize, limit; - unsigned long zones[MAX_NR_ZONES]; - unsigned long holes[MAX_NR_ZONES]; start_pfn = node_start_pfn(nodeid); end_pfn = node_end_pfn(nodeid); - Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n", + Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n", nodeid, start_pfn, end_pfn); /* Try to allocate mem_map at end to not fill up precious <4GB @@ -195,10 +193,6 @@ void __init setup_node_zones(int nodeid) round_down(limit - memmapsize, PAGE_SIZE), limit); #endif - - size_zones(zones, holes, start_pfn, end_pfn); - free_area_init_node(nodeid, NODE_DATA(nodeid), zones, - start_pfn, holes); } void __init numa_init_array(void) @@ -225,7 +219,7 @@ void __init numa_init_array(void) int numa_fake __initdata = 0; /* Numa emulation */ -static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) +static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { int i; struct bootnode nodes[MAX_NUMNODES]; @@ -259,8 +253,11 @@ static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn) printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); return -1; } - for_each_online_node(i) + for_each_online_node(i) { + e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } numa_init_array(); return 0; } @@ -299,6 +296,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); node_to_cpumask[0] = cpumask_of_cpu(0); + e820_register_active_regions(0, start_pfn, end_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } @@ -340,17 +338,23 @@ static void __init arch_sparse_init(void) void __init paging_init(void) { int i; + unsigned long max_zone_pfns[MAX_NR_ZONES] = { MAX_DMA_PFN, + MAX_DMA32_PFN, + end_pfn}; arch_sparse_init(); for_each_online_node(i) { setup_node_zones(i); } + + free_area_init_nodes(max_zone_pfns); } -/* [numa=off] */ -__init int numa_setup(char *opt) +static __init int numa_setup(char *opt) { + if (!opt) + return -EINVAL; if (!strncmp(opt,"off",3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU @@ -366,9 +370,11 @@ __init int numa_setup(char *opt) if (!strncmp(opt,"hotadd=", 7)) hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif - return 1; + return 0; } +early_param("numa", numa_setup); + /* * Setup early cpu_to_node. * diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 531ad21447b1..3e231d762aaa 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -3,7 +3,6 @@ * Thanks to Ben LaHaise for precious feedback. */ -#include <linux/config.h> #include <linux/mm.h> #include <linux/sched.h> #include <linux/highmem.h> @@ -109,8 +108,8 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pgprot_val(ref_prot) |= _PAGE_PSE; large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); + large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); } @@ -120,32 +119,28 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, { pte_t *kpte; struct page *kpte_page; - unsigned kpte_flags; pgprot_t ref_prot2; kpte = lookup_address(address); if (!kpte) return 0; kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - kpte_flags = pte_val(*kpte); if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if ((kpte_flags & _PAGE_PSE) == 0) { + if (!pte_huge(*kpte)) { set_pte(kpte, pfn_pte(pfn, prot)); } else { /* * split_large_page will take the reference for this * change_page_attr on the split page. */ - struct page *split; - ref_prot2 = __pgprot(pgprot_val(pte_pgprot(*lookup_address(address))) & ~(1<<_PAGE_BIT_PSE)); - + ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); split = split_large_page(address, prot, ref_prot2); if (!split) return -ENOMEM; - set_pte(kpte,mk_pte(split, ref_prot2)); + set_pte(kpte, mk_pte(split, ref_prot2)); kpte_page = split; - } + } page_private(kpte_page)++; - } else if ((kpte_flags & _PAGE_PSE) == 0) { + } else if (!pte_huge(*kpte)) { set_pte(kpte, pfn_pte(pfn, ref_prot)); BUG_ON(page_private(kpte_page) == 0); page_private(kpte_page)--; @@ -191,10 +186,12 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) * lowmem */ if (__pa(address) < KERNEL_TEXT_SIZE) { unsigned long addr2; - pgprot_t prot2 = prot; + pgprot_t prot2; addr2 = __START_KERNEL_map + __pa(address); - pgprot_val(prot2) &= ~_PAGE_NX; - err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); + /* Make sure the kernel mappings stay executable */ + prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); + err = __change_page_attr(addr2, pfn, prot2, + PAGE_KERNEL_EXEC); } } up_write(&init_mm.mmap_sem); diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c index 502fce65e96a..f8c04d6935c9 100644 --- a/arch/x86_64/mm/srat.c +++ b/arch/x86_64/mm/srat.c @@ -21,6 +21,8 @@ #include <asm/numa.h> #include <asm/e820.h> +int acpi_numa __initdata; + #if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \ defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \ && !defined(CONFIG_MEMORY_HOTPLUG) @@ -91,6 +93,7 @@ static __init void bad_srat(void) apicid_to_node[i] = NUMA_NO_NODE; for (i = 0; i < MAX_NUMNODES; i++) nodes_add[i].start = nodes[i].end = 0; + remove_all_active_ranges(); } static __init inline int srat_disabled(void) @@ -173,7 +176,7 @@ static int hotadd_enough_memory(struct bootnode *nd) if (mem < 0) return 0; - allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE; + allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE; allowed = (allowed / 100) * hotadd_percent; if (allocated + mem > allowed) { unsigned long range; @@ -223,8 +226,10 @@ static int reserve_hotadd(int node, unsigned long start, unsigned long end) } /* This check might be a bit too strict, but I'm keeping it for now. */ - if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) { - printk(KERN_ERR "SRAT: Hotplug area has existing memory\n"); + if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { + printk(KERN_ERR + "SRAT: Hotplug area %lu -> %lu has existing memory\n", + s_pfn, e_pfn); return -1; } @@ -317,6 +322,10 @@ acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, nd->start, nd->end); + e820_register_active_regions(node, nd->start >> PAGE_SHIFT, + nd->end >> PAGE_SHIFT); + push_node_boundaries(node, nd->start >> PAGE_SHIFT, + nd->end >> PAGE_SHIFT); #ifdef RESERVE_HOTADD if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) { @@ -341,13 +350,13 @@ static int nodes_cover_memory(void) unsigned long s = nodes[i].start >> PAGE_SHIFT; unsigned long e = nodes[i].end >> PAGE_SHIFT; pxmram += e - s; - pxmram -= e820_hole_size(s, e); + pxmram -= absent_pages_in_range(s, e); pxmram -= nodes_add[i].end - nodes_add[i].start; if ((long)pxmram < 0) pxmram = 0; } - e820ram = end_pfn - e820_hole_size(0, end_pfn); + e820ram = end_pfn - absent_pages_in_range(0, end_pfn); /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ if ((long)(e820ram - pxmram) >= 1*1024*1024) { printk(KERN_ERR diff --git a/arch/x86_64/pci/Makefile b/arch/x86_64/pci/Makefile index a3f6ad570179..1eb18f421edf 100644 --- a/arch/x86_64/pci/Makefile +++ b/arch/x86_64/pci/Makefile @@ -9,7 +9,7 @@ obj-y := i386.o obj-$(CONFIG_PCI_DIRECT)+= direct.o obj-y += fixup.o init.o obj-$(CONFIG_ACPI) += acpi.o -obj-y += legacy.o irq.o common.o +obj-y += legacy.o irq.o common.o early.o # mmconfig has a 64bit special obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o @@ -23,3 +23,4 @@ common-y += ../../i386/pci/common.o fixup-y += ../../i386/pci/fixup.o i386-y += ../../i386/pci/i386.o init-y += ../../i386/pci/init.o +early-y += ../../i386/pci/early.o diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index 3c55c76c6fd5..7732f4254d21 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -156,15 +156,45 @@ static __init void unreachable_devices(void) addr = pci_dev_base(0, k, PCI_DEVFN(i, 0)); if (addr == NULL|| readl(addr) != val1) { set_bit(i + 32*k, fallback_slots); - printk(KERN_NOTICE - "PCI: No mmconfig possible on device %x:%x\n", - k, i); + printk(KERN_NOTICE "PCI: No mmconfig possible" + " on device %02x:%02x\n", k, i); } } } } -void __init pci_mmcfg_init(void) +static __init void pci_mmcfg_insert_resources(void) +{ +#define PCI_MMCFG_RESOURCE_NAME_LEN 19 + int i; + struct resource *res; + char *names; + unsigned num_buses; + + res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), + pci_mmcfg_config_num, GFP_KERNEL); + + if (!res) { + printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); + return; + } + + names = (void *)&res[pci_mmcfg_config_num]; + for (i = 0; i < pci_mmcfg_config_num; i++, res++) { + num_buses = pci_mmcfg_config[i].end_bus_number - + pci_mmcfg_config[i].start_bus_number + 1; + res->name = names; + snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", + pci_mmcfg_config[i].pci_segment_group_number); + res->start = pci_mmcfg_config[i].base_address; + res->end = res->start + (num_buses << 20) - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + insert_resource(&iomem_resource, res); + names += PCI_MMCFG_RESOURCE_NAME_LEN; + } +} + +void __init pci_mmcfg_init(int type) { int i; @@ -177,7 +207,9 @@ void __init pci_mmcfg_init(void) (pci_mmcfg_config[0].base_address == 0)) return; - if (!e820_all_mapped(pci_mmcfg_config[0].base_address, + /* Only do this check when type 1 works. If it doesn't work + assume we run on a Mac and always use MCFG */ + if (type == 1 && !e820_all_mapped(pci_mmcfg_config[0].base_address, pci_mmcfg_config[0].base_address + MMCONFIG_APER_MIN, E820_RESERVED)) { printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %x is not E820-reserved\n", @@ -186,7 +218,6 @@ void __init pci_mmcfg_init(void) return; } - /* RED-PEN i386 doesn't do _nocache right now */ pci_mmcfg_virt = kmalloc(sizeof(*pci_mmcfg_virt) * pci_mmcfg_config_num, GFP_KERNEL); if (pci_mmcfg_virt == NULL) { printk("PCI: Can not allocate memory for mmconfig structures\n"); @@ -205,6 +236,7 @@ void __init pci_mmcfg_init(void) } unreachable_devices(); + pci_mmcfg_insert_resources(); raw_pci_ops = &pci_mmcfg; pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; |

