Diffstat (limited to 'arch')
140 files changed, 4828 insertions, 3628 deletions
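The first hunk below (arch/alpha/oprofile/common.c) widens an on-stack name buffer from char buf[3] to char buf[4]: snprintf() reserves one byte for the terminating NUL, so three bytes hold at most a two-digit counter index and a three-digit index would be silently truncated. A minimal standalone sketch of that failure mode (hypothetical program, not part of the commit):

#include <stdio.h>

int main(void)
{
	char small[3], big[4];

	/* snprintf() writes at most size-1 characters plus the NUL,
	 * so a three-digit counter index no longer fits in buf[3] */
	snprintf(small, sizeof small, "%d", 100);
	snprintf(big, sizeof big, "%d", 100);
	printf("small=\"%s\" big=\"%s\"\n", small, big);	/* small="10" big="100" */
	return 0;
}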
diff --git a/arch/alpha/oprofile/common.c b/arch/alpha/oprofile/common.c index ba788cfdc3c6..9fc0eeb4f0ab 100644 --- a/arch/alpha/oprofile/common.c +++ b/arch/alpha/oprofile/common.c @@ -112,7 +112,7 @@ op_axp_create_files(struct super_block * sb, struct dentry * root)  	for (i = 0; i < model->num_counters; ++i) {  		struct dentry *dir; -		char buf[3]; +		char buf[4];  		snprintf(buf, sizeof buf, "%d", i);  		dir = oprofilefs_mkdir(sb, root, buf); diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 282b14e2f464..a3bbaaf480b9 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -177,7 +177,7 @@ boot := arch/arm/boot  #	them changed.  We use .arch to indicate when they were updated  #	last, otherwise make uses the target directory mtime. -include/asm-arm/.arch: $(wildcard include/config/arch/*.h) include/config/MARKER +include/asm-arm/.arch: $(wildcard include/config/arch/*.h) include/config/auto.conf  	@echo '  SYMLINK include/asm-arm/arch -> include/asm-arm/$(INCDIR)'  ifneq ($(KBUILD_SRC),)  	$(Q)mkdir -p include/asm-arm diff --git a/arch/arm/common/locomo.c b/arch/arm/common/locomo.c index a7dc1370695b..0dafba3a701d 100644 --- a/arch/arm/common/locomo.c +++ b/arch/arm/common/locomo.c @@ -629,21 +629,6 @@ static int locomo_resume(struct platform_device *dev)  #endif -#define LCM_ALC_EN	0x8000 - -void frontlight_set(struct locomo *lchip, int duty, int vr, int bpwf) -{ -	unsigned long flags; - -	spin_lock_irqsave(&lchip->lock, flags); -	locomo_writel(bpwf, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALS); -	udelay(100); -	locomo_writel(duty, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALD); -	locomo_writel(bpwf | LCM_ALC_EN, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALS); -	spin_unlock_irqrestore(&lchip->lock, flags); -} - -  /**   *	locomo_probe - probe for a single LoCoMo chip.   *	@phys_addr: physical address of device. @@ -698,14 +683,10 @@ __locomo_probe(struct device *me, struct resource *mem, int irq)  			, lchip->base + LOCOMO_GPD);  	locomo_writel(0, lchip->base + LOCOMO_GIE); -	/* FrontLight */ +	/* Frontlight */  	locomo_writel(0, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALS);  	locomo_writel(0, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALD); -	/* Same constants can be used for collie and poodle -	   (depending on CONFIG options in original sharp code)? */ -	frontlight_set(lchip, 163, 0, 148); -  	/* Longtime timer */  	locomo_writel(0, lchip->base + LOCOMO_LTINT);  	/* SPI */ @@ -1063,6 +1044,30 @@ void locomo_m62332_senddata(struct locomo_dev *ldev, unsigned int dac_data, int  }  /* + *	Frontlight control + */ + +static struct locomo *locomo_chip_driver(struct locomo_dev *ldev); + +void locomo_frontlight_set(struct locomo_dev *dev, int duty, int vr, int bpwf) +{ +	unsigned long flags; +	struct locomo *lchip = locomo_chip_driver(dev); + +	if (vr) +		locomo_gpio_write(dev, LOCOMO_GPIO_FL_VR, 1); +	else +		locomo_gpio_write(dev, LOCOMO_GPIO_FL_VR, 0); + +	spin_lock_irqsave(&lchip->lock, flags); +	locomo_writel(bpwf, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALS); +	udelay(100); +	locomo_writel(duty, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALD); +	locomo_writel(bpwf | LOCOMO_ALC_EN, lchip->base + LOCOMO_FRONTLIGHT + LOCOMO_ALS); +	spin_unlock_irqrestore(&lchip->lock, flags); +} + +/*   *	LoCoMo "Register Access Bus."   
*   *	We model this as a regular bus type, and hang devices directly diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index e14b20783995..47c08bcd9b24 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -14,6 +14,10 @@ config X86_32  	  486, 586, Pentiums, and various instruction-set-compatible chips by  	  AMD, Cyrix, and others. +config GENERIC_TIME +	bool +	default y +  config SEMAPHORE_SLEEPERS  	bool  	default y @@ -324,6 +328,15 @@ config X86_MCE_P4THERMAL  	  Enabling this feature will cause a message to be printed when the P4  	  enters thermal throttling. +config VM86 +	default y +	bool "Enable VM86 support" if EMBEDDED +	help +          This option is required by programs like DOSEMU to run 16-bit legacy +	  code on X86 processors. It also may be needed by software like +          XFree86 to initialize some video cards via BIOS. Disabling this +          option saves about 6k. +  config TOSHIBA  	tristate "Toshiba Laptop support"  	---help--- @@ -1046,13 +1059,27 @@ config SCx200  	tristate "NatSemi SCx200 support"  	depends on !X86_VOYAGER  	help -	  This provides basic support for the National Semiconductor SCx200 -	  processor.  Right now this is just a driver for the GPIO pins. +	  This provides basic support for National Semiconductor's +	  (now AMD's) Geode processors.  The driver probes for the +	  PCI-IDs of several on-chip devices, so its a good dependency +	  for other scx200_* drivers. -	  If you don't know what to do here, say N. +	  If compiled as a module, the driver is named scx200. -	  This support is also available as a module.  If compiled as a -	  module, it will be called scx200. +config SCx200HR_TIMER +	tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" +	depends on SCx200 && GENERIC_TIME +	default y +	help +	  This driver provides a clocksource built upon the on-chip +	  27MHz high-resolution timer.  Its also a workaround for +	  NSC Geode SC-1100's buggy TSC, which loses time when the +	  processor goes idle (as is done by the scheduler).  The +	  other workaround is idle=poll boot option. + +config K8_NB +	def_bool y +	depends on AGP_AMD64  source "drivers/pcmcia/Kconfig" diff --git a/arch/i386/boot/Makefile b/arch/i386/boot/Makefile index 33e55476381b..e97946626064 100644 --- a/arch/i386/boot/Makefile +++ b/arch/i386/boot/Makefile @@ -109,8 +109,13 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf  isoimage: $(BOOTIMAGE)  	-rm -rf $(obj)/isoimage  	mkdir $(obj)/isoimage -	cp `echo /usr/lib*/syslinux/isolinux.bin | awk '{ print $1; }'` \ -		$(obj)/isoimage +	for i in lib lib64 share end ; do \ +		if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ +			cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ +			break ; \ +		fi ; \ +		if [ $$i = end ] ; then exit 1 ; fi ; \ +	done  	cp $(BOOTIMAGE) $(obj)/isoimage/linux  	echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg  	if [ -f '$(FDINITRD)' ] ; then \ diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index f19f3a7492a5..b2ccd543410d 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -24,14 +24,6 @@  #undef memset  #undef memcpy - -/* - * Why do we do this? Don't ask me.. - * - * Incomprehensible are the ways of bootloaders. 
- */ -static void* memset(void *, int, size_t); -static void* memcpy(void *, __const void *, size_t);  #define memzero(s, n)     memset ((s), 0, (n))  typedef unsigned char  uch; @@ -93,7 +85,7 @@ static unsigned char *real_mode; /* Pointer to real-mode data */  #endif  #define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0)) -extern char input_data[]; +extern unsigned char input_data[];  extern int input_len;  static long bytes_out = 0; @@ -103,6 +95,9 @@ static unsigned long output_ptr = 0;  static void *malloc(int size);  static void free(void *where); +static void *memset(void *s, int c, unsigned n); +static void *memcpy(void *dest, const void *src, unsigned n); +  static void putstr(const char *);  extern int end; @@ -205,7 +200,7 @@ static void putstr(const char *s)  	outb_p(0xff & (pos >> 1), vidport+1);  } -static void* memset(void* s, int c, size_t n) +static void* memset(void* s, int c, unsigned n)  {  	int i;  	char *ss = (char*)s; @@ -214,14 +209,13 @@ static void* memset(void* s, int c, size_t n)  	return s;  } -static void* memcpy(void* __dest, __const void* __src, -			    size_t __n) +static void* memcpy(void* dest, const void* src, unsigned n)  {  	int i; -	char *d = (char *)__dest, *s = (char *)__src; +	char *d = (char *)dest, *s = (char *)src; -	for (i=0;i<__n;i++) d[i] = s[i]; -	return __dest; +	for (i=0;i<n;i++) d[i] = s[i]; +	return dest;  }  /* =========================================================================== @@ -309,7 +303,7 @@ static void setup_normal_output_buffer(void)  #else  	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");  #endif -	output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */ +	output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */  	free_mem_end_ptr = (long)real_mode;  } @@ -324,11 +318,9 @@ static void setup_output_buffer_if_we_run_high(struct moveparams *mv)  #ifdef STANDARD_MEMORY_BIOS_CALL  	if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");  #else -	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < -			(3*1024)) -		error("Less than 4MB of memory"); +	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");  #endif	 -	mv->low_buffer_start = output_data = (char *)LOW_BUFFER_START; +	mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;  	low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX  	  ? 
LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff;  	low_buffer_size = low_buffer_end - LOW_BUFFER_START; diff --git a/arch/i386/boot/video.S b/arch/i386/boot/video.S index c9343c3a8082..8c2a6faeeae5 100644 --- a/arch/i386/boot/video.S +++ b/arch/i386/boot/video.S @@ -1929,7 +1929,7 @@ skip10:	movb	%ah, %al  	ret  store_edid: -#ifdef CONFIG_FB_FIRMWARE_EDID +#ifdef CONFIG_FIRMWARE_EDID  	pushw	%es				# just save all registers  	pushw	%ax  	pushw	%bx @@ -1947,6 +1947,22 @@ store_edid:  	rep  	stosl +	pushw   %es				# save ES +	xorw    %di, %di                        # Report Capability +	pushw   %di +	popw    %es                             # ES:DI must be 0:0 +	movw	$0x4f15, %ax +	xorw	%bx, %bx +	xorw	%cx, %cx +	int	$0x10 +	popw    %es                             # restore ES + +	cmpb    $0x00, %ah                      # call successful +	jne     no_edid + +	cmpb    $0x4f, %al                      # function supported +	jne     no_edid +  	movw	$0x4f15, %ax                    # do VBE/DDC  	movw	$0x01, %bx  	movw	$0x00, %cx @@ -1954,6 +1970,7 @@ store_edid:  	movw	$0x140, %di  	int	$0x10 +no_edid:  	popw	%di				# restore all registers  	popw	%dx  	popw	%cx diff --git a/arch/i386/crypto/aes-i586-asm.S b/arch/i386/crypto/aes-i586-asm.S index 911b15377f2e..f942f0c8f630 100644 --- a/arch/i386/crypto/aes-i586-asm.S +++ b/arch/i386/crypto/aes-i586-asm.S @@ -36,22 +36,19 @@  .file "aes-i586-asm.S"  .text -// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])// -// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])// -	 -#define tlen 1024   // length of each of 4 'xor' arrays (256 32-bit words) +#include <asm/asm-offsets.h> -// offsets to parameters with one register pushed onto stack - -#define in_blk    8  // input byte array address parameter -#define out_blk  12  // output byte array address parameter -#define ctx      16  // AES context structure +#define tlen 1024   // length of each of 4 'xor' arrays (256 32-bit words) -// offsets in context structure +/* offsets to parameters with one register pushed onto stack */ +#define tfm 8 +#define out_blk 12 +#define in_blk 16 -#define ekey     0   // encryption key schedule base address -#define nrnd   256   // number of rounds -#define dkey   260   // decryption key schedule base address +/* offsets in crypto_tfm structure */ +#define ekey (crypto_tfm_ctx_offset + 0) +#define nrnd (crypto_tfm_ctx_offset + 256) +#define dkey (crypto_tfm_ctx_offset + 260)  // register mapping for encrypt and decrypt subroutines @@ -220,6 +217,7 @@  	do_col (table, r5,r0,r1,r4, r2,r3);		/* idx=r5 */  // AES (Rijndael) Encryption Subroutine +/* void aes_enc_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */  .global  aes_enc_blk @@ -230,7 +228,7 @@  aes_enc_blk:  	push    %ebp -	mov     ctx(%esp),%ebp      // pointer to context +	mov     tfm(%esp),%ebp  // CAUTION: the order and the values used in these assigns   // rely on the register mappings @@ -295,6 +293,7 @@ aes_enc_blk:  	ret  // AES (Rijndael) Decryption Subroutine +/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out_blk, const u8 *in_blk) */  .global  aes_dec_blk @@ -305,7 +304,7 @@ aes_enc_blk:  aes_dec_blk:  	push    %ebp -	mov     ctx(%esp),%ebp       // pointer to context +	mov     tfm(%esp),%ebp  // CAUTION: the order and the values used in these assigns   // rely on the register mappings diff --git a/arch/i386/crypto/aes.c b/arch/i386/crypto/aes.c index a50397b1d5c7..d3806daa3de3 100644 --- 
a/arch/i386/crypto/aes.c +++ b/arch/i386/crypto/aes.c @@ -45,8 +45,8 @@  #include <linux/crypto.h>  #include <linux/linkage.h> -asmlinkage void aes_enc_blk(const u8 *src, u8 *dst, void *ctx); -asmlinkage void aes_dec_blk(const u8 *src, u8 *dst, void *ctx); +asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); +asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);  #define AES_MIN_KEY_SIZE	16  #define AES_MAX_KEY_SIZE	32 @@ -378,12 +378,12 @@ static void gen_tabs(void)  	k[8*(i)+11] = ss[3];						\  } -static int -aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u32 *flags) +static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +		       unsigned int key_len, u32 *flags)  {  	int i;  	u32 ss[8]; -	struct aes_ctx *ctx = ctx_arg; +	struct aes_ctx *ctx = crypto_tfm_ctx(tfm);  	const __le32 *key = (const __le32 *)in_key;  	/* encryption schedule */ @@ -464,16 +464,16 @@ aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, u32 *flags)  	return 0;  } -static inline void aes_encrypt(void *ctx, u8 *dst, const u8 *src) +static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)  { -	aes_enc_blk(src, dst, ctx); +	aes_enc_blk(tfm, dst, src);  } -static inline void aes_decrypt(void *ctx, u8 *dst, const u8 *src) + +static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)  { -	aes_dec_blk(src, dst, ctx); +	aes_dec_blk(tfm, dst, src);  } -  static struct crypto_alg aes_alg = {  	.cra_name		=	"aes",  	.cra_driver_name	=	"aes-i586", diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 96fb8a020af2..5e70c2fb273a 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -7,10 +7,9 @@ extra-y := head.o init_task.o vmlinux.lds  obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o \  		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \  		pci-dma.o i386_ksyms.o i387.o bootflag.o \ -		quirks.o i8237.o topology.o alternative.o +		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o  obj-y				+= cpu/ -obj-y				+= timers/  obj-y				+= acpi/  obj-$(CONFIG_X86_BIOS_REBOOT)	+= reboot.o  obj-$(CONFIG_MCA)		+= mca.o @@ -37,6 +36,8 @@ obj-$(CONFIG_EFI) 		+= efi.o efi_stub.o  obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault.o  obj-$(CONFIG_VM86)		+= vm86.o  obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o +obj-$(CONFIG_HPET_TIMER) 	+= hpet.o +obj-$(CONFIG_K8_NB)		+= k8.o  EXTRA_AFLAGS   := -traditional @@ -76,3 +77,6 @@ SYSCFLAGS_vsyscall-syms.o = -r  $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \  			$(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE  	$(call if_changed,syscall) + +k8-y                      += ../../x86_64/kernel/k8.o + diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 5cbd6f99fb2a..50eb0e03777e 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -4,27 +4,41 @@  #include <asm/alternative.h>  #include <asm/sections.h> -#define DEBUG 0 -#if DEBUG -# define DPRINTK(fmt, args...) printk(fmt, args) -#else -# define DPRINTK(fmt, args...) 
-#endif +static int no_replacement    = 0; +static int smp_alt_once      = 0; +static int debug_alternative = 0; + +static int __init noreplacement_setup(char *s) +{ +	no_replacement = 1; +	return 1; +} +static int __init bootonly(char *str) +{ +	smp_alt_once = 1; +	return 1; +} +static int __init debug_alt(char *str) +{ +	debug_alternative = 1; +	return 1; +} +__setup("noreplacement", noreplacement_setup); +__setup("smp-alt-boot", bootonly); +__setup("debug-alternative", debug_alt); + +#define DPRINTK(fmt, args...) if (debug_alternative) \ +	printk(KERN_DEBUG fmt, args) + +#ifdef GENERIC_NOP1  /* Use inline assembly to define this because the nops are defined     as inline assembly strings in the include files and we cannot     get them easily into strings. */  asm("\t.data\nintelnops: "  	GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6  	GENERIC_NOP7 GENERIC_NOP8); -asm("\t.data\nk8nops: " -	K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 -	K8_NOP7 K8_NOP8); -asm("\t.data\nk7nops: " -	K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 -	K7_NOP7 K7_NOP8); - -extern unsigned char intelnops[], k8nops[], k7nops[]; +extern unsigned char intelnops[];  static unsigned char *intel_nops[ASM_NOP_MAX+1] = {  	NULL,  	intelnops, @@ -36,6 +50,13 @@ static unsigned char *intel_nops[ASM_NOP_MAX+1] = {  	intelnops + 1 + 2 + 3 + 4 + 5 + 6,  	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,  }; +#endif + +#ifdef K8_NOP1 +asm("\t.data\nk8nops: " +	K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 +	K8_NOP7 K8_NOP8); +extern unsigned char k8nops[];  static unsigned char *k8_nops[ASM_NOP_MAX+1] = {  	NULL,  	k8nops, @@ -47,6 +68,13 @@ static unsigned char *k8_nops[ASM_NOP_MAX+1] = {  	k8nops + 1 + 2 + 3 + 4 + 5 + 6,  	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,  }; +#endif + +#ifdef K7_NOP1 +asm("\t.data\nk7nops: " +	K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 +	K7_NOP7 K7_NOP8); +extern unsigned char k7nops[];  static unsigned char *k7_nops[ASM_NOP_MAX+1] = {  	NULL,  	k7nops, @@ -58,6 +86,18 @@ static unsigned char *k7_nops[ASM_NOP_MAX+1] = {  	k7nops + 1 + 2 + 3 + 4 + 5 + 6,  	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,  }; +#endif + +#ifdef CONFIG_X86_64 + +extern char __vsyscall_0; +static inline unsigned char** find_nop_table(void) +{ +	return k8_nops; +} + +#else /* CONFIG_X86_64 */ +  static struct nop {  	int cpuid;  	unsigned char **noptable; @@ -67,14 +107,6 @@ static struct nop {  	{ -1, NULL }  }; - -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; -extern u8 *__smp_locks[], *__smp_locks_end[]; - -extern u8 __smp_alt_begin[], __smp_alt_end[]; - -  static unsigned char** find_nop_table(void)  {  	unsigned char **noptable = intel_nops; @@ -89,6 +121,14 @@ static unsigned char** find_nop_table(void)  	return noptable;  } +#endif /* CONFIG_X86_64 */ + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; +extern u8 *__smp_locks[], *__smp_locks_end[]; + +extern u8 __smp_alt_begin[], __smp_alt_end[]; +  /* Replace instructions with better alternatives for this CPU type.     This runs before SMP is initialized to avoid SMP problems with     self modifying code. 
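/*
 * The rewritten DPRINTK earlier in this hunk gates its output on a boot
 * parameter instead of a compile-time #if, so one kernel binary can be
 * debugged without rebuilding.  A minimal userspace sketch of the same
 * pattern (flag name and message are illustrative, not from the commit):
 */
#include <stdio.h>

static int debug_alternative;	/* would be set by a __setup() handler */

#define DPRINTK(fmt, args...) do {		\
	if (debug_alternative)			\
		fprintf(stderr, fmt, args);	\
} while (0)

int main(int argc, char **argv)
{
	debug_alternative = argc > 1;	/* stand-in for "debug-alternative" */
	DPRINTK("%s: patching %d sites\n", __func__, 3);
	return 0;
}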
This implies that assymetric systems where @@ -99,6 +139,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)  {  	unsigned char **noptable = find_nop_table();  	struct alt_instr *a; +	u8 *instr;  	int diff, i, k;  	DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); @@ -106,7 +147,16 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)  		BUG_ON(a->replacementlen > a->instrlen);  		if (!boot_cpu_has(a->cpuid))  			continue; -		memcpy(a->instr, a->replacement, a->replacementlen); +		instr = a->instr; +#ifdef CONFIG_X86_64 +		/* vsyscall code is not mapped yet. resolve it manually. */ +		if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { +			instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); +			DPRINTK("%s: vsyscall fixup: %p => %p\n", +				__FUNCTION__, a->instr, instr); +		} +#endif +		memcpy(instr, a->replacement, a->replacementlen);  		diff = a->instrlen - a->replacementlen;  		/* Pad the rest with nops */  		for (i = a->replacementlen; diff > 0; diff -= k, i += k) { @@ -186,14 +236,6 @@ struct smp_alt_module {  static LIST_HEAD(smp_alt_modules);  static DEFINE_SPINLOCK(smp_alt); -static int smp_alt_once = 0; -static int __init bootonly(char *str) -{ -	smp_alt_once = 1; -	return 1; -} -__setup("smp-alt-boot", bootonly); -  void alternatives_smp_module_add(struct module *mod, char *name,  				 void *locks, void *locks_end,  				 void *text,  void *text_end) @@ -201,6 +243,9 @@ void alternatives_smp_module_add(struct module *mod, char *name,  	struct smp_alt_module *smp;  	unsigned long flags; +	if (no_replacement) +		return; +  	if (smp_alt_once) {  		if (boot_cpu_has(X86_FEATURE_UP))  			alternatives_smp_unlock(locks, locks_end, @@ -235,7 +280,7 @@ void alternatives_smp_module_del(struct module *mod)  	struct smp_alt_module *item;  	unsigned long flags; -	if (smp_alt_once) +	if (no_replacement || smp_alt_once)  		return;  	spin_lock_irqsave(&smp_alt, flags); @@ -256,7 +301,7 @@ void alternatives_smp_switch(int smp)  	struct smp_alt_module *mod;  	unsigned long flags; -	if (smp_alt_once) +	if (no_replacement || smp_alt_once)  		return;  	BUG_ON(!smp && (num_online_cpus() > 1)); @@ -285,6 +330,13 @@ void alternatives_smp_switch(int smp)  void __init alternative_instructions(void)  { +	if (no_replacement) { +		printk(KERN_INFO "(SMP-)alternatives turned off\n"); +		free_init_pages("SMP alternatives", +				(unsigned long)__smp_alt_begin, +				(unsigned long)__smp_alt_end); +		return; +	}  	apply_alternatives(__alt_instructions, __alt_instructions_end);  	/* switch to patch-once-at-boottime-only mode and free the diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 5ab59c12335b..7ce09492fc0c 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -36,6 +36,7 @@  #include <asm/arch_hooks.h>  #include <asm/hpet.h>  #include <asm/i8253.h> +#include <asm/nmi.h>  #include <mach_apic.h>  #include <mach_apicdef.h> @@ -156,7 +157,7 @@ void clear_local_APIC(void)  	maxlvt = get_maxlvt();  	/* -	 * Masking an LVT entry on a P6 can trigger a local APIC error +	 * Masking an LVT entry can trigger a local APIC error  	 * if the vector is zero. Mask LVTERR first to prevent this.  	 
*/  	if (maxlvt >= 3) { @@ -1117,7 +1118,18 @@ void disable_APIC_timer(void)  		unsigned long v;  		v = apic_read(APIC_LVTT); -		apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); +		/* +		 * When an illegal vector value (0-15) is written to an LVT +		 * entry and delivery mode is Fixed, the APIC may signal an +		 * illegal vector error, with out regard to whether the mask +		 * bit is set or whether an interrupt is actually seen on input. +		 * +		 * Boot sequence might call this function when the LVTT has +		 * '0' vector value. So make sure vector field is set to +		 * valid value. +		 */ +		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); +		apic_write_around(APIC_LVTT, v);  	}  } diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 9e819eb68229..7c5729d1fd06 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -764,9 +764,9 @@ static int apm_do_idle(void)  	int	idled = 0;  	int	polling; -	polling = test_thread_flag(TIF_POLLING_NRFLAG); +	polling = !!(current_thread_info()->status & TS_POLLING);  	if (polling) { -		clear_thread_flag(TIF_POLLING_NRFLAG); +		current_thread_info()->status &= ~TS_POLLING;  		smp_mb__after_clear_bit();  	}  	if (!need_resched()) { @@ -774,7 +774,7 @@ static int apm_do_idle(void)  		ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);  	}  	if (polling) -		set_thread_flag(TIF_POLLING_NRFLAG); +		current_thread_info()->status |= TS_POLLING;  	if (!idled)  		return 0; diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 36d66e2077d0..1c3a809e6421 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -4,6 +4,7 @@   * to extract and format the required data.   */ +#include <linux/crypto.h>  #include <linux/sched.h>  #include <linux/signal.h>  #include <linux/personality.h> @@ -69,4 +70,6 @@ void foo(void)  	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);  	DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL)); + +	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);  } diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 786d1a57048b..fd0457c9c827 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -224,15 +224,17 @@ static void __init init_amd(struct cpuinfo_x86 *c)  #ifdef CONFIG_X86_HT  	/* -	 * On a AMD dual core setup the lower bits of the APIC id -	 * distingush the cores.  Assumes number of cores is a power -	 * of two. +	 * On a AMD multi core setup the lower bits of the APIC id +	 * distingush the cores.  	 
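/*
 * A standalone sketch of the APIC-id split performed just below: the
 * core-bit width comes from CPUID 0x80000008 ECX[15:12] when available,
 * otherwise from the smallest power of two covering x86_max_cores
 * (function name and sample values are illustrative):
 */
#include <stdio.h>

static void split_apic_id(unsigned apicid, unsigned max_cores, unsigned bits)
{
	if (bits == 0)				/* fallback: derive width from core count */
		while ((1u << bits) < max_cores)
			bits++;
	printf("core %u, package %u\n",
	       apicid & ((1u << bits) - 1),	/* low bits select the core */
	       apicid >> bits);			/* remaining bits select the package */
}

int main(void)
{
	split_apic_id(0x13, 2, 0);	/* prints "core 1, package 9" */
	return 0;
}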
*/  	if (c->x86_max_cores > 1) {  		int cpu = smp_processor_id(); -		unsigned bits = 0; -		while ((1 << bits) < c->x86_max_cores) -			bits++; +		unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf; + +		if (bits == 0) { +			while ((1 << bits) < c->x86_max_cores) +				bits++; +		}  		cpu_core_id[cpu] = phys_proc_id[cpu] & ((1<<bits)-1);  		phys_proc_id[cpu] >>= bits;  		printk(KERN_INFO "CPU %d(%d) -> Core %d\n", @@ -240,6 +242,8 @@ static void __init init_amd(struct cpuinfo_x86 *c)  	}  #endif +	if (cpuid_eax(0x80000000) >= 0x80000006) +		num_cache_leaves = 3;  }  static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 5386b29bb5a5..10afc645c540 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -122,6 +122,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)  	select_idle_routine(c);  	l2 = init_intel_cacheinfo(c); +	if (c->cpuid_level > 9 ) { +		unsigned eax = cpuid_eax(10); +		/* Check for version and the number of counters */ +		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) +			set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability); +	}  	/* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */  	if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index c8547a6fa7e6..6c37b4fd8ce2 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -4,6 +4,7 @@   *      Changes:   *      Venkatesh Pallipadi	: Adding cache identification through cpuid(4)   *		Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. + *	Andi Kleen		: CPUID4 emulation on AMD.   */  #include <linux/init.h> @@ -130,25 +131,111 @@ struct _cpuid4_info {  	cpumask_t shared_cpu_map;  }; -static unsigned short			num_cache_leaves; +unsigned short			num_cache_leaves; + +/* AMD doesn't have CPUID4. Emulate it here to report the same +   information to the user.  This makes some assumptions about the machine: +   No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs. + +   In theory the TLBs could be reported as fake type (they are in "dummy"). +   Maybe later */ +union l1_cache { +	struct { +		unsigned line_size : 8; +		unsigned lines_per_tag : 8; +		unsigned assoc : 8; +		unsigned size_in_kb : 8; +	}; +	unsigned val; +}; + +union l2_cache { +	struct { +		unsigned line_size : 8; +		unsigned lines_per_tag : 4; +		unsigned assoc : 4; +		unsigned size_in_kb : 16; +	}; +	unsigned val; +}; + +static unsigned short assocs[] = { +	[1] = 1, [2] = 2, [4] = 4, [6] = 8, +	[8] = 16, +	[0xf] = 0xffff // ?? 
+	}; +static unsigned char levels[] = { 1, 1, 2 }; +static unsigned char types[] = { 1, 2, 3 }; + +static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, +		       union _cpuid4_leaf_ebx *ebx, +		       union _cpuid4_leaf_ecx *ecx) +{ +	unsigned dummy; +	unsigned line_size, lines_per_tag, assoc, size_in_kb; +	union l1_cache l1i, l1d; +	union l2_cache l2; + +	eax->full = 0; +	ebx->full = 0; +	ecx->full = 0; + +	cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); +	cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy); + +	if (leaf > 2 || !l1d.val || !l1i.val || !l2.val) +		return; + +	eax->split.is_self_initializing = 1; +	eax->split.type = types[leaf]; +	eax->split.level = levels[leaf]; +	eax->split.num_threads_sharing = 0; +	eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; + +	if (leaf <= 1) { +		union l1_cache *l1 = leaf == 0 ? &l1d : &l1i; +		assoc = l1->assoc; +		line_size = l1->line_size; +		lines_per_tag = l1->lines_per_tag; +		size_in_kb = l1->size_in_kb; +	} else { +		assoc = l2.assoc; +		line_size = l2.line_size; +		lines_per_tag = l2.lines_per_tag; +		/* cpu_data has errata corrections for K7 applied */ +		size_in_kb = current_cpu_data.x86_cache_size; +	} + +	if (assoc == 0xf) +		eax->split.is_fully_associative = 1; +	ebx->split.coherency_line_size = line_size - 1; +	ebx->split.ways_of_associativity = assocs[assoc] - 1; +	ebx->split.physical_line_partition = lines_per_tag - 1; +	ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / +		(ebx->split.ways_of_associativity + 1) - 1; +}  static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)  { -	unsigned int		eax, ebx, ecx, edx; -	union _cpuid4_leaf_eax	cache_eax; +	union _cpuid4_leaf_eax 	eax; +	union _cpuid4_leaf_ebx 	ebx; +	union _cpuid4_leaf_ecx 	ecx; +	unsigned		edx; -	cpuid_count(4, index, &eax, &ebx, &ecx, &edx); -	cache_eax.full = eax; -	if (cache_eax.split.type == CACHE_TYPE_NULL) +	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) +		amd_cpuid4(index, &eax, &ebx, &ecx); +	else +		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full,  &edx); +	if (eax.split.type == CACHE_TYPE_NULL)  		return -EIO; /* better error ? */ -	this_leaf->eax.full = eax; -	this_leaf->ebx.full = ebx; -	this_leaf->ecx.full = ecx; -	this_leaf->size = (this_leaf->ecx.split.number_of_sets + 1) * -		(this_leaf->ebx.split.coherency_line_size + 1) * -		(this_leaf->ebx.split.physical_line_partition + 1) * -		(this_leaf->ebx.split.ways_of_associativity + 1); +	this_leaf->eax = eax; +	this_leaf->ebx = ebx; +	this_leaf->ecx = ecx; +	this_leaf->size = (ecx.split.number_of_sets + 1) * +		(ebx.split.coherency_line_size + 1) * +		(ebx.split.physical_line_partition + 1) * +		(ebx.split.ways_of_associativity + 1);  	return 0;  } diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index 13288d9793a7..48f0f62f781c 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -120,14 +120,9 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)  	return 1;  } -/* - * By using the NMI code instead of a vector we just sneak thru the - * word generator coming out with just what we want.  AND it does - * not matter if clustered_apic_mode is set or not. 
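/*
 * The CPUID4 emulation earlier in this hunk stores cache geometry as
 * "minus one" fields, and cpuid4_cache_lookup() multiplies them back
 * out, so the total size must round-trip as sets * line_size * ways.
 * A quick check of that identity (sample L2 values, purely illustrative):
 */
#include <assert.h>

int main(void)
{
	unsigned size_in_kb = 512, line_size = 64, assoc = 16;
	unsigned sets = (size_in_kb * 1024) / line_size / assoc;	/* 512 */

	assert(sets * line_size * assoc == size_in_kb * 1024);
	return 0;
}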
- */  static void smp_send_nmi_allbutself(void)  { -	send_IPI_allbutself(APIC_DM_NMI); +	send_IPI_allbutself(NMI_VECTOR);  }  static void nmi_shootdown_cpus(void) diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index cfc683f153b9..e6e4506e749a 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -48,6 +48,7 @@  #include <asm/smp.h>  #include <asm/page.h>  #include <asm/desc.h> +#include <asm/dwarf2.h>  #include "irq_vectors.h"  #define nr_syscalls ((syscall_table_size)/4) @@ -85,31 +86,67 @@ VM_MASK		= 0x00020000  #define SAVE_ALL \  	cld; \  	pushl %es; \ +	CFI_ADJUST_CFA_OFFSET 4;\ +	/*CFI_REL_OFFSET es, 0;*/\  	pushl %ds; \ +	CFI_ADJUST_CFA_OFFSET 4;\ +	/*CFI_REL_OFFSET ds, 0;*/\  	pushl %eax; \ +	CFI_ADJUST_CFA_OFFSET 4;\ +	CFI_REL_OFFSET eax, 0;\  	pushl %ebp; \ +	CFI_ADJUST_CFA_OFFSET 4;\ +	CFI_REL_OFFSET ebp, 0;\  	pushl %edi; \ +	CFI_ADJUST_CFA_OFFSET 4;\ +	CFI_REL_OFFSET edi, 0;\  	pushl %esi; \ +	CFI_ADJUST_CFA_OFFSET 4;\ +	CFI_REL_OFFSET esi, 0;\  	pushl %edx; \ +	CFI_ADJUST_CFA_OFFSET 4;\ +	CFI_REL_OFFSET edx, 0;\  	pushl %ecx; \ +	CFI_ADJUST_CFA_OFFSET 4;\ +	CFI_REL_OFFSET ecx, 0;\  	pushl %ebx; \ +	CFI_ADJUST_CFA_OFFSET 4;\ +	CFI_REL_OFFSET ebx, 0;\  	movl $(__USER_DS), %edx; \  	movl %edx, %ds; \  	movl %edx, %es;  #define RESTORE_INT_REGS \  	popl %ebx;	\ +	CFI_ADJUST_CFA_OFFSET -4;\ +	CFI_RESTORE ebx;\  	popl %ecx;	\ +	CFI_ADJUST_CFA_OFFSET -4;\ +	CFI_RESTORE ecx;\  	popl %edx;	\ +	CFI_ADJUST_CFA_OFFSET -4;\ +	CFI_RESTORE edx;\  	popl %esi;	\ +	CFI_ADJUST_CFA_OFFSET -4;\ +	CFI_RESTORE esi;\  	popl %edi;	\ +	CFI_ADJUST_CFA_OFFSET -4;\ +	CFI_RESTORE edi;\  	popl %ebp;	\ -	popl %eax +	CFI_ADJUST_CFA_OFFSET -4;\ +	CFI_RESTORE ebp;\ +	popl %eax;	\ +	CFI_ADJUST_CFA_OFFSET -4;\ +	CFI_RESTORE eax  #define RESTORE_REGS	\  	RESTORE_INT_REGS; \  1:	popl %ds;	\ +	CFI_ADJUST_CFA_OFFSET -4;\ +	/*CFI_RESTORE ds;*/\  2:	popl %es;	\ +	CFI_ADJUST_CFA_OFFSET -4;\ +	/*CFI_RESTORE es;*/\  .section .fixup,"ax";	\  3:	movl $0,(%esp);	\  	jmp 1b;		\ @@ -122,13 +159,43 @@ VM_MASK		= 0x00020000  	.long 2b,4b;	\  .previous +#define RING0_INT_FRAME \ +	CFI_STARTPROC simple;\ +	CFI_DEF_CFA esp, 3*4;\ +	/*CFI_OFFSET cs, -2*4;*/\ +	CFI_OFFSET eip, -3*4 + +#define RING0_EC_FRAME \ +	CFI_STARTPROC simple;\ +	CFI_DEF_CFA esp, 4*4;\ +	/*CFI_OFFSET cs, -2*4;*/\ +	CFI_OFFSET eip, -3*4 + +#define RING0_PTREGS_FRAME \ +	CFI_STARTPROC simple;\ +	CFI_DEF_CFA esp, OLDESP-EBX;\ +	/*CFI_OFFSET cs, CS-OLDESP;*/\ +	CFI_OFFSET eip, EIP-OLDESP;\ +	/*CFI_OFFSET es, ES-OLDESP;*/\ +	/*CFI_OFFSET ds, DS-OLDESP;*/\ +	CFI_OFFSET eax, EAX-OLDESP;\ +	CFI_OFFSET ebp, EBP-OLDESP;\ +	CFI_OFFSET edi, EDI-OLDESP;\ +	CFI_OFFSET esi, ESI-OLDESP;\ +	CFI_OFFSET edx, EDX-OLDESP;\ +	CFI_OFFSET ecx, ECX-OLDESP;\ +	CFI_OFFSET ebx, EBX-OLDESP  ENTRY(ret_from_fork) +	CFI_STARTPROC  	pushl %eax +	CFI_ADJUST_CFA_OFFSET -4  	call schedule_tail  	GET_THREAD_INFO(%ebp)  	popl %eax +	CFI_ADJUST_CFA_OFFSET -4  	jmp syscall_exit +	CFI_ENDPROC  /*   * Return to user mode is not as complex as all this looks, @@ -139,6 +206,7 @@ ENTRY(ret_from_fork)  	# userspace resumption stub bypassing syscall exit tracing  	ALIGN +	RING0_PTREGS_FRAME  ret_from_exception:  	preempt_stop  ret_from_intr: @@ -171,20 +239,33 @@ need_resched:  	call preempt_schedule_irq  	jmp need_resched  #endif +	CFI_ENDPROC  /* SYSENTER_RETURN points to after the "sysenter" instruction in     the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  
*/  	# sysenter call handler stub  ENTRY(sysenter_entry) +	CFI_STARTPROC simple +	CFI_DEF_CFA esp, 0 +	CFI_REGISTER esp, ebp  	movl TSS_sysenter_esp0(%esp),%esp  sysenter_past_esp:  	sti  	pushl $(__USER_DS) +	CFI_ADJUST_CFA_OFFSET 4 +	/*CFI_REL_OFFSET ss, 0*/  	pushl %ebp +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET esp, 0  	pushfl +	CFI_ADJUST_CFA_OFFSET 4  	pushl $(__USER_CS) +	CFI_ADJUST_CFA_OFFSET 4 +	/*CFI_REL_OFFSET cs, 0*/  	pushl $SYSENTER_RETURN +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET eip, 0  /*   * Load the potential sixth argument from user stack. @@ -199,6 +280,7 @@ sysenter_past_esp:  .previous  	pushl %eax +	CFI_ADJUST_CFA_OFFSET 4  	SAVE_ALL  	GET_THREAD_INFO(%ebp) @@ -219,11 +301,14 @@ sysenter_past_esp:  	xorl %ebp,%ebp  	sti  	sysexit +	CFI_ENDPROC  	# system call handler stub  ENTRY(system_call) +	RING0_INT_FRAME			# can't unwind into user space anyway  	pushl %eax			# save orig_eax +	CFI_ADJUST_CFA_OFFSET 4  	SAVE_ALL  	GET_THREAD_INFO(%ebp)  	testl $TF_MASK,EFLAGS(%esp) @@ -256,10 +341,12 @@ restore_all:  	movb CS(%esp), %al  	andl $(VM_MASK | (4 << 8) | 3), %eax  	cmpl $((4 << 8) | 3), %eax +	CFI_REMEMBER_STATE  	je ldt_ss			# returning to user-space with LDT SS  restore_nocheck:  	RESTORE_REGS  	addl $4, %esp +	CFI_ADJUST_CFA_OFFSET -4  1:	iret  .section .fixup,"ax"  iret_exc: @@ -273,6 +360,7 @@ iret_exc:  	.long 1b,iret_exc  .previous +	CFI_RESTORE_STATE  ldt_ss:  	larl OLDSS(%esp), %eax  	jnz restore_nocheck @@ -285,11 +373,13 @@ ldt_ss:  	 * CPUs, which we can try to work around to make  	 * dosemu and wine happy. */  	subl $8, %esp		# reserve space for switch16 pointer +	CFI_ADJUST_CFA_OFFSET 8  	cli  	movl %esp, %eax  	/* Set up the 16bit stack frame with switch32 pointer on top,  	 * and a switch16 pointer on top of the current frame. 
*/  	call setup_x86_bogus_stack +	CFI_ADJUST_CFA_OFFSET -8	# frame has moved  	RESTORE_REGS  	lss 20+4(%esp), %esp	# switch to 16bit stack  1:	iret @@ -297,9 +387,11 @@ ldt_ss:  	.align 4  	.long 1b,iret_exc  .previous +	CFI_ENDPROC  	# perform work that needs to be done immediately before resumption  	ALIGN +	RING0_PTREGS_FRAME		# can't unwind into user space anyway  work_pending:  	testb $_TIF_NEED_RESCHED, %cl  	jz work_notifysig @@ -329,8 +421,10 @@ work_notifysig:				# deal with pending signals and  work_notifysig_v86:  #ifdef CONFIG_VM86  	pushl %ecx			# save ti_flags for do_notify_resume +	CFI_ADJUST_CFA_OFFSET 4  	call save_v86_state		# %eax contains pt_regs pointer  	popl %ecx +	CFI_ADJUST_CFA_OFFSET -4  	movl %eax, %esp  	xorl %edx, %edx  	call do_notify_resume @@ -363,19 +457,21 @@ syscall_exit_work:  	movl $1, %edx  	call do_syscall_trace  	jmp resume_userspace +	CFI_ENDPROC -	ALIGN +	RING0_INT_FRAME			# can't unwind into user space anyway  syscall_fault:  	pushl %eax			# save orig_eax +	CFI_ADJUST_CFA_OFFSET 4  	SAVE_ALL  	GET_THREAD_INFO(%ebp)  	movl $-EFAULT,EAX(%esp)  	jmp resume_userspace -	ALIGN  syscall_badsys:  	movl $-ENOSYS,EAX(%esp)  	jmp resume_userspace +	CFI_ENDPROC  #define FIXUP_ESPFIX_STACK \  	movl %esp, %eax; \ @@ -387,16 +483,21 @@ syscall_badsys:  	movl %eax, %esp;  #define UNWIND_ESPFIX_STACK \  	pushl %eax; \ +	CFI_ADJUST_CFA_OFFSET 4; \  	movl %ss, %eax; \  	/* see if on 16bit stack */ \  	cmpw $__ESPFIX_SS, %ax; \ -	jne 28f; \ -	movl $__KERNEL_DS, %edx; \ -	movl %edx, %ds; \ -	movl %edx, %es; \ +	je 28f; \ +27:	popl %eax; \ +	CFI_ADJUST_CFA_OFFSET -4; \ +.section .fixup,"ax"; \ +28:	movl $__KERNEL_DS, %eax; \ +	movl %eax, %ds; \ +	movl %eax, %es; \  	/* switch to 32bit stack */ \ -	FIXUP_ESPFIX_STACK \ -28:	popl %eax; +	FIXUP_ESPFIX_STACK; \ +	jmp 27b; \ +.previous  /*   * Build the entry stubs and pointer table with @@ -408,9 +509,14 @@ ENTRY(interrupt)  vector=0  ENTRY(irq_entries_start) +	RING0_INT_FRAME  .rept NR_IRQS  	ALIGN + .if vector +	CFI_ADJUST_CFA_OFFSET -4 + .endif  1:	pushl $vector-256 +	CFI_ADJUST_CFA_OFFSET 4  	jmp common_interrupt  .data  	.long 1b @@ -424,60 +530,99 @@ common_interrupt:  	movl %esp,%eax  	call do_IRQ  	jmp ret_from_intr +	CFI_ENDPROC  #define BUILD_INTERRUPT(name, nr)	\  ENTRY(name)				\ +	RING0_INT_FRAME;		\  	pushl $nr-256;			\ -	SAVE_ALL			\ +	CFI_ADJUST_CFA_OFFSET 4;	\ +	SAVE_ALL;			\  	movl %esp,%eax;			\  	call smp_/**/name;		\ -	jmp ret_from_intr; +	jmp ret_from_intr;	\ +	CFI_ENDPROC  /* The include is where all of the SMP etc. 
interrupts come from */  #include "entry_arch.h"  ENTRY(divide_error) +	RING0_INT_FRAME  	pushl $0			# no error code +	CFI_ADJUST_CFA_OFFSET 4  	pushl $do_divide_error +	CFI_ADJUST_CFA_OFFSET 4  	ALIGN  error_code:  	pushl %ds +	CFI_ADJUST_CFA_OFFSET 4 +	/*CFI_REL_OFFSET ds, 0*/  	pushl %eax +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET eax, 0  	xorl %eax, %eax  	pushl %ebp +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET ebp, 0  	pushl %edi +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET edi, 0  	pushl %esi +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET esi, 0  	pushl %edx +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET edx, 0  	decl %eax			# eax = -1  	pushl %ecx +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET ecx, 0  	pushl %ebx +	CFI_ADJUST_CFA_OFFSET 4 +	CFI_REL_OFFSET ebx, 0  	cld  	pushl %es +	CFI_ADJUST_CFA_OFFSET 4 +	/*CFI_REL_OFFSET es, 0*/  	UNWIND_ESPFIX_STACK  	popl %ecx +	CFI_ADJUST_CFA_OFFSET -4 +	/*CFI_REGISTER es, ecx*/  	movl ES(%esp), %edi		# get the function address  	movl ORIG_EAX(%esp), %edx	# get the error code  	movl %eax, ORIG_EAX(%esp)  	movl %ecx, ES(%esp) +	/*CFI_REL_OFFSET es, ES*/  	movl $(__USER_DS), %ecx  	movl %ecx, %ds  	movl %ecx, %es  	movl %esp,%eax			# pt_regs pointer  	call *%edi  	jmp ret_from_exception +	CFI_ENDPROC  ENTRY(coprocessor_error) +	RING0_INT_FRAME  	pushl $0 +	CFI_ADJUST_CFA_OFFSET 4  	pushl $do_coprocessor_error +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  ENTRY(simd_coprocessor_error) +	RING0_INT_FRAME  	pushl $0 +	CFI_ADJUST_CFA_OFFSET 4  	pushl $do_simd_coprocessor_error +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  ENTRY(device_not_available) +	RING0_INT_FRAME  	pushl $-1			# mark this as an int +	CFI_ADJUST_CFA_OFFSET 4  	SAVE_ALL  	movl %cr0, %eax  	testl $0x4, %eax		# EM (math emulation bit) @@ -487,9 +632,12 @@ ENTRY(device_not_available)  	jmp ret_from_exception  device_not_available_emulate:  	pushl $0			# temporary storage for ORIG_EIP +	CFI_ADJUST_CFA_OFFSET 4  	call math_emulate  	addl $4, %esp +	CFI_ADJUST_CFA_OFFSET -4  	jmp ret_from_exception +	CFI_ENDPROC  /*   * Debug traps and NMI can happen at the one SYSENTER instruction @@ -514,16 +662,19 @@ label:						\  	pushl $sysenter_past_esp  KPROBE_ENTRY(debug) +	RING0_INT_FRAME  	cmpl $sysenter_entry,(%esp)  	jne debug_stack_correct  	FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)  debug_stack_correct:  	pushl $-1			# mark this as an int +	CFI_ADJUST_CFA_OFFSET 4  	SAVE_ALL  	xorl %edx,%edx			# error code 0  	movl %esp,%eax			# pt_regs pointer  	call do_debug  	jmp ret_from_exception +	CFI_ENDPROC  	.previous .text  /*   * NMI is doubly nasty. It can happen _while_ we're handling @@ -534,14 +685,18 @@ debug_stack_correct:   * fault happened on the sysenter path.   */  ENTRY(nmi) +	RING0_INT_FRAME  	pushl %eax +	CFI_ADJUST_CFA_OFFSET 4  	movl %ss, %eax  	cmpw $__ESPFIX_SS, %ax  	popl %eax +	CFI_ADJUST_CFA_OFFSET -4  	je nmi_16bit_stack  	cmpl $sysenter_entry,(%esp)  	je nmi_stack_fixup  	pushl %eax +	CFI_ADJUST_CFA_OFFSET 4  	movl %esp,%eax  	/* Do not access memory above the end of our stack page,  	 * it might not exist. 
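/*
 * The NMI hunk below masks %esp with THREAD_SIZE-1 to find its offset
 * inside the power-of-two-sized thread stack, and skips the 12(%esp)
 * probe when fewer than 20 bytes remain, since that access could cross
 * into an unmapped page.  The same test in C (sizes illustrative):
 */
#include <stdbool.h>

#define THREAD_SIZE 8192ul	/* must be a power of two */

static bool too_close_to_stack_top(unsigned long esp)
{
	/* offset within the stack area; large values sit near the top */
	return (esp & (THREAD_SIZE - 1)) >= THREAD_SIZE - 20;
}

int main(void)
{
	return too_close_to_stack_top(0x5ffc) ? 0 : 1;	/* 8188 >= 8172: too close */
}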
@@ -549,16 +704,19 @@ ENTRY(nmi)  	andl $(THREAD_SIZE-1),%eax  	cmpl $(THREAD_SIZE-20),%eax  	popl %eax +	CFI_ADJUST_CFA_OFFSET -4  	jae nmi_stack_correct  	cmpl $sysenter_entry,12(%esp)  	je nmi_debug_stack_check  nmi_stack_correct:  	pushl %eax +	CFI_ADJUST_CFA_OFFSET 4  	SAVE_ALL  	xorl %edx,%edx		# zero error code  	movl %esp,%eax		# pt_regs pointer  	call do_nmi  	jmp restore_all +	CFI_ENDPROC  nmi_stack_fixup:  	FIX_STACK(12,nmi_stack_correct, 1) @@ -574,94 +732,177 @@ nmi_debug_stack_check:  	jmp nmi_stack_correct  nmi_16bit_stack: +	RING0_INT_FRAME  	/* create the pointer to lss back */  	pushl %ss +	CFI_ADJUST_CFA_OFFSET 4  	pushl %esp +	CFI_ADJUST_CFA_OFFSET 4  	movzwl %sp, %esp  	addw $4, (%esp)  	/* copy the iret frame of 12 bytes */  	.rept 3  	pushl 16(%esp) +	CFI_ADJUST_CFA_OFFSET 4  	.endr  	pushl %eax +	CFI_ADJUST_CFA_OFFSET 4  	SAVE_ALL  	FIXUP_ESPFIX_STACK		# %eax == %esp +	CFI_ADJUST_CFA_OFFSET -20	# the frame has now moved  	xorl %edx,%edx			# zero error code  	call do_nmi  	RESTORE_REGS  	lss 12+4(%esp), %esp		# back to 16bit stack  1:	iret +	CFI_ENDPROC  .section __ex_table,"a"  	.align 4  	.long 1b,iret_exc  .previous  KPROBE_ENTRY(int3) +	RING0_INT_FRAME  	pushl $-1			# mark this as an int +	CFI_ADJUST_CFA_OFFSET 4  	SAVE_ALL  	xorl %edx,%edx		# zero error code  	movl %esp,%eax		# pt_regs pointer  	call do_int3  	jmp ret_from_exception +	CFI_ENDPROC  	.previous .text  ENTRY(overflow) +	RING0_INT_FRAME  	pushl $0 +	CFI_ADJUST_CFA_OFFSET 4  	pushl $do_overflow +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  ENTRY(bounds) +	RING0_INT_FRAME  	pushl $0 +	CFI_ADJUST_CFA_OFFSET 4  	pushl $do_bounds +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  ENTRY(invalid_op) +	RING0_INT_FRAME  	pushl $0 +	CFI_ADJUST_CFA_OFFSET 4  	pushl $do_invalid_op +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  ENTRY(coprocessor_segment_overrun) +	RING0_INT_FRAME  	pushl $0 +	CFI_ADJUST_CFA_OFFSET 4  	pushl $do_coprocessor_segment_overrun +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  ENTRY(invalid_TSS) +	RING0_EC_FRAME  	pushl $do_invalid_TSS +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  ENTRY(segment_not_present) +	RING0_EC_FRAME  	pushl $do_segment_not_present +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  ENTRY(stack_segment) +	RING0_EC_FRAME  	pushl $do_stack_segment +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  KPROBE_ENTRY(general_protection) +	RING0_EC_FRAME  	pushl $do_general_protection +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  	.previous .text  ENTRY(alignment_check) +	RING0_EC_FRAME  	pushl $do_alignment_check +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  KPROBE_ENTRY(page_fault) +	RING0_EC_FRAME  	pushl $do_page_fault +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  	.previous .text  #ifdef CONFIG_X86_MCE  ENTRY(machine_check) +	RING0_INT_FRAME  	pushl $0 +	CFI_ADJUST_CFA_OFFSET 4  	pushl machine_check_vector +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC  #endif  ENTRY(spurious_interrupt_bug) +	RING0_INT_FRAME  	pushl $0 +	CFI_ADJUST_CFA_OFFSET 4  	pushl $do_spurious_interrupt_bug +	CFI_ADJUST_CFA_OFFSET 4  	jmp error_code +	CFI_ENDPROC + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) +	CFI_STARTPROC +	movl	4(%esp), %edx +	movl	(%esp), %ecx +	leal	4(%esp), %eax +	movl	%ebx, EBX(%edx) +	xorl	%ebx, %ebx +	movl	%ebx, ECX(%edx) +	movl	%ebx, EDX(%edx) +	movl	%esi, ESI(%edx) +	movl	%edi, EDI(%edx) +	movl	%ebp, EBP(%edx) +	movl	%ebx, EAX(%edx) +	movl	
$__USER_DS, DS(%edx) +	movl	$__USER_DS, ES(%edx) +	movl	%ebx, ORIG_EAX(%edx) +	movl	%ecx, EIP(%edx) +	movl	12(%esp), %ecx +	movl	$__KERNEL_CS, CS(%edx) +	movl	%ebx, EFLAGS(%edx) +	movl	%eax, OLDESP(%edx) +	movl	8(%esp), %eax +	movl	%ecx, 8(%esp) +	movl	EBX(%edx), %ebx +	movl	$__KERNEL_DS, OLDSS(%edx) +	jmpl	*%eax +	CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif  .section .rodata,"a"  #include "syscall_table.S" diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c new file mode 100644 index 000000000000..c6737c35815d --- /dev/null +++ b/arch/i386/kernel/hpet.c @@ -0,0 +1,67 @@ +#include <linux/clocksource.h> +#include <linux/errno.h> +#include <linux/hpet.h> +#include <linux/init.h> + +#include <asm/hpet.h> +#include <asm/io.h> + +#define HPET_MASK	CLOCKSOURCE_MASK(32) +#define HPET_SHIFT	22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC	1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ +	return (cycle_t)readl(hpet_ptr); +} + +static struct clocksource clocksource_hpet = { +	.name		= "hpet", +	.rating		= 250, +	.read		= read_hpet, +	.mask		= HPET_MASK, +	.mult		= 0, /* set below */ +	.shift		= HPET_SHIFT, +	.is_continuous	= 1, +}; + +static int __init init_hpet_clocksource(void) +{ +	unsigned long hpet_period; +	void __iomem* hpet_base; +	u64 tmp; + +	if (!hpet_address) +		return -ENODEV; + +	/* calculate the hpet address: */ +	hpet_base = +		(void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); +	hpet_ptr = hpet_base + HPET_COUNTER; + +	/* calculate the frequency: */ +	hpet_period = readl(hpet_base + HPET_PERIOD); + +	/* +	 * hpet period is in femto seconds per cycle +	 * so we need to convert this to ns/cyc units +	 * aproximated by mult/2^shift +	 * +	 *  fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift +	 *  fsec/cyc * 1ns/1000000fsec * 2^shift = mult +	 *  fsec/cyc * 2^shift * 1nsec/1000000fsec = mult +	 *  (fsec/cyc << shift)/1000000 = mult +	 *  (hpet_period << shift)/FSEC_PER_NSEC = mult +	 */ +	tmp = (u64)hpet_period << HPET_SHIFT; +	do_div(tmp, FSEC_PER_NSEC); +	clocksource_hpet.mult = (u32)tmp; + +	return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c new file mode 100644 index 000000000000..477b24daff53 --- /dev/null +++ b/arch/i386/kernel/i8253.c @@ -0,0 +1,118 @@ +/* + * i8253.c  8253/PIT functions + * + */ +#include <linux/clocksource.h> +#include <linux/spinlock.h> +#include <linux/jiffies.h> +#include <linux/sysdev.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <asm/smp.h> +#include <asm/delay.h> +#include <asm/i8253.h> +#include <asm/io.h> + +#include "io_ports.h" + +DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); + +void setup_pit_timer(void) +{ +	unsigned long flags; + +	spin_lock_irqsave(&i8253_lock, flags); +	outb_p(0x34,PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */ +	udelay(10); +	outb_p(LATCH & 0xff , PIT_CH0);	/* LSB */ +	udelay(10); +	outb(LATCH >> 8 , PIT_CH0);	/* MSB */ +	spin_unlock_irqrestore(&i8253_lock, flags); +} + +/* + * Since the PIT overflows every tick, its not very useful + * to just read by itself. 
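/*
 * pit_read() below widens the 16-bit PIT down-counter into a
 * free-running value: jiffies supplies the coarse tick count and the
 * inverted hardware counter the position inside the current tick.
 * A simplified model of that composition (sampler is hypothetical):
 */
#include <stdint.h>
#include <stdio.h>

#define LATCH 11932	/* roughly 1193182 Hz / 100 Hz ticks */

static uint64_t free_running(uint32_t jiffies_now, int downcount)
{
	/* the hardware counts LATCH-1..0, so invert it to count upward */
	return (uint64_t)jiffies_now * LATCH + (LATCH - 1 - downcount);
}

int main(void)
{
	printf("%llu\n", (unsigned long long)free_running(1000, 5966));
	return 0;
}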
So use jiffies to emulate a free + * running counter: + */ +static cycle_t pit_read(void) +{ +	unsigned long flags; +	int count; +	u32 jifs; +	static int old_count; +	static u32 old_jifs; + +	spin_lock_irqsave(&i8253_lock, flags); +        /* +	 * Although our caller may have the read side of xtime_lock, +	 * this is now a seqlock, and we are cheating in this routine +	 * by having side effects on state that we cannot undo if +	 * there is a collision on the seqlock and our caller has to +	 * retry.  (Namely, old_jifs and old_count.)  So we must treat +	 * jiffies as volatile despite the lock.  We read jiffies +	 * before latching the timer count to guarantee that although +	 * the jiffies value might be older than the count (that is, +	 * the counter may underflow between the last point where +	 * jiffies was incremented and the point where we latch the +	 * count), it cannot be newer. +	 */ +	jifs = jiffies; +	outb_p(0x00, PIT_MODE);	/* latch the count ASAP */ +	count = inb_p(PIT_CH0);	/* read the latched count */ +	count |= inb_p(PIT_CH0) << 8; + +	/* VIA686a test code... reset the latch if count > max + 1 */ +	if (count > LATCH) { +		outb_p(0x34, PIT_MODE); +		outb_p(LATCH & 0xff, PIT_CH0); +		outb(LATCH >> 8, PIT_CH0); +		count = LATCH - 1; +	} + +	/* +	 * It's possible for count to appear to go the wrong way for a +	 * couple of reasons: +	 * +	 *  1. The timer counter underflows, but we haven't handled the +	 *     resulting interrupt and incremented jiffies yet. +	 *  2. Hardware problem with the timer, not giving us continuous time, +	 *     the counter does small "jumps" upwards on some Pentium systems, +	 *     (see c't 95/10 page 335 for Neptun bug.) +	 * +	 * Previous attempts to handle these cases intelligently were +	 * buggy, so we just do the simple thing now. +	 */ +	if (count > old_count && jifs == old_jifs) { +		count = old_count; +	} +	old_count = count; +	old_jifs = jifs; + +	spin_unlock_irqrestore(&i8253_lock, flags); + +	count = (LATCH - 1) - count; + +	return (cycle_t)(jifs * LATCH) + count; +} + +static struct clocksource clocksource_pit = { +	.name	= "pit", +	.rating = 110, +	.read	= pit_read, +	.mask	= CLOCKSOURCE_MASK(32), +	.mult	= 0, +	.shift	= 20, +}; + +static int __init init_pit_clocksource(void) +{ +	if (num_possible_cpus() > 4) /* PIT does not scale! 
*/ +		return 0; + +	clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); +	return clocksource_register(&clocksource_pit); +} +module_init(init_pit_clocksource); diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index a62df3e764c5..72ae414e4d49 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -38,6 +38,7 @@  #include <asm/desc.h>  #include <asm/timer.h>  #include <asm/i8259.h> +#include <asm/nmi.h>  #include <mach_apic.h> @@ -50,6 +51,7 @@ atomic_t irq_mis_count;  static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };  static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock);  int timer_over_8254 __initdata = 1; @@ -1161,10 +1163,17 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };  int assign_irq_vector(int irq)  {  	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; +	unsigned long flags; +	int vector; + +	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); -	BUG_ON(irq >= NR_IRQ_VECTORS); -	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) +	spin_lock_irqsave(&vector_lock, flags); + +	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { +		spin_unlock_irqrestore(&vector_lock, flags);  		return IO_APIC_VECTOR(irq); +	}  next:  	current_vector += 8;  	if (current_vector == SYSCALL_VECTOR) @@ -1172,16 +1181,21 @@ next:  	if (current_vector >= FIRST_SYSTEM_VECTOR) {  		offset++; -		if (!(offset%8)) +		if (!(offset%8)) { +			spin_unlock_irqrestore(&vector_lock, flags);  			return -ENOSPC; +		}  		current_vector = FIRST_DEVICE_VECTOR + offset;  	} -	vector_irq[current_vector] = irq; +	vector = current_vector; +	vector_irq[vector] = irq;  	if (irq != AUTO_ASSIGN) -		IO_APIC_VECTOR(irq) = current_vector; +		IO_APIC_VECTOR(irq) = vector; -	return current_vector; +	spin_unlock_irqrestore(&vector_lock, flags); + +	return vector;  }  static struct hw_interrupt_type ioapic_level_type; @@ -1193,21 +1207,14 @@ static struct hw_interrupt_type ioapic_edge_type;  static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)  { -	if (use_pci_vector() && !platform_legacy_irq(irq)) { -		if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || -				trigger == IOAPIC_LEVEL) -			irq_desc[vector].handler = &ioapic_level_type; -		else -			irq_desc[vector].handler = &ioapic_edge_type; -		set_intr_gate(vector, interrupt[vector]); -	} else	{ -		if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || -				trigger == IOAPIC_LEVEL) -			irq_desc[irq].handler = &ioapic_level_type; -		else -			irq_desc[irq].handler = &ioapic_edge_type; -		set_intr_gate(vector, interrupt[irq]); -	} +	unsigned idx = use_pci_vector() && !platform_legacy_irq(irq) ? 
vector : irq; + +	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || +			trigger == IOAPIC_LEVEL) +		irq_desc[idx].handler = &ioapic_level_type; +	else +		irq_desc[idx].handler = &ioapic_edge_type; +	set_intr_gate(vector, interrupt[idx]);  }  static void __init setup_IO_APIC_irqs(void) diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 49ce4c31b713..061533e0cb5e 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -227,7 +227,7 @@ int show_interrupts(struct seq_file *p, void *v)  	if (i == 0) {  		seq_printf(p, "           ");  		for_each_online_cpu(j) -			seq_printf(p, "CPU%d       ",j); +			seq_printf(p, "CPU%-8d",j);  		seq_putc(p, '\n');  	} diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 395a9a6dff88..727e419ad78a 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -57,34 +57,85 @@ static __always_inline void set_jmp_op(void *from, void *to)  /*   * returns non-zero if opcodes can be boosted.   */ -static __always_inline int can_boost(kprobe_opcode_t opcode) +static __always_inline int can_boost(kprobe_opcode_t *opcodes)  { -	switch (opcode & 0xf0 ) { +#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf)		      \ +	(((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \ +	  (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \ +	  (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \ +	  (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \ +	 << (row % 32)) +	/* +	 * Undefined/reserved opcodes, conditional jump, Opcode Extension +	 * Groups, and some special opcodes can not be boost. +	 */ +	static const unsigned long twobyte_is_boostable[256 / 32] = { +		/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */ +		/*      -------------------------------         */ +		W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */ +		W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */ +		W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */ +		W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */ +		W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ +		W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */ +		W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */ +		W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */ +		W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */ +		W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */ +		W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */ +		W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */ +		W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */ +		W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */ +		W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */ +		W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0)  /* f0 */ +		/*      -------------------------------         */ +		/*      0 1 2 3 4 5 6 7 8 9 a b c d e f         */ +	}; +#undef W +	kprobe_opcode_t opcode; +	kprobe_opcode_t *orig_opcodes = opcodes; +retry: +	if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) +		return 0; +	opcode = *(opcodes++); + +	/* 2nd-byte opcode */ +	if (opcode == 0x0f) { +		if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) +			return 0; +		return test_bit(*opcodes, twobyte_is_boostable); +	} + +	switch (opcode & 0xf0) { +	case 0x60: +		if (0x63 < opcode && opcode < 0x67) +			goto retry; /* prefixes */ +		/* can't boost Address-size override and bound */ +		return (opcode != 0x62 && opcode != 0x67);  	case 0x70:  		return 0; /* can't boost conditional jump */ -	case 0x90: -		/* can't boost call and pushf */ -		return opcode != 0x9a && opcode != 0x9c;  	case 
0xc0: -		/* can't boost undefined opcodes and soft-interruptions */ -		return (0xc1 < opcode && opcode < 0xc6) || -			(0xc7 < opcode && opcode < 0xcc) || opcode == 0xcf; +		/* can't boost software-interruptions */ +		return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;  	case 0xd0:  		/* can boost AA* and XLAT */  		return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);  	case 0xe0: -		/* can boost in/out and (may be) jmps */ -		return (0xe3 < opcode && opcode != 0xe8); +		/* can boost in/out and absolute jmps */ +		return ((opcode & 0x04) || opcode == 0xea);  	case 0xf0: +		if ((opcode & 0x0c) == 0 && opcode != 0xf1) +			goto retry; /* lock/rep(ne) prefix */  		/* clear and set flags can be boost */  		return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));  	default: -		/* currently, can't boost 2 bytes opcodes */ -		return opcode != 0x0f; +		if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) +			goto retry; /* prefixes */ +		/* can't boost CS override and call */ +		return (opcode != 0x2e && opcode != 0x9a);  	}  } -  /*   * returns non-zero if opcode modifies the interrupt flag.   */ @@ -109,7 +160,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)  	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));  	p->opcode = *p->addr; -	if (can_boost(p->opcode)) { +	if (can_boost(p->addr)) {  		p->ainsn.boostable = 0;  	} else {  		p->ainsn.boostable = -1; @@ -208,7 +259,9 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)  	struct kprobe_ctlblk *kcb;  #ifdef CONFIG_PREEMPT  	unsigned pre_preempt_count = preempt_count(); -#endif /* CONFIG_PREEMPT */ +#else +	unsigned pre_preempt_count = 1; +#endif  	addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); @@ -285,22 +338,14 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)  		/* handler has already set things up, so skip ss setup */  		return 1; -	if (p->ainsn.boostable == 1 && -#ifdef CONFIG_PREEMPT -	    !(pre_preempt_count) && /* -				       * This enables booster when the direct -				       * execution path aren't preempted. 
-				       */
-#endif /* CONFIG_PREEMPT */
-	    !p->post_handler && !p->break_handler ) {
+ss_probe:
+	if (pre_preempt_count && p->ainsn.boostable == 1 && !p->post_handler){
 		/* Boost up -- we can execute copied instructions directly */
 		reset_current_kprobe();
 		regs->eip = (unsigned long)p->ainsn.insn;
 		preempt_enable_no_resched();
 		return 1;
 	}
-
-ss_probe:
 	prepare_singlestep(p, regs);
 	kcb->kprobe_status = KPROBE_HIT_SS;
 	return 1;
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index d43b498ec745..a76e93146585 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -14,21 +14,17 @@
  */
 #include <linux/config.h>
-#include <linux/mm.h>
 #include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/smp_lock.h>
 #include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/nmi.h>
 #include <linux/sysdev.h>
 #include <linux/sysctl.h>
+#include <linux/percpu.h>
 
 #include <asm/smp.h>
-#include <asm/div64.h>
 #include <asm/nmi.h>
+#include <asm/intel_arch_perfmon.h>
 
 #include "mach_traps.h"
 
@@ -100,6 +96,9 @@ int nmi_active;
 	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT|	\
 	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
 
+#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
 #ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -212,6 +211,8 @@ static int __init setup_nmi_watchdog(char *str)
 
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
+static void disable_intel_arch_watchdog(void);
+
 static void disable_lapic_nmi_watchdog(void)
 {
 	if (nmi_active <= 0)
@@ -221,6 +222,10 @@ static void disable_lapic_nmi_watchdog(void)
 		wrmsr(MSR_K7_EVNTSEL0, 0, 0);
 		break;
 	case X86_VENDOR_INTEL:
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+			disable_intel_arch_watchdog();
+			break;
+		}
 		switch (boot_cpu_data.x86) {
 		case 6:
 			if (boot_cpu_data.x86_model > 0xd)
@@ -449,6 +454,53 @@ static int setup_p4_watchdog(void)
 	return 1;
 }
 
+static void disable_intel_arch_watchdog(void)
+{
+	unsigned ebx;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
+	 */
+	ebx = cpuid_ebx(10);
+	if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+		wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
+}
+
+static int setup_intel_arch_watchdog(void)
+{
+	unsigned int evntsel;
+	unsigned ebx;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
+	 */ +	ebx = cpuid_ebx(10); +	if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) +		return 0; + +	nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + +	clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); +	clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); + +	evntsel = ARCH_PERFMON_EVENTSEL_INT +		| ARCH_PERFMON_EVENTSEL_OS +		| ARCH_PERFMON_EVENTSEL_USR +		| ARCH_PERFMON_NMI_EVENT_SEL +		| ARCH_PERFMON_NMI_EVENT_UMASK; + +	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); +	write_watchdog_counter("INTEL_ARCH_PERFCTR0"); +	apic_write(APIC_LVTPC, APIC_DM_NMI); +	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; +	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); +	return 1; +} +  void setup_apic_nmi_watchdog (void)  {  	switch (boot_cpu_data.x86_vendor) { @@ -458,6 +510,11 @@ void setup_apic_nmi_watchdog (void)  		setup_k7_watchdog();  		break;  	case X86_VENDOR_INTEL: +		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { +			if (!setup_intel_arch_watchdog()) +				return; +			break; +		}  		switch (boot_cpu_data.x86) {  		case 6:  			if (boot_cpu_data.x86_model > 0xd) @@ -561,7 +618,8 @@ void nmi_watchdog_tick (struct pt_regs * regs)  			wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);  			apic_write(APIC_LVTPC, APIC_DM_NMI);  		} -		else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) { +		else if (nmi_perfctr_msr == MSR_P6_PERFCTR0 || +		         nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {  			/* Only P6 based Pentium M need to re-unmask  			 * the apic vector but it doesn't hurt  			 * other P6 variant */ diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c index 5f5b075f860a..0caf14652bad 100644 --- a/arch/i386/kernel/numaq.c +++ b/arch/i386/kernel/numaq.c @@ -79,10 +79,12 @@ int __init get_memcfg_numaq(void)  	return 1;  } -static int __init numaq_dsc_disable(void) +static int __init numaq_tsc_disable(void)  { -	printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); -	tsc_disable = 1; +	if (num_online_nodes() > 1) { +		printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); +		tsc_disable = 1; +	}  	return 0;  } -core_initcall(numaq_dsc_disable); +arch_initcall(numaq_tsc_disable); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 6259afea46d1..6946b06e2784 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -102,7 +102,7 @@ void default_idle(void)  	local_irq_enable();  	if (!hlt_counter && boot_cpu_data.hlt_works_ok) { -		clear_thread_flag(TIF_POLLING_NRFLAG); +		current_thread_info()->status &= ~TS_POLLING;  		smp_mb__after_clear_bit();  		while (!need_resched()) {  			local_irq_disable(); @@ -111,7 +111,7 @@ void default_idle(void)  			else  				local_irq_enable();  		} -		set_thread_flag(TIF_POLLING_NRFLAG); +		current_thread_info()->status |= TS_POLLING;  	} else {  		while (!need_resched())  			cpu_relax(); @@ -174,7 +174,7 @@ void cpu_idle(void)  {  	int cpu = smp_processor_id(); -	set_thread_flag(TIF_POLLING_NRFLAG); +	current_thread_info()->status |= TS_POLLING;  	/* endless idle loop with no priority at all */  	while (1) { @@ -312,7 +312,7 @@ void show_regs(struct pt_regs * regs)  	cr3 = read_cr3();  	cr4 = read_cr4_safe();  	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); -	show_trace(NULL, ®s->esp); +	show_trace(NULL, regs, ®s->esp);  }  /* diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 6bef9273733e..4a65040cc624 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -1575,6 +1575,7 @@ void __init setup_arch(char **cmdline_p)  	conswitchp = &dummy_con;  #endif  #endif +	tsc_init();  }  
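
The architectural-perfmon watchdog added in the nmi.c hunks above follows the same pattern as the existing K7 and P6 paths: pick an event that ticks while the CPU is busy, program counter 0 to overflow roughly once per watchdog period, and deliver the overflow as an NMI through the local APIC's performance-counter LVT entry. A condensed sketch of that sequence, not part of the patch itself: nmi_hz (the desired watchdog rate) is an assumed parameter, and the MSR/CPUID constants are the ones the patch takes from <asm/intel_arch_perfmon.h>.

	/* Illustrative sketch only -- restates setup_intel_arch_watchdog(). */
	static int sketch_setup_arch_perfmon_watchdog(unsigned int nmi_hz)
	{
		unsigned int evntsel;

		/* CPUID leaf 0xA: EBX bit 0 *clear* means the
		 * unhalted-core-cycles event is implemented. */
		if (cpuid_ebx(10) & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)
			return 0;

		/* Count cycles in user and kernel mode, interrupt on overflow. */
		evntsel = ARCH_PERFMON_EVENTSEL_INT
			| ARCH_PERFMON_EVENTSEL_OS
			| ARCH_PERFMON_EVENTSEL_USR
			| ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
			| ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK;
		wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0);

		/* Preload the counter so it wraps after about one period;
		 * in the patch this step is hidden inside
		 * write_watchdog_counter("INTEL_ARCH_PERFCTR0"). */
		wrmsr(MSR_ARCH_PERFMON_PERFCTR0, -(cpu_khz * 1000 / nmi_hz), -1);

		/* Route the overflow interrupt as an NMI, then enable. */
		apic_write(APIC_LVTPC, APIC_DM_NMI);
		wrmsr(MSR_ARCH_PERFMON_EVENTSEL0,
		      evntsel | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
		return 1;
	}

Because the event selector is architectural rather than family-specific, this one path covers future Intel CPUs that the old "switch (boot_cpu_data.x86)" ladders in nmi.c would otherwise have to enumerate model by model.
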
static __init int add_pcspkr(void) diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index d134e9643a58..c10789d7a9d3 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -114,7 +114,17 @@ DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_m  static inline int __prepare_ICR (unsigned int shortcut, int vector)  { -	return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL; +	unsigned int icr = shortcut | APIC_DEST_LOGICAL; + +	switch (vector) { +	default: +		icr |= APIC_DM_FIXED | vector; +		break; +	case NMI_VECTOR: +		icr |= APIC_DM_NMI; +		break; +	} +	return icr;  }  static inline int __prepare_ICR2 (unsigned int mask) diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index bd0ca5c9f053..bce5470ecb42 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -52,6 +52,7 @@  #include <asm/tlbflush.h>  #include <asm/desc.h>  #include <asm/arch_hooks.h> +#include <asm/nmi.h>  #include <mach_apic.h>  #include <mach_wakecpu.h> diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 9d3074759856..5f43d0410122 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -82,13 +82,6 @@ extern unsigned long wall_jiffies;  DEFINE_SPINLOCK(rtc_lock);  EXPORT_SYMBOL(rtc_lock); -#include <asm/i8253.h> - -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - -struct timer_opts *cur_timer __read_mostly = &timer_none; -  /*   * This is a special lock that is owned by the CPU and holds the index   * register we are working with.  It is required for NMI access to the @@ -118,99 +111,19 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)  }  EXPORT_SYMBOL(rtc_cmos_write); -/* - * This version of gettimeofday has microsecond resolution - * and better than microsecond precision on fast x86 machines with TSC. - */ -void do_gettimeofday(struct timeval *tv) -{ -	unsigned long seq; -	unsigned long usec, sec; -	unsigned long max_ntp_tick; - -	do { -		unsigned long lost; - -		seq = read_seqbegin(&xtime_lock); - -		usec = cur_timer->get_offset(); -		lost = jiffies - wall_jiffies; - -		/* -		 * If time_adjust is negative then NTP is slowing the clock -		 * so make sure not to go into next possible interval. -		 * Better to lose some accuracy than have time go backwards.. -		 */ -		if (unlikely(time_adjust < 0)) { -			max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; -			usec = min(usec, max_ntp_tick); - -			if (lost) -				usec += lost * max_ntp_tick; -		} -		else if (unlikely(lost)) -			usec += lost * (USEC_PER_SEC / HZ); - -		sec = xtime.tv_sec; -		usec += (xtime.tv_nsec / 1000); -	} while (read_seqretry(&xtime_lock, seq)); - -	while (usec >= 1000000) { -		usec -= 1000000; -		sec++; -	} - -	tv->tv_sec = sec; -	tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ -	time_t wtm_sec, sec = tv->tv_sec; -	long wtm_nsec, nsec = tv->tv_nsec; - -	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) -		return -EINVAL; - -	write_seqlock_irq(&xtime_lock); -	/* -	 * This is revolting. We need to set "xtime" correctly. However, the -	 * value in this location is the value at the most recent update of -	 * wall time.  Discover what correction gettimeofday() would have -	 * made, and then undo it! 
-	 */ -	nsec -= cur_timer->get_offset() * NSEC_PER_USEC; -	nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - -	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); -	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - -	set_normalized_timespec(&xtime, sec, nsec); -	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - -	ntp_clear(); -	write_sequnlock_irq(&xtime_lock); -	clock_was_set(); -	return 0; -} - -EXPORT_SYMBOL(do_settimeofday); -  static int set_rtc_mmss(unsigned long nowtime)  {  	int retval; - -	WARN_ON(irqs_disabled()); +	unsigned long flags;  	/* gets recalled with irq locally disabled */ -	spin_lock_irq(&rtc_lock); +	/* XXX - does irqsave resolve this? -johnstul */ +	spin_lock_irqsave(&rtc_lock, flags);  	if (efi_enabled)  		retval = efi_set_rtc_mmss(nowtime);  	else  		retval = mach_set_rtc_mmss(nowtime); -	spin_unlock_irq(&rtc_lock); +	spin_unlock_irqrestore(&rtc_lock, flags);  	return retval;  } @@ -218,16 +131,6 @@ static int set_rtc_mmss(unsigned long nowtime)  int timer_ack; -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - *		Note: This function is required to return accurate - *		time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ -	return cur_timer->monotonic_clock(); -} -EXPORT_SYMBOL(monotonic_clock); -  #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)  unsigned long profile_pc(struct pt_regs *regs)  { @@ -242,11 +145,21 @@ EXPORT_SYMBOL(profile_pc);  #endif  /* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * This is the same as the above, except we _also_ save the current + * Time Stamp Counter value at the time of the timer interrupt, so that + * we later on can estimate the time of day more exactly.   */ -static inline void do_timer_interrupt(int irq, struct pt_regs *regs) +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)  { +	/* +	 * Here we are in the timer irq handler. We just have irqs locally +	 * disabled but we don't know if the timer_bh is running on the other +	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need +	 * the irq version of write_lock because as just said we have irq +	 * locally disabled. -arca +	 */ +	write_seqlock(&xtime_lock); +  #ifdef CONFIG_X86_IO_APIC  	if (timer_ack) {  		/* @@ -279,27 +192,6 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs)  		irq = inb_p( 0x61 );	/* read the current state */  		outb_p( irq|0x80, 0x61 );	/* reset the IRQ */  	} -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ -	/* -	 * Here we are in the timer irq handler. We just have irqs locally -	 * disabled but we don't know if the timer_bh is running on the other -	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need -	 * the irq version of write_lock because as just said we have irq -	 * locally disabled. 
-arca -	 */ -	write_seqlock(&xtime_lock); - -	cur_timer->mark_offset(); -  -	do_timer_interrupt(irq, regs);  	write_sequnlock(&xtime_lock); @@ -380,7 +272,6 @@ void notify_arch_cmos_timer(void)  static long clock_cmos_diff, sleep_start; -static struct timer_opts *last_timer;  static int timer_suspend(struct sys_device *dev, pm_message_t state)  {  	/* @@ -389,10 +280,6 @@ static int timer_suspend(struct sys_device *dev, pm_message_t state)  	clock_cmos_diff = -get_cmos_time();  	clock_cmos_diff += get_seconds();  	sleep_start = get_cmos_time(); -	last_timer = cur_timer; -	cur_timer = &timer_none; -	if (last_timer->suspend) -		last_timer->suspend(state);  	return 0;  } @@ -415,10 +302,6 @@ static int timer_resume(struct sys_device *dev)  	jiffies_64 += sleep_length;  	wall_jiffies += sleep_length;  	write_sequnlock_irqrestore(&xtime_lock, flags); -	if (last_timer->resume) -		last_timer->resume(); -	cur_timer = last_timer; -	last_timer = NULL;  	touch_softlockup_watchdog();  	return 0;  } @@ -460,9 +343,6 @@ static void __init hpet_time_init(void)  		printk("Using HPET for base-timer\n");  	} -	cur_timer = select_timer(); -	printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); -  	time_init_hook();  }  #endif @@ -484,8 +364,5 @@ void __init time_init(void)  	set_normalized_timespec(&wall_to_monotonic,  		-xtime.tv_sec, -xtime.tv_nsec); -	cur_timer = select_timer(); -	printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); -  	time_init_hook();  } diff --git a/arch/i386/kernel/timers/Makefile b/arch/i386/kernel/timers/Makefile deleted file mode 100644 index 8fa12be658dd..000000000000 --- a/arch/i386/kernel/timers/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for x86 timers -# - -obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o - -obj-$(CONFIG_X86_CYCLONE_TIMER)	+= timer_cyclone.o -obj-$(CONFIG_HPET_TIMER)	+= timer_hpet.o -obj-$(CONFIG_X86_PM_TIMER)	+= timer_pm.o diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c deleted file mode 100644 index 8163fe0cf1f0..000000000000 --- a/arch/i386/kernel/timers/common.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - *	Common functions used across the timers go here - */ - -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/jiffies.h> -#include <linux/module.h> - -#include <asm/io.h> -#include <asm/timer.h> -#include <asm/hpet.h> - -#include "mach_timer.h" - -/* ------ Calibrate the TSC ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). - * Too much 64-bit arithmetic here to do this cleanly in C, and for - * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) - * output busy loop as low as possible. We avoid reading the CTC registers - * directly because of the awkward 8-bit access mechanism of the 82C54 - * device. 
- */ - -#define CALIBRATE_TIME	(5 * 1000020/HZ) - -unsigned long calibrate_tsc(void) -{ -	mach_prepare_counter(); - -	{ -		unsigned long startlow, starthigh; -		unsigned long endlow, endhigh; -		unsigned long count; - -		rdtsc(startlow,starthigh); -		mach_countup(&count); -		rdtsc(endlow,endhigh); - - -		/* Error: ECTCNEVERSET */ -		if (count <= 1) -			goto bad_ctc; - -		/* 64-bit subtract - gcc just messes up with long longs */ -		__asm__("subl %2,%0\n\t" -			"sbbl %3,%1" -			:"=a" (endlow), "=d" (endhigh) -			:"g" (startlow), "g" (starthigh), -			 "0" (endlow), "1" (endhigh)); - -		/* Error: ECPUTOOFAST */ -		if (endhigh) -			goto bad_ctc; - -		/* Error: ECPUTOOSLOW */ -		if (endlow <= CALIBRATE_TIME) -			goto bad_ctc; - -		__asm__("divl %2" -			:"=a" (endlow), "=d" (endhigh) -			:"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); - -		return endlow; -	} - -	/* -	 * The CTC wasn't reliable: we got a hit on the very first read, -	 * or the CPU was so fast/slow that the quotient wouldn't fit in -	 * 32 bits.. -	 */ -bad_ctc: -	return 0; -} - -#ifdef CONFIG_HPET_TIMER -/* ------ Calibrate the TSC using HPET ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq. - * Second output is parameter 1 (when non NULL) - * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet(). - * calibrate_tsc() calibrates the processor TSC by comparing - * it to the HPET timer of known frequency. - * Too much 64-bit arithmetic here to do this cleanly in C - */ -#define CALIBRATE_CNT_HPET 	(5 * hpet_tick) -#define CALIBRATE_TIME_HPET 	(5 * KERNEL_TICK_USEC) - -unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) -{ -	unsigned long tsc_startlow, tsc_starthigh; -	unsigned long tsc_endlow, tsc_endhigh; -	unsigned long hpet_start, hpet_end; -	unsigned long result, remain; - -	hpet_start = hpet_readl(HPET_COUNTER); -	rdtsc(tsc_startlow, tsc_starthigh); -	do { -		hpet_end = hpet_readl(HPET_COUNTER); -	} while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET); -	rdtsc(tsc_endlow, tsc_endhigh); - -	/* 64-bit subtract - gcc just messes up with long longs */ -	__asm__("subl %2,%0\n\t" -		"sbbl %3,%1" -		:"=a" (tsc_endlow), "=d" (tsc_endhigh) -		:"g" (tsc_startlow), "g" (tsc_starthigh), -		 "0" (tsc_endlow), "1" (tsc_endhigh)); - -	/* Error: ECPUTOOFAST */ -	if (tsc_endhigh) -		goto bad_calibration; - -	/* Error: ECPUTOOSLOW */ -	if (tsc_endlow <= CALIBRATE_TIME_HPET) -		goto bad_calibration; - -	ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET); -	if (remain > (tsc_endlow >> 1)) -		result++; /* rounding the result */ - -	if (tsc_hpet_quotient_ptr) { -		unsigned long tsc_hpet_quotient; - -		ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0, -			CALIBRATE_CNT_HPET); -		if (remain > (tsc_endlow >> 1)) -			tsc_hpet_quotient++; /* rounding the result */ -		*tsc_hpet_quotient_ptr = tsc_hpet_quotient; -	} - -	return result; -bad_calibration: -	/* -	 * the CPU was so fast/slow that the quotient wouldn't fit in -	 * 32 bits.. -	 */ -	return 0; -} -#endif - - -unsigned long read_timer_tsc(void) -{ -	unsigned long retval; -	rdtscl(retval); -	return retval; -} - - -/* calculate cpu_khz */ -void init_cpu_khz(void) -{ -	if (cpu_has_tsc) { -		unsigned long tsc_quotient = calibrate_tsc(); -		if (tsc_quotient) { -			/* report CPU clock rate in Hz. -			 * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = -			 * clock/second. Our precision is about 100 ppm. 
-			 */ -			{	unsigned long eax=0, edx=1000; -				__asm__("divl %2" -		       		:"=a" (cpu_khz), "=d" (edx) -        	       		:"r" (tsc_quotient), -	                	"0" (eax), "1" (edx)); -				printk("Detected %u.%03u MHz processor.\n", -					cpu_khz / 1000, cpu_khz % 1000); -			} -		} -	} -} - diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c deleted file mode 100644 index 7e39ed8e33f8..000000000000 --- a/arch/i386/kernel/timers/timer.c +++ /dev/null @@ -1,75 +0,0 @@ -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <asm/timer.h> - -#ifdef CONFIG_HPET_TIMER -/* - * HPET memory read is slower than tsc reads, but is more dependable as it - * always runs at constant frequency and reduces complexity due to - * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use - * timer_pit when HPET is active. So, we default to timer_tsc. - */ -#endif -/* list of timers, ordered by preference, NULL terminated */ -static struct init_timer_opts* __initdata timers[] = { -#ifdef CONFIG_X86_CYCLONE_TIMER -	&timer_cyclone_init, -#endif -#ifdef CONFIG_HPET_TIMER -	&timer_hpet_init, -#endif -#ifdef CONFIG_X86_PM_TIMER -	&timer_pmtmr_init, -#endif -	&timer_tsc_init, -	&timer_pit_init, -	NULL, -}; - -static char clock_override[10] __initdata; - -static int __init clock_setup(char* str) -{ -	if (str) -		strlcpy(clock_override, str, sizeof(clock_override)); -	return 1; -} -__setup("clock=", clock_setup); - - -/* The chosen timesource has been found to be bad. - * Fall back to a known good timesource (the PIT) - */ -void clock_fallback(void) -{ -	cur_timer = &timer_pit; -} - -/* iterates through the list of timers, returning the first  - * one that initializes successfully. - */ -struct timer_opts* __init select_timer(void) -{ -	int i = 0; -	 -	/* find most preferred working timer */ -	while (timers[i]) { -		if (timers[i]->init) -			if (timers[i]->init(clock_override) == 0) -				return timers[i]->opts; -		++i; -	} -		 -	panic("select_timer: Cannot find a suitable timer\n"); -	return NULL; -} - -int read_current_timer(unsigned long *timer_val) -{ -	if (cur_timer->read_timer) { -		*timer_val = cur_timer->read_timer(); -		return 0; -	} -	return -1; -} diff --git a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c deleted file mode 100644 index 13892a65c941..000000000000 --- a/arch/i386/kernel/timers/timer_cyclone.c +++ /dev/null @@ -1,259 +0,0 @@ -/*	Cyclone-timer:  - *		This code implements timer_ops for the cyclone counter found - *		on IBM x440, x360, and other Summit based systems. 
- * - *	Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com) - */ - - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -#include <asm/pgtable.h> -#include <asm/fixmap.h> -#include <asm/i8253.h> - -#include "io_ports.h" - -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -#define CYCLONE_CBAR_ADDR 0xFEB00CD0 -#define CYCLONE_PMCC_OFFSET 0x51A0 -#define CYCLONE_MPMC_OFFSET 0x51D0 -#define CYCLONE_MPCS_OFFSET 0x51A8 -#define CYCLONE_TIMER_FREQ 100000000 -#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */ -int use_cyclone = 0; - -static u32* volatile cyclone_timer;	/* Cyclone MPMC0 register */ -static u32 last_cyclone_low; -static u32 last_cyclone_high; -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* helper macro to atomically read both cyclone counter registers */ -#define read_cyclone_counter(low,high) \ -	do{ \ -		high = cyclone_timer[1]; low = cyclone_timer[0]; \ -	} while (high != cyclone_timer[1]); - - -static void mark_offset_cyclone(void) -{ -	unsigned long lost, delay; -	unsigned long delta = last_cyclone_low; -	int count; -	unsigned long long this_offset, last_offset; - -	write_seqlock(&monotonic_lock); -	last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; -	 -	spin_lock(&i8253_lock); -	read_cyclone_counter(last_cyclone_low,last_cyclone_high); - -	/* read values for delay_at_last_interrupt */ -	outb_p(0x00, 0x43);     /* latch the count ASAP */ - -	count = inb_p(0x40);    /* read the latched count */ -	count |= inb(0x40) << 8; - -	/* -	 * VIA686a test code... reset the latch if count > max + 1 -	 * from timer_pit.c - cjb -	 */ -	if (count > LATCH) { -		outb_p(0x34, PIT_MODE); -		outb_p(LATCH & 0xff, PIT_CH0); -		outb(LATCH >> 8, PIT_CH0); -		count = LATCH - 1; -	} -	spin_unlock(&i8253_lock); - -	/* lost tick compensation */ -	delta = last_cyclone_low - delta;	 -	delta /= (CYCLONE_TIMER_FREQ/1000000); -	delta += delay_at_last_interrupt; -	lost = delta/(1000000/HZ); -	delay = delta%(1000000/HZ); -	if (lost >= 2) -		jiffies_64 += lost-1; -	 -	/* update the monotonic base value */ -	this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; -	monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; -	write_sequnlock(&monotonic_lock); - -	/* calculate delay_at_last_interrupt */ -	count = ((LATCH-1) - count) * TICK_SIZE; -	delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - -	/* catch corner case where tick rollover occured  -	 * between cyclone and pit reads (as noted when  -	 * usec delta is > 90% # of usecs/tick) -	 */ -	if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) -		jiffies_64++; -} - -static unsigned long get_offset_cyclone(void) -{ -	u32 offset; - -	if(!cyclone_timer) -		return delay_at_last_interrupt; - -	/* Read the cyclone timer */ -	offset = cyclone_timer[0]; - -	/* .. relative to previous jiffy */ -	offset = offset - last_cyclone_low; - -	/* convert cyclone ticks to microseconds */	 -	/* XXX slow, can we speed this up? 
*/ -	offset = offset/(CYCLONE_TIMER_FREQ/1000000); - -	/* our adjusted time offset in microseconds */ -	return delay_at_last_interrupt + offset; -} - -static unsigned long long monotonic_clock_cyclone(void) -{ -	u32 now_low, now_high; -	unsigned long long last_offset, this_offset, base; -	unsigned long long ret; -	unsigned seq; - -	/* atomically read monotonic base & last_offset */ -	do { -		seq = read_seqbegin(&monotonic_lock); -		last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; -		base = monotonic_base; -	} while (read_seqretry(&monotonic_lock, seq)); - - -	/* Read the cyclone counter */ -	read_cyclone_counter(now_low,now_high); -	this_offset = ((unsigned long long)now_high<<32)|now_low; - -	/* convert to nanoseconds */ -	ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK); -	return ret * (1000000000 / CYCLONE_TIMER_FREQ); -} - -static int __init init_cyclone(char* override) -{ -	u32* reg;	 -	u32 base;		/* saved cyclone base address */ -	u32 pageaddr;	/* page that contains cyclone_timer register */ -	u32 offset;		/* offset from pageaddr to cyclone_timer register */ -	int i; -	 -	/* check clock override */ -	if (override[0] && strncmp(override,"cyclone",7)) -			return -ENODEV; - -	/*make sure we're on a summit box*/ -	if(!use_cyclone) return -ENODEV;  -	 -	printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); - -	/* find base address */ -	pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK; -	offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK); -	set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); -	reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); -	if(!reg){ -		printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); -		return -ENODEV; -	} -	base = *reg;	 -	if(!base){ -		printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); -		return -ENODEV; -	} -	 -	/* setup PMCC */ -	pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK; -	offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK); -	set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); -	reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); -	if(!reg){ -		printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); -		return -ENODEV; -	} -	reg[0] = 0x00000001; - -	/* setup MPCS */ -	pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK; -	offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK); -	set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); -	reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); -	if(!reg){ -		printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); -		return -ENODEV; -	} -	reg[0] = 0x00000001; - -	/* map in cyclone_timer */ -	pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK; -	offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK); -	set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); -	cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); -	if(!cyclone_timer){ -		printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); -		return -ENODEV; -	} - -	/*quick test to make sure its ticking*/ -	for(i=0; i<3; i++){ -		u32 old = cyclone_timer[0]; -		int stall = 100; -		while(stall--) barrier(); -		if(cyclone_timer[0] == old){ -			printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); -			cyclone_timer = 0; -			return -ENODEV; -		} -	} - -	init_cpu_khz(); - -	/* Everything looks good! 
*/ -	return 0; -} - - -static void delay_cyclone(unsigned long loops) -{ -	unsigned long bclock, now; -	if(!cyclone_timer) -		return; -	bclock = cyclone_timer[0]; -	do { -		rep_nop(); -		now = cyclone_timer[0]; -	} while ((now-bclock) < loops); -} -/************************************************************/ - -/* cyclone timer_opts struct */ -static struct timer_opts timer_cyclone = { -	.name = "cyclone", -	.mark_offset = mark_offset_cyclone,  -	.get_offset = get_offset_cyclone, -	.monotonic_clock =	monotonic_clock_cyclone, -	.delay = delay_cyclone, -}; - -struct init_timer_opts __initdata timer_cyclone_init = { -	.init = init_cyclone, -	.opts = &timer_cyclone, -}; diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c deleted file mode 100644 index 17a6fe7166e7..000000000000 --- a/arch/i386/kernel/timers/timer_hpet.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -#include <asm/processor.h> - -#include "io_ports.h" -#include "mach_timer.h" -#include <asm/hpet.h> - -static unsigned long hpet_usec_quotient __read_mostly;	/* convert hpet clks to usec */ -static unsigned long tsc_hpet_quotient __read_mostly;	/* convert tsc to hpet clks */ -static unsigned long hpet_last; 	/* hpet counter value at last tick*/ -static unsigned long last_tsc_low;	/* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; 	/* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - *  basic equation: - *		ns = cycles / (freq / ns_per_sec) - *		ns = cycles * (ns_per_sec / freq) - *		ns = cycles * (10^9 / (cpu_khz * 10^3)) - *		ns = cycles * (10^6 / cpu_khz) - * - *	Then we use scaling math (suggested by george@mvista.com) to get: - *		ns = cycles * (10^6 * SC / cpu_khz) / SC - *		ns = cycles * cyc2ns_scale / SC - * - *	And since SC is a constant power of two, we can convert the div - *  into a shift. - * - *  We can use khz divisor instead of mhz to keep a better percision, since - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - *  (mathieu.desnoyers@polymtl.ca) - * - *			-johnstul@us.ibm.com "math is hard, lets go shopping!" 
- */ -static unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ -	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ -	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static unsigned long long monotonic_clock_hpet(void) -{ -	unsigned long long last_offset, this_offset, base; -	unsigned seq; - -	/* atomically read monotonic base & last_offset */ -	do { -		seq = read_seqbegin(&monotonic_lock); -		last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; -		base = monotonic_base; -	} while (read_seqretry(&monotonic_lock, seq)); - -	/* Read the Time Stamp Counter */ -	rdtscll(this_offset); - -	/* return the value in ns */ -	return base + cycles_2_ns(this_offset - last_offset); -} - -static unsigned long get_offset_hpet(void) -{ -	register unsigned long eax, edx; - -	eax = hpet_readl(HPET_COUNTER); -	eax -= hpet_last;	/* hpet delta */ -	eax = min(hpet_tick, eax); -	/* -         * Time offset = (hpet delta) * ( usecs per HPET clock ) -	 *             = (hpet delta) * ( usecs per tick / HPET clocks per tick) -	 *             = (hpet delta) * ( hpet_usec_quotient ) / (2^32) -	 * -	 * Where, -	 * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick -	 * -	 * Using a mull instead of a divl saves some cycles in critical path. -         */ -	ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax); - -	/* our adjusted time offset in microseconds */ -	return edx; -} - -static void mark_offset_hpet(void) -{ -	unsigned long long this_offset, last_offset; -	unsigned long offset; - -	write_seqlock(&monotonic_lock); -	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; -	rdtsc(last_tsc_low, last_tsc_high); - -	if (hpet_use_timer) -		offset = hpet_readl(HPET_T0_CMP) - hpet_tick; -	else -		offset = hpet_readl(HPET_COUNTER); -	if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) { -		int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1; -		jiffies_64 += lost_ticks; -	} -	hpet_last = offset; - -	/* update the monotonic base value */ -	this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; -	monotonic_base += cycles_2_ns(this_offset - last_offset); -	write_sequnlock(&monotonic_lock); -} - -static void delay_hpet(unsigned long loops) -{ -	unsigned long hpet_start, hpet_end; -	unsigned long eax; - -	/* loops is the number of cpu cycles. Convert it to hpet clocks */ -	ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops); - -	hpet_start = hpet_readl(HPET_COUNTER); -	do { -		rep_nop(); -		hpet_end = hpet_readl(HPET_COUNTER); -	} while ((hpet_end - hpet_start) < (loops)); -} - -static struct timer_opts timer_hpet; - -static int __init init_hpet(char* override) -{ -	unsigned long result, remain; - -	/* check clock override */ -	if (override[0] && strncmp(override,"hpet",4)) -		return -ENODEV; - -	if (!is_hpet_enabled()) -		return -ENODEV; - -	printk("Using HPET for gettimeofday\n"); -	if (cpu_has_tsc) { -		unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient); -		if (tsc_quotient) { -			/* report CPU clock rate in Hz. -			 * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = -			 * clock/second. Our precision is about 100 ppm. 
-			 */ -			{	unsigned long eax=0, edx=1000; -				ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, -						eax, edx); -				printk("Detected %u.%03u MHz processor.\n", -					cpu_khz / 1000, cpu_khz % 1000); -			} -			set_cyc2ns_scale(cpu_khz); -		} -		/* set this only when cpu_has_tsc */ -		timer_hpet.read_timer = read_timer_tsc; -	} - -	/* -	 * Math to calculate hpet to usec multiplier -	 * Look for the comments at get_offset_hpet() -	 */ -	ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC); -	if (remain > (hpet_tick >> 1)) -		result++; /* rounding the result */ -	hpet_usec_quotient = result; - -	return 0; -} - -static int hpet_resume(void) -{ -	write_seqlock(&monotonic_lock); -	/* Assume this is the last mark offset time */ -	rdtsc(last_tsc_low, last_tsc_high); - -	if (hpet_use_timer) -		hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick; -	else -		hpet_last = hpet_readl(HPET_COUNTER); -	write_sequnlock(&monotonic_lock); -	return 0; -} -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_hpet __read_mostly = { -	.name = 		"hpet", -	.mark_offset =		mark_offset_hpet, -	.get_offset =		get_offset_hpet, -	.monotonic_clock =	monotonic_clock_hpet, -	.delay = 		delay_hpet, -	.resume	=		hpet_resume, -}; - -struct init_timer_opts __initdata timer_hpet_init = { -	.init =	init_hpet, -	.opts = &timer_hpet, -}; diff --git a/arch/i386/kernel/timers/timer_none.c b/arch/i386/kernel/timers/timer_none.c deleted file mode 100644 index 4ea2f414dbbd..000000000000 --- a/arch/i386/kernel/timers/timer_none.c +++ /dev/null @@ -1,39 +0,0 @@ -#include <linux/init.h> -#include <asm/timer.h> - -static void mark_offset_none(void) -{ -	/* nothing needed */ -} - -static unsigned long get_offset_none(void) -{ -	return 0; -} - -static unsigned long long monotonic_clock_none(void) -{ -	return 0; -} - -static void delay_none(unsigned long loops) -{ -	int d0; -	__asm__ __volatile__( -		"\tjmp 1f\n" -		".align 16\n" -		"1:\tjmp 2f\n" -		".align 16\n" -		"2:\tdecl %0\n\tjns 2b" -		:"=&a" (d0) -		:"0" (loops)); -} - -/* none timer_opts struct */ -struct timer_opts timer_none = { -	.name = 	"none", -	.mark_offset =	mark_offset_none,  -	.get_offset =	get_offset_none, -	.monotonic_clock =	monotonic_clock_none, -	.delay = delay_none, -}; diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c deleted file mode 100644 index b9b6bd56b9ba..000000000000 --- a/arch/i386/kernel/timers/timer_pit.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/sysdev.h> -#include <linux/timex.h> -#include <asm/delay.h> -#include <asm/mpspec.h> -#include <asm/timer.h> -#include <asm/smp.h> -#include <asm/io.h> -#include <asm/arch_hooks.h> -#include <asm/i8253.h> - -#include "do_timer.h" -#include "io_ports.h" - -static int count_p; /* counter in get_offset_pit() */ - -static int __init init_pit(char* override) -{ - 	/* check clock override */ - 	if (override[0] && strncmp(override,"pit",3)) - 		printk(KERN_ERR "Warning: clock= override failed. 
Defaulting " -				"to PIT\n"); - 	init_cpu_khz(); -	count_p = LATCH; -	return 0; -} - -static void mark_offset_pit(void) -{ -	/* nothing needed */ -} - -static unsigned long long monotonic_clock_pit(void) -{ -	return 0; -} - -static void delay_pit(unsigned long loops) -{ -	int d0; -	__asm__ __volatile__( -		"\tjmp 1f\n" -		".align 16\n" -		"1:\tjmp 2f\n" -		".align 16\n" -		"2:\tdecl %0\n\tjns 2b" -		:"=&a" (d0) -		:"0" (loops)); -} - - -/* This function must be called with xtime_lock held. - * It was inspired by Steve McCanne's microtime-i386 for BSD.  -- jrs - *  - * However, the pc-audio speaker driver changes the divisor so that - * it gets interrupted rather more often - it loads 64 into the - * counter rather than 11932! This has an adverse impact on - * do_gettimeoffset() -- it stops working! What is also not - * good is that the interval that our timer function gets called - * is no longer 10.0002 ms, but 9.9767 ms. To get around this - * would require using a different timing source. Maybe someone - * could use the RTC - I know that this can interrupt at frequencies - * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix - * it so that at startup, the timer code in sched.c would select - * using either the RTC or the 8253 timer. The decision would be - * based on whether there was any other device around that needed - * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz, - * and then do some jiggery to have a version of do_timer that  - * advanced the clock by 1/1024 s. Every time that reached over 1/100 - * of a second, then do all the old code. If the time was kept correct - * then do_gettimeoffset could just return 0 - there is no low order - * divider that can be accessed. - * - * Ideally, you would be able to use the RTC for the speaker driver, - * but it appears that the speaker driver really needs interrupt more - * often than every 120 us or so. - * - * Anyway, this needs more thought....		pjsg (1993-08-28) - *  - * If you are really that interested, you should be reading - * comp.protocols.time.ntp! - */ - -static unsigned long get_offset_pit(void) -{ -	int count; -	unsigned long flags; -	static unsigned long jiffies_p = 0; - -	/* -	 * cache volatile jiffies temporarily; we have xtime_lock.  -	 */ -	unsigned long jiffies_t; - -	spin_lock_irqsave(&i8253_lock, flags); -	/* timer count may underflow right here */ -	outb_p(0x00, PIT_MODE);	/* latch the count ASAP */ - -	count = inb_p(PIT_CH0);	/* read the latched count */ - -	/* -	 * We do this guaranteed double memory access instead of a _p  -	 * postfix in the previous port access. Wheee, hackady hack -	 */ - 	jiffies_t = jiffies; - -	count |= inb_p(PIT_CH0) << 8; -	 -        /* VIA686a test code... reset the latch if count > max + 1 */ -        if (count > LATCH) { -                outb_p(0x34, PIT_MODE); -                outb_p(LATCH & 0xff, PIT_CH0); -                outb(LATCH >> 8, PIT_CH0); -                count = LATCH - 1; -        } -	 -	/* -	 * avoiding timer inconsistencies (they are rare, but they happen)... -	 * there are two kinds of problems that must be avoided here: -	 *  1. the timer counter underflows -	 *  2. hardware problem with the timer, not giving us continuous time, -	 *     the counter does small "jumps" upwards on some Pentium systems, -	 *     (see c't 95/10 page 335 for Neptun bug.) 
-	 */ - -	if( jiffies_t == jiffies_p ) { -		if( count > count_p ) { -			/* the nutcase */ -			count = do_timer_overflow(count); -		} -	} else -		jiffies_p = jiffies_t; - -	count_p = count; - -	spin_unlock_irqrestore(&i8253_lock, flags); - -	count = ((LATCH-1) - count) * TICK_SIZE; -	count = (count + LATCH/2) / LATCH; - -	return count; -} - - -/* tsc timer_opts struct */ -struct timer_opts timer_pit = { -	.name = "pit", -	.mark_offset = mark_offset_pit,  -	.get_offset = get_offset_pit, -	.monotonic_clock = monotonic_clock_pit, -	.delay = delay_pit, -}; - -struct init_timer_opts __initdata timer_pit_init = { -	.init = init_pit,  -	.opts = &timer_pit, -}; - -void setup_pit_timer(void) -{ -	unsigned long flags; - -	spin_lock_irqsave(&i8253_lock, flags); -	outb_p(0x34,PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */ -	udelay(10); -	outb_p(LATCH & 0xff , PIT_CH0);	/* LSB */ -	udelay(10); -	outb(LATCH >> 8 , PIT_CH0);	/* MSB */ -	spin_unlock_irqrestore(&i8253_lock, flags); -} diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c deleted file mode 100644 index 144e94a04933..000000000000 --- a/arch/i386/kernel/timers/timer_pm.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * (C) Dominik Brodowski <linux@brodo.de> 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - */ - - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <asm/types.h> -#include <asm/timer.h> -#include <asm/smp.h> -#include <asm/io.h> -#include <asm/arch_hooks.h> - -#include <linux/timex.h> -#include "mach_timer.h" - -/* Number of PMTMR ticks expected during calibration run */ -#define PMTMR_TICKS_PER_SEC 3579545 -#define PMTMR_EXPECTED_RATE \ -  ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) - - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/acpi/boot.c */ -u32 pmtmr_ioport = 0; - - -/* value of the Power timer at last timer interrupt */ -static u32 offset_tick; -static u32 offset_delay; - -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static int pmtmr_need_workaround __read_mostly = 1; - -/*helper function to safely read acpi pm timesource*/ -static inline u32 read_pmtmr(void) -{ -	if (pmtmr_need_workaround) { -		u32 v1, v2, v3; - -		/* It has been reported that because of various broken -		 * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time -		 * source is not latched, so you must read it multiple -		 * times to insure a safe value is read. -		 */ -		do { -			v1 = inl(pmtmr_ioport); -			v2 = inl(pmtmr_ioport); -			v3 = inl(pmtmr_ioport); -		} while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) -			 || (v3 > v1 && v3 < v2)); - -		/* mask the output to 24 bits */ -		return v2 & ACPI_PM_MASK; -	} - -	return inl(pmtmr_ioport) & ACPI_PM_MASK; -} - - -/* - * Some boards have the PMTMR running way too fast. We check - * the PMTMR rate against PIT channel 2 to catch these cases. 
- */ -static int verify_pmtmr_rate(void) -{ -	u32 value1, value2; -	unsigned long count, delta; - -	mach_prepare_counter(); -	value1 = read_pmtmr(); -	mach_countup(&count); -	value2 = read_pmtmr(); -	delta = (value2 - value1) & ACPI_PM_MASK; - -	/* Check that the PMTMR delta is within 5% of what we expect */ -	if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 || -	    delta > (PMTMR_EXPECTED_RATE * 21) / 20) { -		printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE); -		return -1; -	} - -	return 0; -} - - -static int init_pmtmr(char* override) -{ -	u32 value1, value2; -	unsigned int i; - - 	if (override[0] && strncmp(override,"pmtmr",5)) -		return -ENODEV; - -	if (!pmtmr_ioport) -		return -ENODEV; - -	/* we use the TSC for delay_pmtmr, so make sure it exists */ -	if (!cpu_has_tsc) -		return -ENODEV; - -	/* "verify" this timing source */ -	value1 = read_pmtmr(); -	for (i = 0; i < 10000; i++) { -		value2 = read_pmtmr(); -		if (value2 == value1) -			continue; -		if (value2 > value1) -			goto pm_good; -		if ((value2 < value1) && ((value2) < 0xFFF)) -			goto pm_good; -		printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2); -		return -EINVAL; -	} -	printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1); -	return -ENODEV; - -pm_good: -	if (verify_pmtmr_rate() != 0) -		return -ENODEV; - -	init_cpu_khz(); -	return 0; -} - -static inline u32 cyc2us(u32 cycles) -{ -	/* The Power Management Timer ticks at 3.579545 ticks per microsecond. -	 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] -	 * -	 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can -	 * easily be multiplied with 286 (=0x11E) without having to fear -	 * u32 overflows. 
-	 */ -	cycles *= 286; -	return (cycles >> 10); -} - -/* - * this gets called during each timer interrupt - *   - Called while holding the writer xtime_lock - */ -static void mark_offset_pmtmr(void) -{ -	u32 lost, delta, last_offset; -	static int first_run = 1; -	last_offset = offset_tick; - -	write_seqlock(&monotonic_lock); - -	offset_tick = read_pmtmr(); - -	/* calculate tick interval */ -	delta = (offset_tick - last_offset) & ACPI_PM_MASK; - -	/* convert to usecs */ -	delta = cyc2us(delta); - -	/* update the monotonic base value */ -	monotonic_base += delta * NSEC_PER_USEC; -	write_sequnlock(&monotonic_lock); - -	/* convert to ticks */ -	delta += offset_delay; -	lost = delta / (USEC_PER_SEC / HZ); -	offset_delay = delta % (USEC_PER_SEC / HZ); - - -	/* compensate for lost ticks */ -	if (lost >= 2) -		jiffies_64 += lost - 1; - -	/* don't calculate delay for first run, -	   or if we've got less then a tick */ -	if (first_run || (lost < 1)) { -		first_run = 0; -		offset_delay = 0; -	} -} - -static int pmtmr_resume(void) -{ -	write_seqlock(&monotonic_lock); -	/* Assume this is the last mark offset time */ -	offset_tick = read_pmtmr(); -	write_sequnlock(&monotonic_lock); -	return 0; -} - -static unsigned long long monotonic_clock_pmtmr(void) -{ -	u32 last_offset, this_offset; -	unsigned long long base, ret; -	unsigned seq; - - -	/* atomically read monotonic base & last_offset */ -	do { -		seq = read_seqbegin(&monotonic_lock); -		last_offset = offset_tick; -		base = monotonic_base; -	} while (read_seqretry(&monotonic_lock, seq)); - -	/* Read the pmtmr */ -	this_offset =  read_pmtmr(); - -	/* convert to nanoseconds */ -	ret = (this_offset - last_offset) & ACPI_PM_MASK; -	ret = base + (cyc2us(ret) * NSEC_PER_USEC); -	return ret; -} - -static void delay_pmtmr(unsigned long loops) -{ -	unsigned long bclock, now; - -	rdtscl(bclock); -	do -	{ -		rep_nop(); -		rdtscl(now); -	} while ((now-bclock) < loops); -} - - -/* - * get the offset (in microseconds) from the last call to mark_offset() - *	- Called holding a reader xtime_lock - */ -static unsigned long get_offset_pmtmr(void) -{ -	u32 now, offset, delta = 0; - -	offset = offset_tick; -	now = read_pmtmr(); -	delta = (now - offset)&ACPI_PM_MASK; - -	return (unsigned long) offset_delay + cyc2us(delta); -} - - -/* acpi timer_opts struct */ -static struct timer_opts timer_pmtmr = { -	.name			= "pmtmr", -	.mark_offset		= mark_offset_pmtmr, -	.get_offset		= get_offset_pmtmr, -	.monotonic_clock 	= monotonic_clock_pmtmr, -	.delay 			= delay_pmtmr, -	.read_timer 		= read_timer_tsc, -	.resume			= pmtmr_resume, -}; - -struct init_timer_opts __initdata timer_pmtmr_init = { -	.init = init_pmtmr, -	.opts = &timer_pmtmr, -}; - -#ifdef CONFIG_PCI -/* - * PIIX4 Errata: - * - * The power management timer may return improper results when read. - * Although the timer value settles properly after incrementing, - * while incrementing there is a 3 ns window every 69.8 ns where the - * timer value is indeterminate (a 4.2% chance that the data will be - * incorrect when read). As a result, the ACPI free running count up - * timer specification is violated due to erroneous reads. - */ -static int __init pmtmr_bug_check(void) -{ -	static struct pci_device_id gray_list[] __initdata = { -		/* these chipsets may have bug. 
*/ -		{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, -				PCI_DEVICE_ID_INTEL_82801DB_0) }, -		{ }, -	}; -	struct pci_dev *dev; -	int pmtmr_has_bug = 0; -	u8 rev; - -	if (cur_timer != &timer_pmtmr || !pmtmr_need_workaround) -		return 0; - -	dev = pci_get_device(PCI_VENDOR_ID_INTEL, -			     PCI_DEVICE_ID_INTEL_82371AB_3, NULL); -	if (dev) { -		pci_read_config_byte(dev, PCI_REVISION_ID, &rev); -		/* the bug has been fixed in PIIX4M */ -		if (rev < 3) { -			printk(KERN_WARNING "* Found PM-Timer Bug on this " -				"chipset. Due to workarounds for a bug,\n" -				"* this time source is slow.  Consider trying " -				"other time sources (clock=)\n"); -			pmtmr_has_bug = 1; -		} -		pci_dev_put(dev); -	} - -	if (pci_dev_present(gray_list)) { -		printk(KERN_WARNING "* This chipset may have PM-Timer Bug.  Due" -			" to workarounds for a bug,\n" -			"* this time source is slow. If you are sure your timer" -			" does not have\n" -			"* this bug, please use \"pmtmr_good\" to disable the " -			"workaround\n"); -		pmtmr_has_bug = 1; -	} - -	if (!pmtmr_has_bug) -		pmtmr_need_workaround = 0; - -	return 0; -} -device_initcall(pmtmr_bug_check); -#endif - -static int __init pmtr_good_setup(char *__str) -{ -	pmtmr_need_workaround = 0; -	return 1; -} -__setup("pmtmr_good", pmtr_good_setup); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86"); diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c deleted file mode 100644 index f1187ddb0d0f..000000000000 --- a/arch/i386/kernel/timers/timer_tsc.c +++ /dev/null @@ -1,617 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - * - * 2004-06-25    Jesper Juhl - *      moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4 - *      failing to inline. 
- */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/cpufreq.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -/* processor.h for distable_tsc flag */ -#include <asm/processor.h> - -#include "io_ports.h" -#include "mach_timer.h" - -#include <asm/hpet.h> -#include <asm/i8253.h> - -#ifdef CONFIG_HPET_TIMER -static unsigned long hpet_usec_quotient; -static unsigned long hpet_last; -static struct timer_opts timer_tsc; -#endif - -static inline void cpufreq_delayed_get(void); - -int tsc_disable __devinitdata = 0; - -static int use_tsc; -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* Avoid compensating for lost ticks before TSCs are synched */ -static int detect_lost_ticks; -static int __init start_lost_tick_compensation(void) -{ -	detect_lost_ticks = 1; -	return 0; -} -late_initcall(start_lost_tick_compensation); - -/* convert from cycles(64bits) => nanoseconds (64bits) - *  basic equation: - *		ns = cycles / (freq / ns_per_sec) - *		ns = cycles * (ns_per_sec / freq) - *		ns = cycles * (10^9 / (cpu_khz * 10^3)) - *		ns = cycles * (10^6 / cpu_khz) - * - *	Then we use scaling math (suggested by george@mvista.com) to get: - *		ns = cycles * (10^6 * SC / cpu_khz) / SC - *		ns = cycles * cyc2ns_scale / SC - * - *	And since SC is a constant power of two, we can convert the div - *  into a shift. - * - *  We can use khz divisor instead of mhz to keep a better percision, since - *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - *  (mathieu.desnoyers@polymtl.ca) - * - *			-johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ -	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ -	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static int count2; /* counter for mark_offset_tsc() */ - -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. - */ -static unsigned long fast_gettimeoffset_quotient; - -static unsigned long get_offset_tsc(void) -{ -	register unsigned long eax, edx; - -	/* Read the Time Stamp Counter */ - -	rdtsc(eax,edx); - -	/* .. relative to previous jiffy (32 bits is enough) */ -	eax -= last_tsc_low;	/* tsc_low delta */ - -	/* -         * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient -         *             = (tsc_low delta) * (usecs_per_clock) -         *             = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) -	 * -	 * Using a mull instead of a divl saves up to 31 clock cycles -	 * in the critical path. 
-         */ - -	__asm__("mull %2" -		:"=a" (eax), "=d" (edx) -		:"rm" (fast_gettimeoffset_quotient), -		 "0" (eax)); - -	/* our adjusted time offset in microseconds */ -	return delay_at_last_interrupt + edx; -} - -static unsigned long long monotonic_clock_tsc(void) -{ -	unsigned long long last_offset, this_offset, base; -	unsigned seq; -	 -	/* atomically read monotonic base & last_offset */ -	do { -		seq = read_seqbegin(&monotonic_lock); -		last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; -		base = monotonic_base; -	} while (read_seqretry(&monotonic_lock, seq)); - -	/* Read the Time Stamp Counter */ -	rdtscll(this_offset); - -	/* return the value in ns */ -	return base + cycles_2_ns(this_offset - last_offset); -} - -/* - * Scheduler clock - returns current time in nanosec units. - */ -unsigned long long sched_clock(void) -{ -	unsigned long long this_offset; - -	/* -	 * In the NUMA case we dont use the TSC as they are not -	 * synchronized across all CPUs. -	 */ -#ifndef CONFIG_NUMA -	if (!use_tsc) -#endif -		/* no locking but a rare wrong value is not a big deal */ -		return jiffies_64 * (1000000000 / HZ); - -	/* Read the Time Stamp Counter */ -	rdtscll(this_offset); - -	/* return the value in ns */ -	return cycles_2_ns(this_offset); -} - -static void delay_tsc(unsigned long loops) -{ -	unsigned long bclock, now; -	 -	rdtscl(bclock); -	do -	{ -		rep_nop(); -		rdtscl(now); -	} while ((now-bclock) < loops); -} - -#ifdef CONFIG_HPET_TIMER -static void mark_offset_tsc_hpet(void) -{ -	unsigned long long this_offset, last_offset; - 	unsigned long offset, temp, hpet_current; - -	write_seqlock(&monotonic_lock); -	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; -	/* -	 * It is important that these two operations happen almost at -	 * the same time. We do the RDTSC stuff first, since it's -	 * faster. To avoid any inconsistencies, we need interrupts -	 * disabled locally. -	 */ -	/* -	 * Interrupts are just disabled locally since the timer irq -	 * has the SA_INTERRUPT flag set. 
-arca -	 */ -	/* read Pentium cycle counter */ - -	hpet_current = hpet_readl(HPET_COUNTER); -	rdtsc(last_tsc_low, last_tsc_high); - -	/* lost tick compensation */ -	offset = hpet_readl(HPET_T0_CMP) - hpet_tick; -	if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0)) -					&& detect_lost_ticks) { -		int lost_ticks = (offset - hpet_last) / hpet_tick; -		jiffies_64 += lost_ticks; -	} -	hpet_last = hpet_current; - -	/* update the monotonic base value */ -	this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; -	monotonic_base += cycles_2_ns(this_offset - last_offset); -	write_sequnlock(&monotonic_lock); - -	/* calculate delay_at_last_interrupt */ -	/* -	 * Time offset = (hpet delta) * ( usecs per HPET clock ) -	 *             = (hpet delta) * ( usecs per tick / HPET clocks per tick) -	 *             = (hpet delta) * ( hpet_usec_quotient ) / (2^32) -	 * Where, -	 * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick -	 */ -	delay_at_last_interrupt = hpet_current - offset; -	ASM_MUL64_REG(temp, delay_at_last_interrupt, -			hpet_usec_quotient, delay_at_last_interrupt); -} -#endif - - -#ifdef CONFIG_CPU_FREQ -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ -	unsigned int cpu; -	for_each_online_cpu(cpu) { -		cpufreq_get(cpu); -	} -	cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void)  -{ -	if (cpufreq_init && !cpufreq_delayed_issched) { -		cpufreq_delayed_issched = 1; -		printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); -		schedule_work(&cpufreq_delayed_get_work); -	} -} - -/* If the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. 
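Everything the notifier below touches scales linearly with the core clock, which is why one helper suffices: cpufreq_scale() multiplies by new_freq/ref_freq. A standalone sketch with invented numbers (the loops_per_jiffy value is arbitrary):

```c
#include <stdio.h>

/* Proportional rescale, as cpufreq_scale() does in the kernel. */
static unsigned long scale(unsigned long old, unsigned int ref_khz,
			   unsigned int new_khz)
{
	return (unsigned long)(((unsigned long long)old * new_khz) / ref_khz);
}

int main(void)
{
	/* hypothetical: calibrated at 1200 MHz, now throttled to 600 MHz */
	unsigned long lpj = scale(2396160, 1200000, 600000);

	printf("loops_per_jiffy: %lu\n", lpj);	/* half the reference value */
	return 0;
}
```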
- */ - -static unsigned int  ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -#ifndef CONFIG_SMP -static unsigned long fast_gettimeoffset_ref = 0; -static unsigned int cpu_khz_ref = 0; -#endif - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, -		       void *data) -{ -	struct cpufreq_freqs *freq = data; - -	if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) -		write_seqlock_irq(&xtime_lock); -	if (!ref_freq) { -		if (!freq->old){ -			ref_freq = freq->new; -			goto end; -		} -		ref_freq = freq->old; -		loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; -#ifndef CONFIG_SMP -		fast_gettimeoffset_ref = fast_gettimeoffset_quotient; -		cpu_khz_ref = cpu_khz; -#endif -	} - -	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) || -	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || -	    (val == CPUFREQ_RESUMECHANGE)) { -		if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -			cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); -#ifndef CONFIG_SMP -		if (cpu_khz) -			cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); -		if (use_tsc) { -			if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { -				fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); -				set_cyc2ns_scale(cpu_khz); -			} -		} -#endif -	} - -end: -	if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) -		write_sequnlock_irq(&xtime_lock); - -	return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { -	.notifier_call	= time_cpufreq_notifier -}; - - -static int __init cpufreq_tsc(void) -{ -	int ret; -	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); -	ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, -					CPUFREQ_TRANSITION_NOTIFIER); -	if (!ret) -		cpufreq_init = 1; -	return ret; -} -core_initcall(cpufreq_tsc); - -#else /* CONFIG_CPU_FREQ */ -static inline void cpufreq_delayed_get(void) { return; } -#endif  - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP -	unsigned int cpu_khz_old = cpu_khz; - -	if (cpu_has_tsc) { -		local_irq_disable(); -		init_cpu_khz(); -		local_irq_enable(); -		cpu_data[0].loops_per_jiffy = -		    cpufreq_scale(cpu_data[0].loops_per_jiffy, -			          cpu_khz_old, -				  cpu_khz); -		return 0; -	} else -		return -ENODEV; -#else -	return -ENODEV; -#endif -} -EXPORT_SYMBOL(recalibrate_cpu_khz); - -static void mark_offset_tsc(void) -{ -	unsigned long lost,delay; -	unsigned long delta = last_tsc_low; -	int count; -	int countmp; -	static int count1 = 0; -	unsigned long long this_offset, last_offset; -	static int lost_count = 0; - -	write_seqlock(&monotonic_lock); -	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; -	/* -	 * It is important that these two operations happen almost at -	 * the same time. We do the RDTSC stuff first, since it's -	 * faster. To avoid any inconsistencies, we need interrupts -	 * disabled locally. -	 */ - -	/* -	 * Interrupts are just disabled locally since the timer irq -	 * has the SA_INTERRUPT flag set. -arca -	 */ - -	/* read Pentium cycle counter */ - -	rdtsc(last_tsc_low, last_tsc_high); - -	spin_lock(&i8253_lock); -	outb_p(0x00, PIT_MODE);     /* latch the count ASAP */ - -	count = inb_p(PIT_CH0);    /* read the latched count */ -	count |= inb(PIT_CH0) << 8; - -	/* -	 * VIA686a test code... 
reset the latch if count > max + 1 -	 * from timer_pit.c - cjb -	 */ -	if (count > LATCH) { -		outb_p(0x34, PIT_MODE); -		outb_p(LATCH & 0xff, PIT_CH0); -		outb(LATCH >> 8, PIT_CH0); -		count = LATCH - 1; -	} - -	spin_unlock(&i8253_lock); - -	if (pit_latch_buggy) { -		/* get center value of last 3 time lutch */ -		if ((count2 >= count && count >= count1) -		    || (count1 >= count && count >= count2)) { -			count2 = count1; count1 = count; -		} else if ((count1 >= count2 && count2 >= count) -			   || (count >= count2 && count2 >= count1)) { -			countmp = count;count = count2; -			count2 = count1;count1 = countmp; -		} else { -			count2 = count1; count1 = count; count = count1; -		} -	} - -	/* lost tick compensation */ -	delta = last_tsc_low - delta; -	{ -		register unsigned long eax, edx; -		eax = delta; -		__asm__("mull %2" -		:"=a" (eax), "=d" (edx) -		:"rm" (fast_gettimeoffset_quotient), -		 "0" (eax)); -		delta = edx; -	} -	delta += delay_at_last_interrupt; -	lost = delta/(1000000/HZ); -	delay = delta%(1000000/HZ); -	if (lost >= 2 && detect_lost_ticks) { -		jiffies_64 += lost-1; - -		/* sanity check to ensure we're not always losing ticks */ -		if (lost_count++ > 100) { -			printk(KERN_WARNING "Losing too many ticks!\n"); -			printk(KERN_WARNING "TSC cannot be used as a timesource.  \n"); -			printk(KERN_WARNING "Possible reasons for this are:\n"); -			printk(KERN_WARNING "  You're running with Speedstep,\n"); -			printk(KERN_WARNING "  You don't have DMA enabled for your hard disk (see hdparm),\n"); -			printk(KERN_WARNING "  Incorrect TSC synchronization on an SMP system (see dmesg).\n"); -			printk(KERN_WARNING "Falling back to a sane timesource now.\n"); - -			clock_fallback(); -		} -		/* ... but give the TSC a fair chance */ -		if (lost_count > 25) -			cpufreq_delayed_get(); -	} else -		lost_count = 0; -	/* update the monotonic base value */ -	this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; -	monotonic_base += cycles_2_ns(this_offset - last_offset); -	write_sequnlock(&monotonic_lock); - -	/* calculate delay_at_last_interrupt */ -	count = ((LATCH-1) - count) * TICK_SIZE; -	delay_at_last_interrupt = (count + LATCH/2) / LATCH; - -	/* catch corner case where tick rollover occured -	 * between tsc and pit reads (as noted when -	 * usec delta is > 90% # of usecs/tick) -	 */ -	if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) -		jiffies_64++; -} - -static int __init init_tsc(char* override) -{ - -	/* check clock override */ -	if (override[0] && strncmp(override,"tsc",3)) { -#ifdef CONFIG_HPET_TIMER -		if (is_hpet_enabled()) { -			printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n"); -		} else -#endif -		{ -			return -ENODEV; -		} -	} - -	/* -	 * If we have APM enabled or the CPU clock speed is variable -	 * (CPU stops clock on HLT or slows clock to save power) -	 * then the TSC timestamps may diverge by up to 1 jiffy from -	 * 'real time' but nothing will break. -	 * The most frequent case is that the CPU is "woken" from a halt -	 * state by the timer interrupt itself, so we get 0 error. In the -	 * rare cases where a driver would "wake" the CPU and request a -	 * timestamp, the maximum error is < 1 jiffy. But timestamps are -	 * still perfectly ordered. -	 * Note that the TSC counter will be reset if APM suspends -	 * to disk; this won't break the kernel, though, 'cuz we're -	 * smart.  See arch/i386/kernel/apm.c. -	 */ - 	/* - 	 *	Firstly we have to do a CPU check for chips with - 	 * 	a potentially buggy TSC. 
At this point we haven't run - 	 *	the ident/bugs checks so we must run this hook as it - 	 *	may turn off the TSC flag. - 	 * - 	 *	NOTE: this doesn't yet handle SMP 486 machines where only - 	 *	some CPU's have a TSC. Thats never worked and nobody has - 	 *	moaned if you have the only one in the world - you fix it! - 	 */ - -	count2 = LATCH; /* initialize counter for mark_offset_tsc() */ - -	if (cpu_has_tsc) { -		unsigned long tsc_quotient; -#ifdef CONFIG_HPET_TIMER -		if (is_hpet_enabled() && hpet_use_timer) { -			unsigned long result, remain; -			printk("Using TSC for gettimeofday\n"); -			tsc_quotient = calibrate_tsc_hpet(NULL); -			timer_tsc.mark_offset = &mark_offset_tsc_hpet; -			/* -			 * Math to calculate hpet to usec multiplier -			 * Look for the comments at get_offset_tsc_hpet() -			 */ -			ASM_DIV64_REG(result, remain, hpet_tick, -					0, KERNEL_TICK_USEC); -			if (remain > (hpet_tick >> 1)) -				result++; /* rounding the result */ - -			hpet_usec_quotient = result; -		} else -#endif -		{ -			tsc_quotient = calibrate_tsc(); -		} - -		if (tsc_quotient) { -			fast_gettimeoffset_quotient = tsc_quotient; -			use_tsc = 1; -			/* -			 *	We could be more selective here I suspect -			 *	and just enable this for the next intel chips ? -			 */ -			/* report CPU clock rate in Hz. -			 * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = -			 * clock/second. Our precision is about 100 ppm. -			 */ -			{	unsigned long eax=0, edx=1000; -				__asm__("divl %2" -		       		:"=a" (cpu_khz), "=d" (edx) -        	       		:"r" (tsc_quotient), -	                	"0" (eax), "1" (edx)); -				printk("Detected %u.%03u MHz processor.\n", -					cpu_khz / 1000, cpu_khz % 1000); -			} -			set_cyc2ns_scale(cpu_khz); -			return 0; -		} -	} -	return -ENODEV; -} - -static int tsc_resume(void) -{ -	write_seqlock(&monotonic_lock); -	/* Assume this is the last mark offset time */ -	rdtsc(last_tsc_low, last_tsc_high); -#ifdef CONFIG_HPET_TIMER -	if (is_hpet_enabled() && hpet_use_timer) -		hpet_last = hpet_readl(HPET_COUNTER); -#endif -	write_sequnlock(&monotonic_lock); -	return 0; -} - -#ifndef CONFIG_X86_TSC -/* disable flag for tsc.  
Takes effect by clearing the TSC cpu flag - * in cpu/common.c */ -static int __init tsc_setup(char *str) -{ -	tsc_disable = 1; -	return 1; -} -#else -static int __init tsc_setup(char *str) -{ -	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " -				"cannot disable TSC.\n"); -	return 1; -} -#endif -__setup("notsc", tsc_setup); - - - -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_tsc = { -	.name = "tsc", -	.mark_offset = mark_offset_tsc,  -	.get_offset = get_offset_tsc, -	.monotonic_clock = monotonic_clock_tsc, -	.delay = delay_tsc, -	.read_timer = read_timer_tsc, -	.resume	= tsc_resume, -}; - -struct init_timer_opts __initdata timer_tsc_init = { -	.init = init_tsc, -	.opts = &timer_tsc, -}; diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index dcc14477af1f..78464097470a 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -28,6 +28,7 @@  #include <linux/utsname.h>  #include <linux/kprobes.h>  #include <linux/kexec.h> +#include <linux/unwind.h>  #ifdef CONFIG_EISA  #include <linux/ioport.h> @@ -47,7 +48,7 @@  #include <asm/desc.h>  #include <asm/i387.h>  #include <asm/nmi.h> - +#include <asm/unwind.h>  #include <asm/smp.h>  #include <asm/arch_hooks.h>  #include <asm/kdebug.h> @@ -92,6 +93,7 @@ asmlinkage void spurious_interrupt_bug(void);  asmlinkage void machine_check(void);  static int kstack_depth_to_print = 24; +static int call_trace = 1;  ATOMIC_NOTIFIER_HEAD(i386die_chain);  int register_die_notifier(struct notifier_block *nb) @@ -170,7 +172,23 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,  	return ebp;  } -static void show_trace_log_lvl(struct task_struct *task, +static asmlinkage int show_trace_unwind(struct unwind_frame_info *info, void *log_lvl) +{ +	int n = 0; +	int printed = 0; /* nr of entries already printed on current line */ + +	while (unwind(info) == 0 && UNW_PC(info)) { +		++n; +		printed = print_addr_and_symbol(UNW_PC(info), log_lvl, printed); +		if (arch_unw_user_mode(info)) +			break; +	} +	if (printed) +		printk("\n"); +	return n; +} + +static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,  			       unsigned long *stack, char *log_lvl)  {  	unsigned long ebp; @@ -178,6 +196,26 @@ static void show_trace_log_lvl(struct task_struct *task,  	if (!task)  		task = current; +	if (call_trace >= 0) { +		int unw_ret = 0; +		struct unwind_frame_info info; + +		if (regs) { +			if (unwind_init_frame_info(&info, task, regs) == 0) +				unw_ret = show_trace_unwind(&info, log_lvl); +		} else if (task == current) +			unw_ret = unwind_init_running(&info, show_trace_unwind, log_lvl); +		else { +			if (unwind_init_blocked(&info, task) == 0) +				unw_ret = show_trace_unwind(&info, log_lvl); +		} +		if (unw_ret > 0) { +			if (call_trace > 0) +				return; +			printk("%sLegacy call trace:\n", log_lvl); +		} +	} +  	if (task == current) {  		/* Grab ebp right from our regs */  		asm ("movl %%ebp, %0" : "=r" (ebp) : ); @@ -198,13 +236,13 @@ static void show_trace_log_lvl(struct task_struct *task,  	}  } -void show_trace(struct task_struct *task, unsigned long * stack) +void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack)  { -	show_trace_log_lvl(task, stack, ""); +	show_trace_log_lvl(task, regs, stack, "");  } -static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp, -			       char *log_lvl) +static void show_stack_log_lvl(struct task_struct *task, struct 
pt_regs *regs, +			       unsigned long *esp, char *log_lvl)  {  	unsigned long *stack;  	int i; @@ -225,13 +263,13 @@ static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,  		printk("%08lx ", *stack++);  	}  	printk("\n%sCall Trace:\n", log_lvl); -	show_trace_log_lvl(task, esp, log_lvl); +	show_trace_log_lvl(task, regs, esp, log_lvl);  }  void show_stack(struct task_struct *task, unsigned long *esp)  {  	printk("       "); -	show_stack_log_lvl(task, esp, ""); +	show_stack_log_lvl(task, NULL, esp, "");  }  /* @@ -241,7 +279,7 @@ void dump_stack(void)  {  	unsigned long stack; -	show_trace(current, &stack); +	show_trace(current, NULL, &stack);  }  EXPORT_SYMBOL(dump_stack); @@ -285,7 +323,7 @@ void show_registers(struct pt_regs *regs)  		u8 __user *eip;  		printk("\n" KERN_EMERG "Stack: "); -		show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG); +		show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);  		printk(KERN_EMERG "Code: "); @@ -1215,3 +1253,15 @@ static int __init kstack_setup(char *s)  	return 1;  }  __setup("kstack=", kstack_setup); + +static int __init call_trace_setup(char *s) +{ +	if (strcmp(s, "old") == 0) +		call_trace = -1; +	else if (strcmp(s, "both") == 0) +		call_trace = 0; +	else if (strcmp(s, "new") == 0) +		call_trace = 1; +	return 1; +} +__setup("call_trace=", call_trace_setup); diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c new file mode 100644 index 000000000000..7e0d8dab2075 --- /dev/null +++ b/arch/i386/kernel/tsc.c @@ -0,0 +1,478 @@ +/* + * This code largely moved from arch/i386/kernel/timer/timer_tsc.c + * which was originally moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + */ + +#include <linux/clocksource.h> +#include <linux/workqueue.h> +#include <linux/cpufreq.h> +#include <linux/jiffies.h> +#include <linux/init.h> +#include <linux/dmi.h> + +#include <asm/delay.h> +#include <asm/tsc.h> +#include <asm/delay.h> +#include <asm/io.h> + +#include "mach_timer.h" + +/* + * On some systems the TSC frequency does not + * change with the cpu frequency. So we need + * an extra value to store the TSC freq + */ +unsigned int tsc_khz; + +int tsc_disable __cpuinitdata = 0; + +#ifdef CONFIG_X86_TSC +static int __init tsc_setup(char *str) +{ +	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " +				"cannot disable TSC.\n"); +	return 1; +} +#else +/* + * disable flag for tsc. Takes effect by clearing the TSC cpu flag + * in cpu/common.c + */ +static int __init tsc_setup(char *str) +{ +	tsc_disable = 1; + +	return 1; +} +#endif + +__setup("notsc", tsc_setup); + +/* + * code to mark and check if the TSC is unstable + * due to cpufreq or due to unsynced TSCs + */ +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ +	return tsc_unstable; +} + +void mark_tsc_unstable(void) +{ +	tsc_unstable = 1; +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +/* Accellerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + *  basic equation: + *		ns = cycles / (freq / ns_per_sec) + *		ns = cycles * (ns_per_sec / freq) + *		ns = cycles * (10^9 / (cpu_khz * 10^3)) + *		ns = cycles * (10^6 / cpu_khz) + * + *	Then we use scaling math (suggested by george@mvista.com) to get: + *		ns = cycles * (10^6 * SC / cpu_khz) / SC + *		ns = cycles * cyc2ns_scale / SC + * + *	And since SC is a constant power of two, we can convert the div + *  into a shift. 
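As a worked instance of the comment above, here is the same fixed-point conversion as a runnable sketch; the 2 GHz frequency is an assumption for illustration, everything else mirrors the kernel code:

```c
#include <stdio.h>
#include <stdint.h>

#define CYC2NS_SCALE_FACTOR 10	/* 2^10, as in the kernel source */

static uint32_t cyc2ns_scale;

/* scale = 10^6 * 2^10 / cpu_khz, so ns = cycles * scale >> 10 */
static void set_cyc2ns_scale(unsigned long cpu_khz)
{
	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR) / cpu_khz;
}

static uint64_t cycles_2_ns(uint64_t cyc)
{
	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}

int main(void)
{
	set_cyc2ns_scale(2000000);	/* assumed 2 GHz clock */

	/* scale = (10^6 << 10) / (2 * 10^6) = 512: each cycle is 0.5 ns */
	printf("%llu ns\n", (unsigned long long)cycles_2_ns(4000));
	return 0;
}
```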
+ * + *  We can use khz divisor instead of mhz to keep a better percision, since + *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + *  (mathieu.desnoyers@polymtl.ca) + * + *			-johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static unsigned long cyc2ns_scale __read_mostly; + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_khz) +{ +	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ +	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ +	unsigned long long this_offset; + +	/* +	 * in the NUMA case we dont use the TSC as they are not +	 * synchronized across all CPUs. +	 */ +#ifndef CONFIG_NUMA +	if (!cpu_khz || check_tsc_unstable()) +#endif +		/* no locking but a rare wrong value is not a big deal */ +		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + +	/* read the Time Stamp Counter: */ +	rdtscll(this_offset); + +	/* return the value in ns */ +	return cycles_2_ns(this_offset); +} + +static unsigned long calculate_cpu_khz(void) +{ +	unsigned long long start, end; +	unsigned long count; +	u64 delta64; +	int i; +	unsigned long flags; + +	local_irq_save(flags); + +	/* run 3 times to ensure the cache is warm */ +	for (i = 0; i < 3; i++) { +		mach_prepare_counter(); +		rdtscll(start); +		mach_countup(&count); +		rdtscll(end); +	} +	/* +	 * Error: ECTCNEVERSET +	 * The CTC wasn't reliable: we got a hit on the very first read, +	 * or the CPU was so fast/slow that the quotient wouldn't fit in +	 * 32 bits.. +	 */ +	if (count <= 1) +		goto err; + +	delta64 = end - start; + +	/* cpu freq too fast: */ +	if (delta64 > (1ULL<<32)) +		goto err; + +	/* cpu freq too slow: */ +	if (delta64 <= CALIBRATE_TIME_MSEC) +		goto err; + +	delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ +	do_div(delta64,CALIBRATE_TIME_MSEC); + +	local_irq_restore(flags); +	return (unsigned long)delta64; +err: +	local_irq_restore(flags); +	return 0; +} + +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP +	unsigned long cpu_khz_old = cpu_khz; + +	if (cpu_has_tsc) { +		cpu_khz = calculate_cpu_khz(); +		tsc_khz = cpu_khz; +		cpu_data[0].loops_per_jiffy = +			cpufreq_scale(cpu_data[0].loops_per_jiffy, +					cpu_khz_old, cpu_khz); +		return 0; +	} else +		return -ENODEV; +#else +	return -ENODEV; +#endif +} + +EXPORT_SYMBOL(recalibrate_cpu_khz); + +void tsc_init(void) +{ +	if (!cpu_has_tsc || tsc_disable) +		return; + +	cpu_khz = calculate_cpu_khz(); +	tsc_khz = cpu_khz; + +	if (!cpu_khz) +		return; + +	printk("Detected %lu.%03lu MHz processor.\n", +				(unsigned long)cpu_khz / 1000, +				(unsigned long)cpu_khz % 1000); + +	set_cyc2ns_scale(cpu_khz); +	use_tsc_delay(); +} + +#ifdef CONFIG_CPU_FREQ + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(void *v) +{ +	unsigned int cpu; + +	for_each_online_cpu(cpu) +		cpufreq_get(cpu); + +	cpufreq_delayed_issched = 0; +} + +/* + * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries + * to verify the CPU frequency the timing core thinks the CPU is running + * at is still correct. 
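The function that follows cannot call cpufreq_get() directly from the timing code, so it latches a one-shot flag and defers the check to a workqueue; concurrent triggers collapse into a single queued item. A userspace analogue of the pattern, where a stored function pointer stands in for schedule_work()'s deferral:

```c
#include <stdio.h>

static int init_done = 1;	/* set once registration succeeded */
static int issched;		/* a check is already queued */
static void (*pending)(void);	/* stand-in for the work queue */

static void queue_work(void (*fn)(void))
{
	pending = fn;		/* schedule_work() in the kernel */
}

static void do_check(void)
{
	printf("re-reading CPU frequency\n");
	issched = 0;		/* allow the next trigger to re-queue */
}

static void delayed_check(void)
{
	if (init_done && !issched) {
		issched = 1;	/* collapse repeated triggers */
		queue_work(do_check);
	}
}

int main(void)
{
	delayed_check();
	delayed_check();	/* suppressed: already queued */
	if (pending)
		pending();	/* the "worker thread" runs the check */
	return 0;
}
```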
+ */ +static inline void cpufreq_delayed_get(void) +{ +	if (cpufreq_init && !cpufreq_delayed_issched) { +		cpufreq_delayed_issched = 1; +		printk(KERN_DEBUG "Checking if CPU frequency changed.\n"); +		schedule_work(&cpufreq_delayed_get_work); +	} +} + +/* + * if the CPU frequency is scaled, TSC-based delays will need a different + * loops_per_jiffy value to function properly. + */ +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; +static unsigned long cpu_khz_ref = 0; + +static int +time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) +{ +	struct cpufreq_freqs *freq = data; + +	if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) +		write_seqlock_irq(&xtime_lock); + +	if (!ref_freq) { +		if (!freq->old){ +			ref_freq = freq->new; +			goto end; +		} +		ref_freq = freq->old; +		loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; +		cpu_khz_ref = cpu_khz; +	} + +	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) || +	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || +	    (val == CPUFREQ_RESUMECHANGE)) { +		if (!(freq->flags & CPUFREQ_CONST_LOOPS)) +			cpu_data[freq->cpu].loops_per_jiffy = +				cpufreq_scale(loops_per_jiffy_ref, +						ref_freq, freq->new); + +		if (cpu_khz) { + +			if (num_online_cpus() == 1) +				cpu_khz = cpufreq_scale(cpu_khz_ref, +						ref_freq, freq->new); +			if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { +				tsc_khz = cpu_khz; +				set_cyc2ns_scale(cpu_khz); +				/* +				 * TSC based sched_clock turns +				 * to junk w/ cpufreq +				 */ +				mark_tsc_unstable(); +			} +		} +	} +end: +	if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) +		write_sequnlock_irq(&xtime_lock); + +	return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { +	.notifier_call	= time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ +	int ret; + +	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); +	ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, +					CPUFREQ_TRANSITION_NOTIFIER); +	if (!ret) +		cpufreq_init = 1; + +	return ret; +} + +core_initcall(cpufreq_tsc); + +#endif + +/* clock source code */ + +static unsigned long current_tsc_khz = 0; +static int tsc_update_callback(void); + +static cycle_t read_tsc(void) +{ +	cycle_t ret; + +	rdtscll(ret); + +	return ret; +} + +static struct clocksource clocksource_tsc = { +	.name			= "tsc", +	.rating			= 300, +	.read			= read_tsc, +	.mask			= CLOCKSOURCE_MASK(64), +	.mult			= 0, /* to be set */ +	.shift			= 22, +	.update_callback	= tsc_update_callback, +	.is_continuous		= 1, +}; + +static int tsc_update_callback(void) +{ +	int change = 0; + +	/* check to see if we should switch to the safe clocksource: */ +	if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { +		clocksource_tsc.rating = 50; +		clocksource_reselect(); +		change = 1; +	} + +	/* only update if tsc_khz has changed: */ +	if (current_tsc_khz != tsc_khz) { +		current_tsc_khz = tsc_khz; +		clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, +							clocksource_tsc.shift); +		change = 1; +	} + +	return change; +} + +static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) +{ +	printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", +		       d->ident); +	mark_tsc_unstable(); +	return 0; +} + +/* List of systems that have known TSC problems */ +static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { +	{ +	 .callback = dmi_mark_tsc_unstable, +	 .ident = "IBM Thinkpad 380XD", +	 .matches = { 
+		     DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), +		     DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), +		     }, +	 }, +	 {} +}; + +#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */ +static struct timer_list verify_tsc_freq_timer; + +/* XXX - Probably should add locking */ +static void verify_tsc_freq(unsigned long unused) +{ +	static u64 last_tsc; +	static unsigned long last_jiffies; + +	u64 now_tsc, interval_tsc; +	unsigned long now_jiffies, interval_jiffies; + + +	if (check_tsc_unstable()) +		return; + +	rdtscll(now_tsc); +	now_jiffies = jiffies; + +	if (!last_jiffies) { +		goto out; +	} + +	interval_jiffies = now_jiffies - last_jiffies; +	interval_tsc = now_tsc - last_tsc; +	interval_tsc *= HZ; +	do_div(interval_tsc, cpu_khz*1000); + +	if (interval_tsc < (interval_jiffies * 3 / 4)) { +		printk("TSC appears to be running slowly. " +			"Marking it as unstable\n"); +		mark_tsc_unstable(); +		return; +	} + +out: +	last_tsc = now_tsc; +	last_jiffies = now_jiffies; +	/* set us up to go off on the next interval: */ +	mod_timer(&verify_tsc_freq_timer, +		jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL)); +} + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +static __init int unsynchronized_tsc(void) +{ +	/* +	 * Intel systems are normally all synchronized. +	 * Exceptions must mark TSC as unstable: +	 */ +	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + 		return 0; + +	/* assume multi socket systems are not synchronized: */ + 	return num_possible_cpus() > 1; +} + +static int __init init_tsc_clocksource(void) +{ + +	if (cpu_has_tsc && tsc_khz && !tsc_disable) { +		/* check blacklist */ +		dmi_check_system(bad_tsc_dmi_table); + +		if (unsynchronized_tsc()) /* mark unstable if unsynced */ +			mark_tsc_unstable(); +		current_tsc_khz = tsc_khz; +		clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, +							clocksource_tsc.shift); +		/* lower the rating if we already know its unstable: */ +		if (check_tsc_unstable()) +			clocksource_tsc.rating = 50; + +		init_timer(&verify_tsc_freq_timer); +		verify_tsc_freq_timer.function = verify_tsc_freq; +		verify_tsc_freq_timer.expires = +			jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL); +		add_timer(&verify_tsc_freq_timer); + +		return clocksource_register(&clocksource_tsc); +	} + +	return 0; +} + +module_init(init_tsc_clocksource); diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 7512f39c9f25..2d4f1386e2b1 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -71,6 +71,15 @@ SECTIONS    .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) }    _edata = .;			/* End of data section */ +#ifdef CONFIG_STACK_UNWIND +  . = ALIGN(4); +  .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { +	__start_unwind = .; +  	*(.eh_frame) +	__end_unwind = .; +  } +#endif +    . = ALIGN(THREAD_SIZE);	/* init_task */    .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {  	*(.data.init_task) diff --git a/arch/i386/lib/delay.c b/arch/i386/lib/delay.c index c49a6acbee56..3c0714c4b669 100644 --- a/arch/i386/lib/delay.c +++ b/arch/i386/lib/delay.c @@ -10,43 +10,92 @@   *	we have to worry about.   
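For the clocksource registered above, the mult/shift pair is what the timekeeping core uses to turn cycle deltas into nanoseconds: ns = (cycles * mult) >> shift. A worked sketch of the clocksource_khz2mult() arithmetic with an assumed 2 GHz TSC (the real helper also rounds the quotient):

```c
#include <stdio.h>
#include <stdint.h>

/* mult = (10^6 << shift) / khz, so (cycles * mult) >> shift yields ns */
static uint32_t khz2mult(uint32_t khz, uint32_t shift)
{
	return (uint32_t)((((uint64_t)1000000) << shift) / khz);
}

int main(void)
{
	uint32_t shift = 22;				/* as in clocksource_tsc */
	uint32_t mult = khz2mult(2000000, shift);	/* assumed 2 GHz TSC */
	uint64_t ns = ((uint64_t)4000 * mult) >> shift;

	/* 4000 cycles at 2 GHz is 2000 ns */
	printf("mult=%u -> %llu ns\n", mult, (unsigned long long)ns);
	return 0;
}
```

Raising the shift buys precision but makes the 64-bit product overflow for larger cycle deltas, which is why each clocksource picks its own value.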
*/ +#include <linux/module.h>  #include <linux/config.h>  #include <linux/sched.h>  #include <linux/delay.h> -#include <linux/module.h> +  #include <asm/processor.h>  #include <asm/delay.h>  #include <asm/timer.h>  #ifdef CONFIG_SMP -#include <asm/smp.h> +# include <asm/smp.h>  #endif -extern struct timer_opts* timer; +/* simple loop based delay: */ +static void delay_loop(unsigned long loops) +{ +	int d0; + +	__asm__ __volatile__( +		"\tjmp 1f\n" +		".align 16\n" +		"1:\tjmp 2f\n" +		".align 16\n" +		"2:\tdecl %0\n\tjns 2b" +		:"=&a" (d0) +		:"0" (loops)); +} + +/* TSC based delay: */ +static void delay_tsc(unsigned long loops) +{ +	unsigned long bclock, now; + +	rdtscl(bclock); +	do { +		rep_nop(); +		rdtscl(now); +	} while ((now-bclock) < loops); +} + +/* + * Since we calibrate only once at boot, this + * function should be set once at boot and not changed + */ +static void (*delay_fn)(unsigned long) = delay_loop; + +void use_tsc_delay(void) +{ +	delay_fn = delay_tsc; +} + +int read_current_timer(unsigned long *timer_val) +{ +	if (delay_fn == delay_tsc) { +		rdtscl(*timer_val); +		return 0; +	} +	return -1; +}  void __delay(unsigned long loops)  { -	cur_timer->delay(loops); +	delay_fn(loops);  }  inline void __const_udelay(unsigned long xloops)  {  	int d0; +  	xloops *= 4;  	__asm__("mull %0"  		:"=d" (xloops), "=&a" (d0) -		:"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); -        __delay(++xloops); +		:"1" (xloops), "0" +		(cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4))); + +	__delay(++xloops);  }  void __udelay(unsigned long usecs)  { -	__const_udelay(usecs * 0x000010c7);  /* 2**32 / 1000000 (rounded up) */ +	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */  }  void __ndelay(unsigned long nsecs)  { -	__const_udelay(nsecs * 0x00005);  /* 2**32 / 1000000000 (rounded up) */ +	__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */  }  EXPORT_SYMBOL(__delay); diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index bd6fe96cc16d..6ee7faaf2c1b 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -30,6 +30,40 @@  extern void die(const char *,struct pt_regs *,long); +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); +int register_page_fault_notifier(struct notifier_block *nb) +{ +	vmalloc_sync_all(); +	return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ +	return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, +			struct pt_regs *regs, long err, int trap, int sig) +{ +	struct die_args args = { +		.regs = regs, +		.str = str, +		.err = err, +		.trapnr = trap, +		.signr = sig +	}; +	return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); +} +#else +static inline int notify_page_fault(enum die_val val, const char *str, +			struct pt_regs *regs, long err, int trap, int sig) +{ +	return NOTIFY_DONE; +} +#endif + +  /*   * Unlock any spinlocks which will prevent us from getting the   * message out  @@ -324,7 +358,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,  	if (unlikely(address >= TASK_SIZE)) {  		if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0)  			return; -		if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, +		if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,  						SIGSEGV) == NOTIFY_STOP)  			return;  		/* @@ -334,7 
+368,7 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,  		goto bad_area_nosemaphore;  	} -	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, +	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,  					SIGSEGV) == NOTIFY_STOP)  		return; diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index ec0fd3cfa774..fa8a37bcb391 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -281,9 +281,9 @@ static int nmi_create_files(struct super_block * sb, struct dentry * root)  	for (i = 0; i < model->num_counters; ++i) {  		struct dentry * dir; -		char buf[2]; +		char buf[4]; -		snprintf(buf, 2, "%d", i); +		snprintf(buf,  sizeof(buf), "%d", i);  		dir = oprofilefs_mkdir(sb, root, buf);  		oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled);   		oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event);  diff --git a/arch/i386/oprofile/op_model_athlon.c b/arch/i386/oprofile/op_model_athlon.c index 3ad9a72a5036..693bdea4a52b 100644 --- a/arch/i386/oprofile/op_model_athlon.c +++ b/arch/i386/oprofile/op_model_athlon.c @@ -13,6 +13,7 @@  #include <linux/oprofile.h>  #include <asm/ptrace.h>  #include <asm/msr.h> +#include <asm/nmi.h>  #include "op_x86_model.h"  #include "op_counter.h" diff --git a/arch/i386/oprofile/op_model_p4.c b/arch/i386/oprofile/op_model_p4.c index ac8a066035c2..7c61d357b82b 100644 --- a/arch/i386/oprofile/op_model_p4.c +++ b/arch/i386/oprofile/op_model_p4.c @@ -14,6 +14,7 @@  #include <asm/ptrace.h>  #include <asm/fixmap.h>  #include <asm/apic.h> +#include <asm/nmi.h>  #include "op_x86_model.h"  #include "op_counter.h" diff --git a/arch/i386/oprofile/op_model_ppro.c b/arch/i386/oprofile/op_model_ppro.c index d719015fc044..5c3ab4b027ad 100644 --- a/arch/i386/oprofile/op_model_ppro.c +++ b/arch/i386/oprofile/op_model_ppro.c @@ -14,6 +14,7 @@  #include <asm/ptrace.h>  #include <asm/msr.h>  #include <asm/apic.h> +#include <asm/nmi.h>  #include "op_x86_model.h"  #include "op_counter.h" diff --git a/arch/i386/pci/pcbios.c b/arch/i386/pci/pcbios.c index 1eec0868f4b3..ed1512a175ab 100644 --- a/arch/i386/pci/pcbios.c +++ b/arch/i386/pci/pcbios.c @@ -371,8 +371,7 @@ void __devinit pcibios_sort(void)  			list_for_each(ln, &pci_devices) {  				d = pci_dev_g(ln);  				if (d->bus->number == bus && d->devfn == devfn) { -					list_del(&d->global_list); -					list_add_tail(&d->global_list, &sorted_devices); +					list_move_tail(&d->global_list, &sorted_devices);  					if (d == dev)  						found = 1;  					break; @@ -390,8 +389,7 @@ void __devinit pcibios_sort(void)  		if (!found) {  			printk(KERN_WARNING "PCI: Device %s not found by BIOS\n",  				pci_name(dev)); -			list_del(&dev->global_list); -			list_add_tail(&dev->global_list, &sorted_devices); +			list_move_tail(&dev->global_list, &sorted_devices);  		}  	}  	list_splice(&sorted_devices, &pci_devices); diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 355d57970ba3..b045c279136c 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -272,9 +272,9 @@ cpu_idle (void)  	/* endless idle loop with no priority at all */  	while (1) {  		if (can_do_pal_halt) -			clear_thread_flag(TIF_POLLING_NRFLAG); +			current_thread_info()->status &= ~TS_POLLING;  		else -			set_thread_flag(TIF_POLLING_NRFLAG); +			current_thread_info()->status |= TS_POLLING;  		if (!need_resched()) {  			void (*idle)(void); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 
d98ec49570b8..14ef7cceb208 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -19,6 +19,40 @@  extern void die (char *, struct pt_regs *, long); +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ +	return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ +	return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, +			struct pt_regs *regs, long err, int trap, int sig) +{ +	struct die_args args = { +		.regs = regs, +		.str = str, +		.err = err, +		.trapnr = trap, +		.signr = sig +	}; +	return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); +} +#else +static inline int notify_page_fault(enum die_val val, const char *str, +			struct pt_regs *regs, long err, int trap, int sig) +{ +	return NOTIFY_DONE; +} +#endif +  /*   * Return TRUE if ADDRESS points at a page in the kernel's mapped segment   * (inside region 5, on ia64) and that page is present. @@ -84,7 +118,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re  	/*  	 * This is to handle the kprobes on user space access instructions  	 */ -	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, code, TRAP_BRKPT, +	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, code, TRAP_BRKPT,  					SIGSEGV) == NOTIFY_STOP)  		return; diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index d6d582a5abb0..a226668f20c3 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -94,8 +94,7 @@ pmd_t *get_pointer_table (void)  	PD_MARKBITS(dp) = mask & ~tmp;  	if (!PD_MARKBITS(dp)) {  		/* move to end of list */ -		list_del(dp); -		list_add_tail(dp, &ptable_list); +		list_move_tail(dp, &ptable_list);  	}  	return (pmd_t *) (page_address(PD_PAGE(dp)) + off);  } @@ -123,8 +122,7 @@ int free_pointer_table (pmd_t *ptable)  		 * move this descriptor to the front of the list, since  		 * it has one or more free tables.  		 */ -		list_del(dp); -		list_add(dp, &ptable_list); +		list_move(dp, &ptable_list);  	}  	return 0;  } diff --git a/arch/m68k/sun3/sun3dvma.c b/arch/m68k/sun3/sun3dvma.c index f04a1d25f1a2..97c7bfde8ae8 100644 --- a/arch/m68k/sun3/sun3dvma.c +++ b/arch/m68k/sun3/sun3dvma.c @@ -119,8 +119,7 @@ static inline int refill(void)  		if(hole->end == prev->start) {  			hole->size += prev->size;  			hole->end = prev->end; -			list_del(&(prev->list)); -			list_add(&(prev->list), &hole_cache); +			list_move(&(prev->list), &hole_cache);  			ret++;  		} @@ -182,8 +181,7 @@ static inline unsigned long get_baddr(int len, unsigned long align)  #endif  			return hole->end;  		} else if(hole->size == newlen) { -			list_del(&(hole->list)); -			list_add(&(hole->list), &hole_cache); +			list_move(&(hole->list), &hole_cache);  			dvma_entry_use(hole->start) = newlen;  #ifdef DVMA_DEBUG  			dvma_allocs++; diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index 6c6980b9b6d4..8b6e723eb82b 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -472,38 +472,46 @@ config 4KSTACKS  	  running more threads on a system and also reduces the pressure  	  on the VM subsystem for higher order allocations. -choice -	prompt "RAM size" -	default AUTO - -config RAMAUTO -	bool "AUTO" -	---help--- -	  Configure the RAM size on your platform. 
Many platforms can auto -	  detect this, on those choose the AUTO option. Otherwise set the -	  RAM size you intend using. - -config RAM4MB -	bool "4MiB" -	help -	  Set RAM size to be 4MiB. - -config RAM8MB -	bool "8MiB" -	help -	  Set RAM size to be 8MiB. - -config RAM16MB -	bool "16MiB" -	help -	  Set RAM size to be 16MiB. - -config RAM32MB -	bool "32MiB" -	help -	  Set RAM size to be 32MiB. - -endchoice +comment "RAM configuration" + +config RAMBASE +	hex "Address of the base of RAM" +	default "0" +	help +	  Define the address that RAM starts at. On many platforms this is +	  0, the base of the address space. And this is the default. Some +	  platforms choose to setup their RAM at other addresses within the +	  processor address space. + +config RAMSIZE +	hex "Size of RAM (in bytes)" +	default "0x400000" +	help +	  Define the size of the system RAM. If you select 0 then the +	  kernel will try to probe the RAM size at runtime. This is not +	  supported on all CPU types. + +config VECTORBASE +	hex "Address of the base of system vectors" +	default "0" +	help +	  Define the address of the the system vectors. Commonly this is +	  put at the start of RAM, but it doesn't have to be. On ColdFire +	  platforms this address is programmed into the VBR register, thus +	  actually setting the address to use. + +config KERNELBASE +	hex "Address of the base of kernel code" +	default "0x400" +	help +	  Typically on m68k systems the kernel will not start at the base +	  of RAM, but usually some small offset from it. Define the start +	  address of the kernel here. The most common setup will have the +	  processor vectors at the base of RAM and then the start of the +	  kernel. On some platforms some RAM is reserved for boot loaders +	  and the kernel starts after that. The 0x400 default was based on +	  a system with the RAM based at address 0, and leaving enough room +	  for the theoretical maximum number of 256 vectors.  choice  	prompt "RAM bus width" @@ -511,7 +519,7 @@ choice  config RAMAUTOBIT  	bool "AUTO" -	---help--- +	help  	  Select the physical RAM data bus size. Not needed on most platforms,  	  so you can generally choose AUTO. @@ -545,7 +553,9 @@ config RAMKERNEL  config ROMKERNEL  	bool "ROM"  	help -	  The kernel will be resident in FLASH/ROM when running. +	  The kernel will be resident in FLASH/ROM when running. This is +	  often referred to as Execute-in-Place (XIP), since the kernel +	  code executes from the position it is stored in the FLASH/ROM.  endchoice diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index a331cc90797c..6a2f0c693254 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -1,7 +1,7 @@  /*   *	vmlinux.lds.S -- master linker script for m68knommu arch   * - *	(C) Copyright 2002-2004, Greg Ungerer <gerg@snapgear.com> + *	(C) Copyright 2002-2006, Greg Ungerer <gerg@snapgear.com>   *   *	This ends up looking compilcated, because of the number of   *	address variations for ram and rom/flash layouts. 
The real @@ -22,13 +22,7 @@  #define	ROM_START	0x10c10400  #define	ROM_LENGTH	0xfec00  #define	ROM_END		0x10d00000 -#define	RAMVEC_START	0x00000000 -#define	RAMVEC_LENGTH	0x400 -#define	RAM_START	0x10000400 -#define	RAM_LENGTH	0xffc00 -#define	RAM_END		0x10100000 -#define _ramend	_ram_end_notused -#define	DATA_ADDR	RAM_START +#define	DATA_ADDR	CONFIG_KERNELBASE  #endif  /* @@ -41,11 +35,6 @@  #define	ROM_START	0x10c10400  #define	ROM_LENGTH	0x1efc00  #define	ROM_END		0x10e00000 -#define	RAMVEC_START	0x00000000 -#define	RAMVEC_LENGTH	0x400 -#define	RAM_START	0x00020400 -#define	RAM_LENGTH	0x7dfc00 -#define	RAM_END		0x00800000  #endif  #ifdef CONFIG_ROMKERNEL  #define	ROMVEC_START	0x10c10000 @@ -53,11 +42,6 @@  #define	ROM_START	0x10c10400  #define	ROM_LENGTH	0x1efc00  #define	ROM_END		0x10e00000 -#define	RAMVEC_START	0x00000000 -#define	RAMVEC_LENGTH	0x400 -#define	RAM_START	0x00020000 -#define	RAM_LENGTH	0x600000 -#define	RAM_END		0x00800000  #endif  #ifdef CONFIG_HIMEMKERNEL  #define	ROMVEC_START	0x00600000 @@ -65,141 +49,28 @@  #define	ROM_START	0x00600400  #define	ROM_LENGTH	0x1efc00  #define	ROM_END		0x007f0000 -#define	RAMVEC_START	0x00000000 -#define	RAMVEC_LENGTH	0x400 -#define	RAM_START	0x00020000 -#define	RAM_LENGTH	0x5e0000 -#define	RAM_END		0x00600000  #endif  #endif -#ifdef CONFIG_DRAGEN2 -#define	RAM_START	0x10000 -#define	RAM_LENGTH	0x7f0000 -#endif -  #ifdef CONFIG_UCQUICC  #define	ROMVEC_START	0x00000000  #define	ROMVEC_LENGTH	0x404  #define	ROM_START	0x00000404  #define	ROM_LENGTH	0x1ff6fc  #define	ROM_END		0x00200000 -#define	RAMVEC_START	0x00200000 -#define	RAMVEC_LENGTH	0x404 -#define	RAM_START	0x00200404 -#define	RAM_LENGTH	0x1ff6fc -#define	RAM_END		0x00400000 -#endif - -/* - *	The standard Arnewsh 5206 board only has 1MiB of ram. Not normally - *	enough to be useful. Assume the user has fitted something larger, - *	at least 4MiB in size. No point in not letting the kernel completely - *	link, it will be obvious if it is too big when they go to load it. - */ -#if defined(CONFIG_ARN5206) -#define	RAM_START	0x10000 -#define	RAM_LENGTH	0x3f0000 -#endif - -/* - *	The Motorola 5206eLITE board only has 1MiB of static RAM. - */ -#if defined(CONFIG_ELITE) -#define	RAM_START	0x30020000 -#define	RAM_LENGTH	0xe0000 -#endif - -/* - *	All the Motorola eval boards have the same basic arrangement. - *	The end of RAM will vary depending on how much ram is fitted, - *	but this isn't important here, we assume at least 4MiB. - */ -#if defined(CONFIG_M5206eC3) || defined(CONFIG_M5249C3) || \ -    defined(CONFIG_M5272C3) || defined(CONFIG_M5307C3) || \ -    defined(CONFIG_ARN5307) || defined(CONFIG_M5407C3) || \ -    defined(CONFIG_M5271EVB) || defined(CONFIG_M5275EVB) || \ -    defined(CONFIG_M5235EVB) -#define	RAM_START	0x20000 -#define	RAM_LENGTH	0x3e0000 -#endif - -/* - *	The Freescale 5208EVB board has 32MB of RAM. - */ -#if defined(CONFIG_M5208EVB) -#define	RAM_START	0x40020000 -#define	RAM_LENGTH	0x01fe0000 -#endif - -/* - *	The senTec COBRA5272 board has nearly the same memory layout as  - *	the M5272C3. We assume 16MiB ram. - */ -#if defined(CONFIG_COBRA5272) -#define RAM_START   0x20000 -#define RAM_LENGTH  0xfe0000 -#endif - -#if defined(CONFIG_M5282EVB) -#define	RAM_START	0x10000 -#define	RAM_LENGTH	0x3f0000 -#endif - -/* - *	The senTec COBRA5282 board has the same memory layout as the M5282EVB. - */ -#if defined(CONFIG_COBRA5282) -#define  RAM_START   0x10000 -#define  RAM_LENGTH  0x3f0000 -#endif - - -/* - *	The EMAC SoM-5282EM module. 
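All of the per-board RAM_START/RAM_LENGTH tables being deleted here collapse into three user-visible Kconfig symbols. A small sketch with hypothetical board values shows how the RAMKERNEL region is now derived (ROMKERNEL and HIMEMKERNEL instead use CONFIG_RAMBASE and CONFIG_RAMSIZE directly, since the kernel text lives in flash):

```c
#include <stdio.h>

/* Hypothetical board: RAM at 0, 4 MiB fitted, vectors below 0x400. */
#define CONFIG_RAMBASE		0x00000000
#define CONFIG_RAMSIZE		0x00400000
#define CONFIG_KERNELBASE	0x00000400

/* RAMKERNEL case: text starts at KERNELBASE, region runs to end of RAM. */
#define RAM_START	CONFIG_KERNELBASE
#define RAM_LENGTH	(CONFIG_RAMBASE + CONFIG_RAMSIZE - CONFIG_KERNELBASE)

int main(void)
{
	printf("ram : ORIGIN = %#x, LENGTH = %#x\n",
	       (unsigned)RAM_START, (unsigned)RAM_LENGTH);
	return 0;
}
```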
- */ -#if defined(CONFIG_SOM5282EM) -#define  RAM_START   0x10000 -#define  RAM_LENGTH  0xff0000 -#endif - - -/* - *	These flash boot boards use all of ram for operation. Again the - *	actual memory size is not important here, assume at least 4MiB. - *	They currently have no support for running in flash. - */ -#if defined(CONFIG_NETtel) || defined(CONFIG_eLIA) || \ -    defined(CONFIG_DISKtel) || defined(CONFIG_SECUREEDGEMP3) || \ -    defined(CONFIG_HW_FEITH) -#define	RAM_START	0x400 -#define	RAM_LENGTH	0x3ffc00 -#endif - -/* - *	Sneha Boards mimimun memory - *	The end of RAM will vary depending on how much ram is fitted, - *	but this isn't important here, we assume at least 4MiB. - */ -#if defined(CONFIG_CPU16B)    -#define	RAM_START	0x20000 -#define	RAM_LENGTH	0x3e0000 -#endif - -#if defined(CONFIG_MOD5272) -#define RAM_START	0x02000000 -#define RAM_LENGTH	0x00800000 -#define RAMVEC_START	0x20000000 -#define RAMVEC_LENGTH	0x00000400  #endif  #if defined(CONFIG_RAMKERNEL) +#define	RAM_START	CONFIG_KERNELBASE +#define	RAM_LENGTH	(CONFIG_RAMBASE + CONFIG_RAMSIZE - CONFIG_KERNELBASE)  #define	TEXT		ram  #define	DATA		ram  #define	INIT		ram  #define	BSS		ram  #endif  #if defined(CONFIG_ROMKERNEL) || defined(CONFIG_HIMEMKERNEL) +#define	RAM_START	CONFIG_RAMBASE +#define	RAM_LENGTH	CONFIG_RAMSIZE  #define	TEXT		rom  #define	DATA		ram  #define	INIT		ram @@ -215,13 +86,7 @@ OUTPUT_ARCH(m68k)  ENTRY(_start)  MEMORY { -#ifdef RAMVEC_START -	ramvec	: ORIGIN = RAMVEC_START, LENGTH = RAMVEC_LENGTH -#endif  	ram	: ORIGIN = RAM_START, LENGTH = RAM_LENGTH -#ifdef RAM_END -	eram	: ORIGIN = RAM_END, LENGTH = 0 -#endif  #ifdef ROM_START  	romvec	: ORIGIN = ROMVEC_START, LENGTH = ROMVEC_LENGTH  	rom	: ORIGIN = ROM_START, LENGTH = ROM_LENGTH @@ -308,12 +173,6 @@ SECTIONS {  		__rom_end = . ;  	} > erom  #endif -#ifdef RAMVEC_START -	. = RAMVEC_START ; -	.ramvec : { -		__ramvec = .; -	} > ramvec -#endif  	.data DATA_ADDR : {  		. = ALIGN(4); @@ -373,12 +232,5 @@ SECTIONS {  		_ebss = . ;  	} > BSS -#ifdef RAM_END -	. = RAM_END ; -	.eram : { -		__ramend = . ; -		_ramend = . ; -	} > eram -#endif  } diff --git a/arch/m68knommu/platform/5307/head.S b/arch/m68knommu/platform/5307/head.S index c30c462b99b1..1d9eb301d7ac 100644 --- a/arch/m68knommu/platform/5307/head.S +++ b/arch/m68knommu/platform/5307/head.S @@ -3,7 +3,7 @@  /*   *	head.S -- common startup code for ColdFire CPUs.   * - *	(C) Copyright 1999-2004, Greg Ungerer (gerg@snapgear.com). + *	(C) Copyright 1999-2006, Greg Ungerer <gerg@snapgear.com>.   */  /*****************************************************************************/ @@ -19,47 +19,15 @@  /*****************************************************************************/  /* - *	Define fixed memory sizes. Configuration of a fixed memory size - *	overrides everything else. If the user defined a size we just - *	blindly use it (they know what they are doing right :-) - */ -#if defined(CONFIG_RAM32MB) -#define MEM_SIZE	0x02000000	/* memory size 32Mb */ -#elif defined(CONFIG_RAM16MB) -#define MEM_SIZE	0x01000000	/* memory size 16Mb */ -#elif defined(CONFIG_RAM8MB) -#define MEM_SIZE	0x00800000	/* memory size 8Mb */ -#elif defined(CONFIG_RAM4MB) -#define MEM_SIZE	0x00400000	/* memory size 4Mb */ -#elif defined(CONFIG_RAM1MB) -#define MEM_SIZE	0x00100000	/* memory size 1Mb */ -#endif - -/* - *	Memory size exceptions for special cases. Some boards may be set - *	for auto memory sizing, but we can't do it that way for some reason. 
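The same three symbols drive the startup assembly: head.S now keys GET_MEM_SIZE off CONFIG_RAMSIZE rather than per-board MEM_SIZE constants. Expressed as plain C instead of ColdFire assembly, and with an assumed probe routine, the selection amounts to:

```c
#include <stdio.h>

#define CONFIG_RAMSIZE	0x00800000	/* hypothetical fixed 8 MiB */

/* Stand-in for the controller-specific SDRAM probe in head.S. */
static unsigned long probe_sdram_size(void)
{
	return 0x00400000;
}

static unsigned long get_mem_size(void)
{
	if (CONFIG_RAMSIZE != 0)
		return CONFIG_RAMSIZE;	/* trust the configured size */
	return probe_sdram_size();	/* CONFIG_RAMSIZE=0 means probe */
}

int main(void)
{
	printf("RAM: %#lx bytes\n", get_mem_size());
	return 0;
}
```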
- *	For example the 5206eLITE board has static RAM, and auto-detecting - *	the SDRAM will do you no good at all. Same goes for the MOD5272. - */ -#ifdef CONFIG_RAMAUTO -#if defined(CONFIG_M5206eLITE) -#define	MEM_SIZE	0x00100000	/* 1MiB default memory */ -#endif -#if defined(CONFIG_MOD5272) -#define MEM_SIZE	0x00800000	/* 8MiB default memory */ -#endif -#endif /* CONFIG_RAMAUTO */ - - -/* - *	If we don't have a fixed memory size now, then lets build in code + *	If we don't have a fixed memory size, then lets build in code   *	to auto detect the DRAM size. Obviously this is the prefered - *	method, and should work for most boards (it won't work for those - *	that do not have their RAM starting at address 0). + *	method, and should work for most boards. It won't work for those + *	that do not have their RAM starting at address 0, and it only + *	works on SDRAM (not boards fitted with SRAM).   */ -#if defined(MEM_SIZE) +#if CONFIG_RAMSIZE != 0  .macro GET_MEM_SIZE -	movel	#MEM_SIZE,%d0		/* hard coded memory size */ +	movel	#CONFIG_RAMSIZE,%d0	/* hard coded memory size */  .endm  #elif defined(CONFIG_M5206) || defined(CONFIG_M5206e) || \ @@ -98,37 +66,7 @@  .endm  #else -#error "ERROR: I don't know how to determine your boards memory size?" -#endif - - -/* - *	Most ColdFire boards have their DRAM starting at address 0. - *	Notable exception is the 5206eLITE board, another is the MOD5272. - */ -#if defined(CONFIG_M5206eLITE) -#define	MEM_BASE	0x30000000 -#endif -#if defined(CONFIG_MOD5272) -#define MEM_BASE	0x02000000 -#define VBR_BASE	0x20000000	/* vectors in SRAM */ -#endif -#if defined(CONFIG_M5208EVB) -#define MEM_BASE	0x40000000 -#endif - -#ifndef MEM_BASE -#define	MEM_BASE	0x00000000	/* memory base at address 0 */ -#endif - -/* - *	The default location for the vectors is at the base of RAM. - *	Some boards might like to use internal SRAM or something like - *	that. If no board specific header defines an alternative then - *	use the base of RAM. - */ -#ifndef	VBR_BASE -#define	VBR_BASE	MEM_BASE	/* vector address */ +#error "ERROR: I don't know how to probe your boards memory size?"  #endif  /*****************************************************************************/ @@ -191,11 +129,11 @@ _start:  	 *	Create basic memory configuration. Set VBR accordingly,  	 *	and size memory.  	 
*/ -	movel	#VBR_BASE,%a7 +	movel	#CONFIG_VECTORBASE,%a7  	movec   %a7,%VBR			/* set vectors addr */  	movel	%a7,_ramvec -	movel	#MEM_BASE,%a7			/* mark the base of RAM */ +	movel	#CONFIG_RAMBASE,%a7		/* mark the base of RAM */  	movel	%a7,_rambase  	GET_MEM_SIZE				/* macro code determines size */ diff --git a/arch/m68knommu/platform/68328/head-pilot.S b/arch/m68knommu/platform/68328/head-pilot.S index c46775fe04be..46b3604f999c 100644 --- a/arch/m68knommu/platform/68328/head-pilot.S +++ b/arch/m68knommu/platform/68328/head-pilot.S @@ -21,7 +21,6 @@  .global _start  .global _rambase -.global __ramvec  .global _ramvec  .global _ramstart  .global _ramend @@ -121,7 +120,7 @@ L0:  	DBG_PUTC('B')  	/* Copy command line from beginning of RAM (+16) to end of bss */ -	movel	#__ramvec, %d7 +	movel	#CONFIG_VECTORBASE, %d7  	addl	#16, %d7  	moveal	%d7, %a0  	moveal	#_ebss, %a1 diff --git a/arch/m68knommu/platform/68328/head-ram.S b/arch/m68knommu/platform/68328/head-ram.S index 6bdc9bce43f2..e8dc9241ff96 100644 --- a/arch/m68knommu/platform/68328/head-ram.S +++ b/arch/m68knommu/platform/68328/head-ram.S @@ -1,10 +1,7 @@  #include <linux/config.h>  	.global __main -	.global __ram_start -	.global __ram_end  	.global __rom_start -	.global __rom_end          .global _rambase          .global _ramstart @@ -12,6 +9,7 @@  	.global splash_bits  	.global _start  	.global _stext +	.global _edata  #define DEBUG  #define ROM_OFFSET 0x10C00000 @@ -73,7 +71,7 @@ pclp1:  #ifdef CONFIG_RELOCATE  	/* Copy me to RAM */  	moveal	#__rom_start, %a0 -	moveal	#__ram_start, %a1 +	moveal	#_stext, %a1  	moveal	#_edata, %a2  	/* Copy %a0 to %a1 until %a1 == %a2 */ diff --git a/arch/mips/oprofile/common.c b/arch/mips/oprofile/common.c index c31e4cff64e0..65eb55400d77 100644 --- a/arch/mips/oprofile/common.c +++ b/arch/mips/oprofile/common.c @@ -38,7 +38,7 @@ static int op_mips_create_files(struct super_block * sb, struct dentry * root)  	for (i = 0; i < model->num_counters; ++i) {  		struct dentry *dir; -		char buf[3]; +		char buf[4];  		snprintf(buf, sizeof buf, "%d", i);  		dir = oprofilefs_mkdir(sb, root, buf); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index d20907561f46..7dd5dab789a1 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -102,7 +102,7 @@ EXPORT_SYMBOL(tb_ticks_per_sec);	/* for cputime_t conversions */  u64 tb_to_xs;  unsigned tb_to_us; -#define TICKLEN_SCALE	(SHIFT_SCALE - 10) +#define TICKLEN_SCALE	TICK_LENGTH_SHIFT  u64 last_tick_len;	/* units are ns / 2^TICKLEN_SCALE */  u64 ticklen_to_xs;	/* 0.64 fraction */ diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index fdbba4206d59..a0a9e1e0061e 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -40,6 +40,40 @@  #include <asm/kdebug.h>  #include <asm/siginfo.h> +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ +	return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ +	return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, +			struct pt_regs *regs, long err, int trap, int sig) +{ +	struct die_args args = { +		.regs = regs, +		.str = str, +		.err = err, +		.trapnr = trap, +		.signr = sig +	}; +	return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); +} +#else 
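This is the third copy of the same conversion in this patch (i386 and ia64 receive it earlier): kprobes moves off the general die chain onto a dedicated page-fault notifier chain, so ordinary faults stop paying for notify_die() when CONFIG_KPROBES is off. A userspace analogue of the chain walk and its NOTIFY_STOP short-circuit, simplified in that it drops the atomic/RCU machinery:

```c
#include <stdio.h>

#define NOTIFY_DONE	0
#define NOTIFY_STOP	1

struct notifier_block {
	int (*notifier_call)(struct notifier_block *, unsigned long, void *);
	struct notifier_block *next;
};

static struct notifier_block *chain;

static void chain_register(struct notifier_block *nb)
{
	nb->next = chain;
	chain = nb;
}

/* Walk the chain; the first handler returning NOTIFY_STOP claims the event. */
static int chain_call(unsigned long val, void *data)
{
	struct notifier_block *nb;

	for (nb = chain; nb; nb = nb->next)
		if (nb->notifier_call(nb, val, data) == NOTIFY_STOP)
			return NOTIFY_STOP;
	return NOTIFY_DONE;
}

static int probe_handler(struct notifier_block *nb, unsigned long val,
			 void *data)
{
	printf("fault claimed by probe handler\n");
	return NOTIFY_STOP;	/* ours: skip the generic fault path */
}

int main(void)
{
	struct notifier_block nb = { .notifier_call = probe_handler };

	chain_register(&nb);
	if (chain_call(0, NULL) == NOTIFY_STOP)
		printf("generic handling skipped\n");
	return 0;
}
```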
+static inline int notify_page_fault(enum die_val val, const char *str, +			struct pt_regs *regs, long err, int trap, int sig) +{ +	return NOTIFY_DONE; +} +#endif +  /*   * Check whether the instruction at regs->nip is a store using   * an update addressing form which will update r1. @@ -142,7 +176,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,  	is_write = error_code & ESR_DST;  #endif /* CONFIG_4xx || CONFIG_BOOKE */ -	if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, error_code, +	if (notify_page_fault(DIE_PAGE_FAULT, "page_fault", regs, error_code,  				11, SIGSEGV) == NOTIFY_STOP)  		return 0; diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c index 27ad56bd227e..fd0bbbe7a4de 100644 --- a/arch/powerpc/oprofile/common.c +++ b/arch/powerpc/oprofile/common.c @@ -94,7 +94,7 @@ static int op_powerpc_create_files(struct super_block *sb, struct dentry *root)  	for (i = 0; i < model->num_counters; ++i) {  		struct dentry *dir; -		char buf[3]; +		char buf[4];  		snprintf(buf, sizeof buf, "%d", i);  		dir = oprofilefs_mkdir(sb, root, buf); diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index c5ca2dc5d428..5713c7e5bd16 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -37,10 +37,10 @@ struct s390_aes_ctx {  	int key_len;  }; -static int aes_set_key(void *ctx, const u8 *in_key, unsigned int key_len, -		       u32 *flags) +static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +		       unsigned int key_len, u32 *flags)  { -	struct s390_aes_ctx *sctx = ctx; +	struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);  	switch (key_len) {  	case 16: @@ -70,9 +70,9 @@ fail:  	return -EINVAL;  } -static void aes_encrypt(void *ctx, u8 *out, const u8 *in) +static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)  { -	const struct s390_aes_ctx *sctx = ctx; +	const struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);  	switch (sctx->key_len) {  	case 16: @@ -90,9 +90,9 @@ static void aes_encrypt(void *ctx, u8 *out, const u8 *in)  	}  } -static void aes_decrypt(void *ctx, u8 *out, const u8 *in) +static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)  { -	const struct s390_aes_ctx *sctx = ctx; +	const struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);  	switch (sctx->key_len) {  	case 16: diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index e3c37aa0a199..b3f7496a79b4 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -44,10 +44,10 @@ struct crypt_s390_des3_192_ctx {  	u8 key[DES3_192_KEY_SIZE];  }; -static int des_setkey(void *ctx, const u8 *key, unsigned int keylen, -		      u32 *flags) +static int des_setkey(struct crypto_tfm *tfm, const u8 *key, +		      unsigned int keylen, u32 *flags)  { -	struct crypt_s390_des_ctx *dctx = ctx; +	struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm);  	int ret;  	/* test if key is valid (not a weak key) */ @@ -57,16 +57,16 @@ static int des_setkey(void *ctx, const u8 *key, unsigned int keylen,  	return ret;  } -static void des_encrypt(void *ctx, u8 *out, const u8 *in) +static void des_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)  { -	struct crypt_s390_des_ctx *dctx = ctx; +	struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm);  	crypt_s390_km(KM_DEA_ENCRYPT, dctx->key, out, in, DES_BLOCK_SIZE);  } -static void des_decrypt(void *ctx, u8 *out, const u8 *in) +static void des_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)  { -	struct crypt_s390_des_ctx *dctx = ctx; +	
struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm);  	crypt_s390_km(KM_DEA_DECRYPT, dctx->key, out, in, DES_BLOCK_SIZE);  } @@ -166,11 +166,11 @@ static struct crypto_alg des_alg = {   *   Implementers MUST reject keys that exhibit this property.   *   */ -static int des3_128_setkey(void *ctx, const u8 *key, unsigned int keylen, -			   u32 *flags) +static int des3_128_setkey(struct crypto_tfm *tfm, const u8 *key, +			   unsigned int keylen, u32 *flags)  {  	int i, ret; -	struct crypt_s390_des3_128_ctx *dctx = ctx; +	struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm);  	const u8* temp_key = key;  	if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE))) { @@ -186,17 +186,17 @@ static int des3_128_setkey(void *ctx, const u8 *key, unsigned int keylen,  	return 0;  } -static void des3_128_encrypt(void *ctx, u8 *dst, const u8 *src) +static void des3_128_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)  { -	struct crypt_s390_des3_128_ctx *dctx = ctx; +	struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm);  	crypt_s390_km(KM_TDEA_128_ENCRYPT, dctx->key, dst, (void*)src,  		      DES3_128_BLOCK_SIZE);  } -static void des3_128_decrypt(void *ctx, u8 *dst, const u8 *src) +static void des3_128_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)  { -	struct crypt_s390_des3_128_ctx *dctx = ctx; +	struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm);  	crypt_s390_km(KM_TDEA_128_DECRYPT, dctx->key, dst, (void*)src,  		      DES3_128_BLOCK_SIZE); @@ -302,11 +302,11 @@ static struct crypto_alg des3_128_alg = {   *   property.   *   */ -static int des3_192_setkey(void *ctx, const u8 *key, unsigned int keylen, -			   u32 *flags) +static int des3_192_setkey(struct crypto_tfm *tfm, const u8 *key, +			   unsigned int keylen, u32 *flags)  {  	int i, ret; -	struct crypt_s390_des3_192_ctx *dctx = ctx; +	struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm);  	const u8* temp_key = key;  	if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) && @@ -325,17 +325,17 @@ static int des3_192_setkey(void *ctx, const u8 *key, unsigned int keylen,  	return 0;  } -static void des3_192_encrypt(void *ctx, u8 *dst, const u8 *src) +static void des3_192_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)  { -	struct crypt_s390_des3_192_ctx *dctx = ctx; +	struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm);  	crypt_s390_km(KM_TDEA_192_ENCRYPT, dctx->key, dst, (void*)src,  		      DES3_192_BLOCK_SIZE);  } -static void des3_192_decrypt(void *ctx, u8 *dst, const u8 *src) +static void des3_192_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)  { -	struct crypt_s390_des3_192_ctx *dctx = ctx; +	struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm);  	crypt_s390_km(KM_TDEA_192_DECRYPT, dctx->key, dst, (void*)src,  		      DES3_192_BLOCK_SIZE); diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index 98c896b86dcd..9d34a35b1aa5 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -40,28 +40,29 @@ struct crypt_s390_sha1_ctx {  	u8 buffer[2 * SHA1_BLOCK_SIZE];  }; -static void -sha1_init(void *ctx) +static void sha1_init(struct crypto_tfm *tfm)  { -	static const struct crypt_s390_sha1_ctx initstate = { -		.state = { -			0x67452301, -			0xEFCDAB89, -			0x98BADCFE, -			0x10325476, -			0xC3D2E1F0 -		}, +	struct crypt_s390_sha1_ctx *ctx = crypto_tfm_ctx(tfm); +	static const u32 initstate[5] = { +		0x67452301, +		0xEFCDAB89, +		0x98BADCFE, +		0x10325476, +		0xC3D2E1F0  	}; -	memcpy(ctx, &initstate, sizeof(initstate)); + +	ctx->count = 0; +	
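Every setkey/encrypt/decrypt/init handler in these s390 crypto files is converted the same way: the anonymous void * context argument becomes the crypto_tfm itself, and the private state is recovered with crypto_tfm_ctx(). Roughly, with an illustrative context type (a sketch of the pattern, not code from the patch):

	static void example_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
	{
		struct example_ctx *ctx = crypto_tfm_ctx(tfm);

		/* transform one block of 'in' into 'out' using ctx->key */
	}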
memcpy(ctx->state, &initstate, sizeof(initstate)); +	ctx->buf_len = 0;  } -static void -sha1_update(void *ctx, const u8 *data, unsigned int len) +static void sha1_update(struct crypto_tfm *tfm, const u8 *data, +			unsigned int len)  {  	struct crypt_s390_sha1_ctx *sctx;  	long imd_len; -	sctx = ctx; +	sctx = crypto_tfm_ctx(tfm);  	sctx->count += len * 8; //message bit length  	//anything in buffer yet? -> must be completed @@ -110,10 +111,9 @@ pad_message(struct crypt_s390_sha1_ctx* sctx)  }  /* Add padding and return the message digest. */ -static void -sha1_final(void* ctx, u8 *out) +static void sha1_final(struct crypto_tfm *tfm, u8 *out)  { -	struct crypt_s390_sha1_ctx *sctx = ctx; +	struct crypt_s390_sha1_ctx *sctx = crypto_tfm_ctx(tfm);  	//must perform manual padding  	pad_message(sctx); diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c index 1ec5e92b3454..f573df30f31d 100644 --- a/arch/s390/crypto/sha256_s390.c +++ b/arch/s390/crypto/sha256_s390.c @@ -31,9 +31,9 @@ struct s390_sha256_ctx {  	u8 buf[2 * SHA256_BLOCK_SIZE];  }; -static void sha256_init(void *ctx) +static void sha256_init(struct crypto_tfm *tfm)  { -	struct s390_sha256_ctx *sctx = ctx; +	struct s390_sha256_ctx *sctx = crypto_tfm_ctx(tfm);  	sctx->state[0] = 0x6a09e667;  	sctx->state[1] = 0xbb67ae85; @@ -44,12 +44,12 @@ static void sha256_init(void *ctx)  	sctx->state[6] = 0x1f83d9ab;  	sctx->state[7] = 0x5be0cd19;  	sctx->count = 0; -	memset(sctx->buf, 0, sizeof(sctx->buf));  } -static void sha256_update(void *ctx, const u8 *data, unsigned int len) +static void sha256_update(struct crypto_tfm *tfm, const u8 *data, +			  unsigned int len)  { -	struct s390_sha256_ctx *sctx = ctx; +	struct s390_sha256_ctx *sctx = crypto_tfm_ctx(tfm);  	unsigned int index;  	int ret; @@ -108,9 +108,9 @@ static void pad_message(struct s390_sha256_ctx* sctx)  }  /* Add padding and return the message digest */ -static void sha256_final(void* ctx, u8 *out) +static void sha256_final(struct crypto_tfm *tfm, u8 *out)  { -	struct s390_sha256_ctx *sctx = ctx; +	struct s390_sha256_ctx *sctx = crypto_tfm_ctx(tfm);  	/* must perform manual padding */  	pad_message(sctx); diff --git a/arch/sh/Makefile b/arch/sh/Makefile index c72e17a96eed..e467a450662b 100644 --- a/arch/sh/Makefile +++ b/arch/sh/Makefile @@ -147,7 +147,7 @@ endif  #	them changed.  We use .arch and .mach to indicate when they were  #	updated last, otherwise make uses the target directory mtime. -include/asm-sh/.cpu: $(wildcard include/config/cpu/*.h) include/config/MARKER +include/asm-sh/.cpu: $(wildcard include/config/cpu/*.h) include/config/auto.conf  	@echo '  SYMLINK include/asm-sh/cpu -> include/asm-sh/$(cpuincdir-y)'  	$(Q)if [ ! -d include/asm-sh ]; then mkdir -p include/asm-sh; fi  	$(Q)ln -fsn $(incdir-prefix)$(cpuincdir-y) include/asm-sh/cpu @@ -157,7 +157,7 @@ include/asm-sh/.cpu: $(wildcard include/config/cpu/*.h) include/config/MARKER  #	don't, just reference the parent directory so the semantics are  #	kept roughly the same. -include/asm-sh/.mach: $(wildcard include/config/sh/*.h) include/config/MARKER +include/asm-sh/.mach: $(wildcard include/config/sh/*.h) include/config/auto.conf  	@echo -n '  SYMLINK include/asm-sh/mach -> '  	$(Q)if [ ! 
-d include/asm-sh ]; then mkdir -p include/asm-sh; fi  	$(Q)if [ -d $(incdir-prefix)$(incdir-y) ]; then \ diff --git a/arch/sh/oprofile/op_model_sh7750.c b/arch/sh/oprofile/op_model_sh7750.c index 5ec9ddcc4b0b..c265185b22a7 100644 --- a/arch/sh/oprofile/op_model_sh7750.c +++ b/arch/sh/oprofile/op_model_sh7750.c @@ -198,7 +198,7 @@ static int sh7750_perf_counter_create_files(struct super_block *sb, struct dentr  	for (i = 0; i < NR_CNTRS; i++) {  		struct dentry *dir; -		char buf[3]; +		char buf[4];  		snprintf(buf, sizeof(buf), "%d", i);  		dir = oprofilefs_mkdir(sb, root, buf); diff --git a/arch/sparc/kernel/of_device.c b/arch/sparc/kernel/of_device.c index 001b8673b4bd..80a809478781 100644 --- a/arch/sparc/kernel/of_device.c +++ b/arch/sparc/kernel/of_device.c @@ -138,6 +138,7 @@ struct bus_type ebus_bus_type = {         .suspend	= of_device_suspend,         .resume	= of_device_resume,  }; +EXPORT_SYMBOL(ebus_bus_type);  #endif  #ifdef CONFIG_SBUS @@ -149,6 +150,7 @@ struct bus_type sbus_bus_type = {         .suspend	= of_device_suspend,         .resume	= of_device_resume,  }; +EXPORT_SYMBOL(sbus_bus_type);  #endif  static int __init of_bus_driver_init(void) diff --git a/arch/sparc/kernel/prom.c b/arch/sparc/kernel/prom.c index 63b2b9bd778e..946ce6d15819 100644 --- a/arch/sparc/kernel/prom.c +++ b/arch/sparc/kernel/prom.c @@ -27,6 +27,11 @@  static struct device_node *allnodes; +/* use when traversing tree through the allnext, child, sibling, + * or parent members of struct device_node. + */ +static DEFINE_RWLOCK(devtree_lock); +  int of_device_is_compatible(struct device_node *device, const char *compat)  {  	const char* cp; @@ -185,6 +190,54 @@ int of_getintprop_default(struct device_node *np, const char *name, int def)  }  EXPORT_SYMBOL(of_getintprop_default); +int of_set_property(struct device_node *dp, const char *name, void *val, int len) +{ +	struct property **prevp; +	void *new_val; +	int err; + +	new_val = kmalloc(len, GFP_KERNEL); +	if (!new_val) +		return -ENOMEM; + +	memcpy(new_val, val, len); + +	err = -ENODEV; + +	write_lock(&devtree_lock); +	prevp = &dp->properties; +	while (*prevp) { +		struct property *prop = *prevp; + +		if (!strcmp(prop->name, name)) { +			void *old_val = prop->value; +			int ret; + +			ret = prom_setprop(dp->node, name, val, len); +			err = -EINVAL; +			if (ret >= 0) { +				prop->value = new_val; +				prop->length = len; + +				if (OF_IS_DYNAMIC(prop)) +					kfree(old_val); + +				OF_MARK_DYNAMIC(prop); + +				err = 0; +			} +			break; +		} +		prevp = &(*prevp)->next; +	} +	write_unlock(&devtree_lock); + +	/* XXX Upate procfs if necessary... 
*/ + +	return err; +} +EXPORT_SYMBOL(of_set_property); +  static unsigned int prom_early_allocated;  static void * __init prom_early_alloc(unsigned long size) @@ -354,7 +407,9 @@ static char * __init build_full_name(struct device_node *dp)  	return n;  } -static struct property * __init build_one_prop(phandle node, char *prev) +static unsigned int unique_id; + +static struct property * __init build_one_prop(phandle node, char *prev, char *special_name, void *special_val, int special_len)  {  	static struct property *tmp = NULL;  	struct property *p; @@ -364,25 +419,34 @@ static struct property * __init build_one_prop(phandle node, char *prev)  		p = tmp;  		memset(p, 0, sizeof(*p) + 32);  		tmp = NULL; -	} else +	} else {  		p = prom_early_alloc(sizeof(struct property) + 32); +		p->unique_id = unique_id++; +	}  	p->name = (char *) (p + 1); -	if (prev == NULL) { -		prom_firstprop(node, p->name); +	if (special_name) { +		p->length = special_len; +		p->value = prom_early_alloc(special_len); +		memcpy(p->value, special_val, special_len);  	} else { -		prom_nextprop(node, prev, p->name); -	} -	if (strlen(p->name) == 0) { -		tmp = p; -		return NULL; -	} -	p->length = prom_getproplen(node, p->name); -	if (p->length <= 0) { -		p->length = 0; -	} else { -		p->value = prom_early_alloc(p->length); -		len = prom_getproperty(node, p->name, p->value, p->length); +		if (prev == NULL) { +			prom_firstprop(node, p->name); +		} else { +			prom_nextprop(node, prev, p->name); +		} +		if (strlen(p->name) == 0) { +			tmp = p; +			return NULL; +		} +		p->length = prom_getproplen(node, p->name); +		if (p->length <= 0) { +			p->length = 0; +		} else { +			p->value = prom_early_alloc(p->length + 1); +			prom_getproperty(node, p->name, p->value, p->length); +			((unsigned char *)p->value)[p->length] = '\0'; +		}  	}  	return p;  } @@ -391,9 +455,14 @@ static struct property * __init build_prop_list(phandle node)  {  	struct property *head, *tail; -	head = tail = build_one_prop(node, NULL); +	head = tail = build_one_prop(node, NULL, +				     ".node", &node, sizeof(node)); + +	tail->next = build_one_prop(node, NULL, NULL, NULL, 0); +	tail = tail->next;  	while(tail) { -		tail->next = build_one_prop(node, tail->name); +		tail->next = build_one_prop(node, tail->name, +					    NULL, NULL, 0);  		tail = tail->next;  	} @@ -422,6 +491,7 @@ static struct device_node * __init create_node(phandle node)  		return NULL;  	dp = prom_early_alloc(sizeof(*dp)); +	dp->unique_id = unique_id++;  	kref_init(&dp->kref); diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index fa5006946062..5db7e1d85385 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -9,3 +9,5 @@ lib-y := mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o memcpy.o memset.o \  	 strncpy_from_user.o divdi3.o udivdi3.o strlen_user.o \  	 copy_user.o locks.o atomic.o atomic32.o bitops.o \  	 lshrdi3.o ashldi3.o rwsem.o muldi3.o bitext.o + +obj-y += iomap.o diff --git a/arch/sparc/lib/iomap.c b/arch/sparc/lib/iomap.c new file mode 100644 index 000000000000..54501c1ca785 --- /dev/null +++ b/arch/sparc/lib/iomap.c @@ -0,0 +1,48 @@ +/* + * Implement the sparc iomap interfaces + */ +#include <linux/pci.h> +#include <linux/module.h> +#include <asm/io.h> + +/* Create a virtual mapping cookie for an IO port range */ +void __iomem *ioport_map(unsigned long port, unsigned int nr) +{ +	return (void __iomem *) (unsigned long) port; +} + +void ioport_unmap(void __iomem *addr) +{ +	/* Nothing to do */ +} +EXPORT_SYMBOL(ioport_map); 
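On sparc the cookie returned by ioport_map() is simply the port number cast to an __iomem pointer, so the generic iomap accessors can be used on it like any other mapping. A hedged usage sketch, assuming the usual ioread8()/iowrite8() accessors are available (the UART port and register offsets are placeholders):

	void __iomem *regs = ioport_map(0x3f8, 8);

	if (regs) {
		u8 lsr = ioread8(regs + 5);	/* read a device register */
		iowrite8(0, regs + 1);		/* write another */
		ioport_unmap(regs);
	}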
+EXPORT_SYMBOL(ioport_unmap); + +/* Create a virtual mapping cookie for a PCI BAR (memory or IO) */ +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) +{ +	unsigned long start = pci_resource_start(dev, bar); +	unsigned long len = pci_resource_len(dev, bar); +	unsigned long flags = pci_resource_flags(dev, bar); + +	if (!len || !start) +		return NULL; +	if (maxlen && len > maxlen) +		len = maxlen; +	if (flags & IORESOURCE_IO) +		return ioport_map(start, len); +	if (flags & IORESOURCE_MEM) { +		if (flags & IORESOURCE_CACHEABLE) +			return ioremap(start, len); +		return ioremap_nocache(start, len); +	} +	/* What? */ +	return NULL; +} + +void pci_iounmap(struct pci_dev *dev, void __iomem * addr) +{ +	/* nothing to do */ +} +EXPORT_SYMBOL(pci_iomap); +EXPORT_SYMBOL(pci_iounmap); diff --git a/arch/sparc64/kernel/auxio.c b/arch/sparc64/kernel/auxio.c index 2c42894b188f..c2c69c167d18 100644 --- a/arch/sparc64/kernel/auxio.c +++ b/arch/sparc64/kernel/auxio.c @@ -6,6 +6,7 @@   */  #include <linux/config.h> +#include <linux/module.h>  #include <linux/kernel.h>  #include <linux/init.h>  #include <linux/ioport.h> @@ -16,8 +17,8 @@  #include <asm/ebus.h>  #include <asm/auxio.h> -/* This cannot be static, as it is referenced in irq.c */  void __iomem *auxio_register = NULL; +EXPORT_SYMBOL(auxio_register);  enum auxio_type {  	AUXIO_TYPE_NODEV, diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c index 31e0fbb0d82c..cc89b06d0178 100644 --- a/arch/sparc64/kernel/irq.c +++ b/arch/sparc64/kernel/irq.c @@ -563,67 +563,6 @@ void handler_irq(int irq, struct pt_regs *regs)  	irq_exit();  } -#ifdef CONFIG_BLK_DEV_FD -extern irqreturn_t floppy_interrupt(int, void *, struct pt_regs *); - -/* XXX No easy way to include asm/floppy.h XXX */ -extern unsigned char *pdma_vaddr; -extern unsigned long pdma_size; -extern volatile int doing_pdma; -extern unsigned long fdc_status; - -irqreturn_t sparc_floppy_irq(int irq, void *dev_cookie, struct pt_regs *regs) -{ -	if (likely(doing_pdma)) { -		void __iomem *stat = (void __iomem *) fdc_status; -		unsigned char *vaddr = pdma_vaddr; -		unsigned long size = pdma_size; -		u8 val; - -		while (size) { -			val = readb(stat); -			if (unlikely(!(val & 0x80))) { -				pdma_vaddr = vaddr; -				pdma_size = size; -				return IRQ_HANDLED; -			} -			if (unlikely(!(val & 0x20))) { -				pdma_vaddr = vaddr; -				pdma_size = size; -				doing_pdma = 0; -				goto main_interrupt; -			} -			if (val & 0x40) { -				/* read */ -				*vaddr++ = readb(stat + 1); -			} else { -				unsigned char data = *vaddr++; - -				/* write */ -				writeb(data, stat + 1); -			} -			size--; -		} - -		pdma_vaddr = vaddr; -		pdma_size = size; - -		/* Send Terminal Count pulse to floppy controller. 
*/ -		val = readb(auxio_register); -		val |= AUXIO_AUX1_FTCNT; -		writeb(val, auxio_register); -		val &= ~AUXIO_AUX1_FTCNT; -		writeb(val, auxio_register); - -		doing_pdma = 0; -	} - -main_interrupt: -	return floppy_interrupt(irq, dev_cookie, regs); -} -EXPORT_SYMBOL(sparc_floppy_irq); -#endif -  struct sun5_timer {  	u64	count0;  	u64	limit0; diff --git a/arch/sparc64/kernel/of_device.c b/arch/sparc64/kernel/of_device.c index 566aa343aa62..768475bbce82 100644 --- a/arch/sparc64/kernel/of_device.c +++ b/arch/sparc64/kernel/of_device.c @@ -138,6 +138,7 @@ struct bus_type isa_bus_type = {         .suspend	= of_device_suspend,         .resume	= of_device_resume,  }; +EXPORT_SYMBOL(isa_bus_type);  struct bus_type ebus_bus_type = {         .name	= "ebus", @@ -147,6 +148,7 @@ struct bus_type ebus_bus_type = {         .suspend	= of_device_suspend,         .resume	= of_device_resume,  }; +EXPORT_SYMBOL(ebus_bus_type);  #endif  #ifdef CONFIG_SBUS @@ -158,6 +160,7 @@ struct bus_type sbus_bus_type = {         .suspend	= of_device_suspend,         .resume	= of_device_resume,  }; +EXPORT_SYMBOL(sbus_bus_type);  #endif  static int __init of_bus_driver_init(void) diff --git a/arch/sparc64/kernel/prom.c b/arch/sparc64/kernel/prom.c index e9d703eea806..8e87e7ea0325 100644 --- a/arch/sparc64/kernel/prom.c +++ b/arch/sparc64/kernel/prom.c @@ -27,6 +27,11 @@  static struct device_node *allnodes; +/* use when traversing tree through the allnext, child, sibling, + * or parent members of struct device_node. + */ +static DEFINE_RWLOCK(devtree_lock); +  int of_device_is_compatible(struct device_node *device, const char *compat)  {  	const char* cp; @@ -185,6 +190,54 @@ int of_getintprop_default(struct device_node *np, const char *name, int def)  }  EXPORT_SYMBOL(of_getintprop_default); +int of_set_property(struct device_node *dp, const char *name, void *val, int len) +{ +	struct property **prevp; +	void *new_val; +	int err; + +	new_val = kmalloc(len, GFP_KERNEL); +	if (!new_val) +		return -ENOMEM; + +	memcpy(new_val, val, len); + +	err = -ENODEV; + +	write_lock(&devtree_lock); +	prevp = &dp->properties; +	while (*prevp) { +		struct property *prop = *prevp; + +		if (!strcmp(prop->name, name)) { +			void *old_val = prop->value; +			int ret; + +			ret = prom_setprop(dp->node, name, val, len); +			err = -EINVAL; +			if (ret >= 0) { +				prop->value = new_val; +				prop->length = len; + +				if (OF_IS_DYNAMIC(prop)) +					kfree(old_val); + +				OF_MARK_DYNAMIC(prop); + +				err = 0; +			} +			break; +		} +		prevp = &(*prevp)->next; +	} +	write_unlock(&devtree_lock); + +	/* XXX Upate procfs if necessary... 
*/ + +	return err; +} +EXPORT_SYMBOL(of_set_property); +  static unsigned int prom_early_allocated;  static void * __init prom_early_alloc(unsigned long size) @@ -531,7 +584,9 @@ static char * __init build_full_name(struct device_node *dp)  	return n;  } -static struct property * __init build_one_prop(phandle node, char *prev) +static unsigned int unique_id; + +static struct property * __init build_one_prop(phandle node, char *prev, char *special_name, void *special_val, int special_len)  {  	static struct property *tmp = NULL;  	struct property *p; @@ -540,25 +595,35 @@ static struct property * __init build_one_prop(phandle node, char *prev)  		p = tmp;  		memset(p, 0, sizeof(*p) + 32);  		tmp = NULL; -	} else +	} else {  		p = prom_early_alloc(sizeof(struct property) + 32); +		p->unique_id = unique_id++; +	}  	p->name = (char *) (p + 1); -	if (prev == NULL) { -		prom_firstprop(node, p->name); +	if (special_name) { +		strcpy(p->name, special_name); +		p->length = special_len; +		p->value = prom_early_alloc(special_len); +		memcpy(p->value, special_val, special_len);  	} else { -		prom_nextprop(node, prev, p->name); -	} -	if (strlen(p->name) == 0) { -		tmp = p; -		return NULL; -	} -	p->length = prom_getproplen(node, p->name); -	if (p->length <= 0) { -		p->length = 0; -	} else { -		p->value = prom_early_alloc(p->length); -		prom_getproperty(node, p->name, p->value, p->length); +		if (prev == NULL) { +			prom_firstprop(node, p->name); +		} else { +			prom_nextprop(node, prev, p->name); +		} +		if (strlen(p->name) == 0) { +			tmp = p; +			return NULL; +		} +		p->length = prom_getproplen(node, p->name); +		if (p->length <= 0) { +			p->length = 0; +		} else { +			p->value = prom_early_alloc(p->length + 1); +			prom_getproperty(node, p->name, p->value, p->length); +			((unsigned char *)p->value)[p->length] = '\0'; +		}  	}  	return p;  } @@ -567,9 +632,14 @@ static struct property * __init build_prop_list(phandle node)  {  	struct property *head, *tail; -	head = tail = build_one_prop(node, NULL); +	head = tail = build_one_prop(node, NULL, +				     ".node", &node, sizeof(node)); + +	tail->next = build_one_prop(node, NULL, NULL, NULL, 0); +	tail = tail->next;  	while(tail) { -		tail->next = build_one_prop(node, tail->name); +		tail->next = build_one_prop(node, tail->name, +					    NULL, NULL, 0);  		tail = tail->next;  	} @@ -598,6 +668,7 @@ static struct device_node * __init create_node(phandle node)  		return NULL;  	dp = prom_early_alloc(sizeof(*dp)); +	dp->unique_id = unique_id++;  	kref_init(&dp->kref); diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index 6e002aacb961..1605967cce91 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c @@ -31,6 +31,40 @@  #include <asm/kdebug.h>  #include <asm/mmu_context.h> +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ +	return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ +	return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, +			struct pt_regs *regs, long err, int trap, int sig) +{ +	struct die_args args = { +		.regs = regs, +		.str = str, +		.err = err, +		.trapnr = trap, +		.signr = sig +	}; +	return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); +} +#else +static inline int 
notify_page_fault(enum die_val val, const char *str, +			struct pt_regs *regs, long err, int trap, int sig) +{ +	return NOTIFY_DONE; +} +#endif +  /*   * To debug kernel to catch accesses to certain virtual/physical addresses.   * Mode = 0 selects physical watchpoints, mode = 1 selects virtual watchpoints. @@ -263,7 +297,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)  	fault_code = get_thread_fault_code(); -	if (notify_die(DIE_PAGE_FAULT, "page_fault", regs, +	if (notify_page_fault(DIE_PAGE_FAULT, "page_fault", regs,  		       fault_code, 0, SIGSEGV) == NOTIFY_STOP)  		return; diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index 513993414747..5c2bcf354ce6 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -1568,6 +1568,7 @@ pgprot_t PAGE_EXEC __read_mostly;  unsigned long pg_iobits __read_mostly;  unsigned long _PAGE_IE __read_mostly; +EXPORT_SYMBOL(_PAGE_IE);  unsigned long _PAGE_E __read_mostly;  EXPORT_SYMBOL(_PAGE_E); diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index af44130f0d65..ccc4a7fb97a3 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -386,24 +386,45 @@ config HPET_EMULATE_RTC  	bool "Provide RTC interrupt"  	depends on HPET_TIMER && RTC=y -config GART_IOMMU -	bool "K8 GART IOMMU support" +# Mark as embedded because too many people got it wrong. +# The code disables itself when not needed. +config IOMMU +	bool "IOMMU support" if EMBEDDED  	default y  	select SWIOTLB  	select AGP  	depends on PCI  	help -	  Support for hardware IOMMU in AMD's Opteron/Athlon64 Processors -	  and for the bounce buffering software IOMMU. -	  Needed to run systems with more than 3GB of memory properly with -	  32-bit PCI devices that do not support DAC (Double Address Cycle). -	  The IOMMU can be turned off at runtime with the iommu=off parameter. -  	  Normally the kernel will take the right choice by itself. -  	  This option includes a driver for the AMD Opteron/Athlon64 IOMMU -  	  northbridge and a software emulation used on other systems without -	  hardware IOMMU.  If unsure, say Y. - -# need this always selected by GART_IOMMU for the VIA workaround +	  Support for full DMA access of devices with 32bit memory access only +	  on systems with more than 3GB. This is usually needed for USB, +	  sound, many IDE/SATA chipsets and some other devices. +	  Provides a driver for the AMD Athlon64/Opteron/Turion/Sempron GART +	  based IOMMU and a software bounce buffer based IOMMU used on Intel +	  systems and as fallback. +	  The code is only active when needed (enough memory and limited +	  device) unless CONFIG_IOMMU_DEBUG or iommu=force is specified +	  too. + +config CALGARY_IOMMU +	bool "IBM Calgary IOMMU support" +	default y +	select SWIOTLB +	depends on PCI && EXPERIMENTAL +	help +	  Support for hardware IOMMUs in IBM's xSeries x366 and x460 +	  systems. Needed to run systems with more than 3GB of memory +	  properly with 32-bit PCI devices that do not support DAC +	  (Double Address Cycle). Calgary also supports bus level +	  isolation, where all DMAs pass through the IOMMU.  This +	  prevents them from going anywhere except their intended +	  destination. This catches hard-to-find kernel bugs and +	  mis-behaving drivers and devices that do not use the DMA-API +	  properly to set up their DMA buffers.  The IOMMU can be +	  turned off at boot time with the iommu=off parameter. +	  Normally the kernel will make the right choice by itself. +	  If unsure, say Y. 
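The "DMA-API properly" remark above is about drivers that must obtain bus addresses through the API instead of handing devices raw kernel pointers; with Calgary, every mapping a device uses passes through the IOMMU, so violations fault visibly. The canonical idiom, sketched (pdev and the 4 KiB size are placeholders):

	#include <linux/dma-mapping.h>

	dma_addr_t dma;
	void *cpu = dma_alloc_coherent(&pdev->dev, 4096, &dma, GFP_KERNEL);

	if (cpu) {
		/* program the device with 'dma'; the CPU touches 'cpu' */
		dma_free_coherent(&pdev->dev, 4096, cpu, dma);
	}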
+ +# need this always selected by IOMMU for the VIA workaround  config SWIOTLB  	bool @@ -501,6 +522,10 @@ config REORDER           optimal TLB usage. If you have pretty much any version of binutils,   	 this can increase your kernel build time by roughly one minute. +config K8_NB +	def_bool y +	depends on AGP_AMD64 || IOMMU || (PCI && NUMA) +  endmenu  # diff --git a/arch/x86_64/Kconfig.debug b/arch/x86_64/Kconfig.debug index ea31b4c62105..1d92ab56c0f9 100644 --- a/arch/x86_64/Kconfig.debug +++ b/arch/x86_64/Kconfig.debug @@ -13,7 +13,7 @@ config DEBUG_RODATA  	 If in doubt, say "N".  config IOMMU_DEBUG -       depends on GART_IOMMU && DEBUG_KERNEL +       depends on IOMMU && DEBUG_KERNEL         bool "Enable IOMMU debugging"         help           Force the IOMMU to on even when you have less than 4GB of @@ -35,6 +35,22 @@ config IOMMU_LEAK           Add a simple leak tracer to the IOMMU code. This is useful when you  	 are debugging a buggy device driver that leaks IOMMU mappings. +config DEBUG_STACKOVERFLOW +        bool "Check for stack overflows" +        depends on DEBUG_KERNEL +        help +	  This option will cause messages to be printed if free stack space +	  drops below a certain limit. + +config DEBUG_STACK_USAGE +        bool "Stack utilization instrumentation" +        depends on DEBUG_KERNEL +        help +	  Enables the display of the minimum amount of free stack which each +	  task has ever had available in the sysrq-T and sysrq-P debug output. + +	  This option will slow down process creation somewhat. +  #config X86_REMOTE_DEBUG  #       bool "kgdb debugging stub" diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index e573e2ab5510..431bb4bc36cd 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -27,6 +27,7 @@ LDFLAGS_vmlinux :=  CHECKFLAGS      += -D__x86_64__ -m64  cflags-y	:= +cflags-kernel-y	:=  cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)  cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)  cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) @@ -35,7 +36,7 @@ cflags-y += -m64  cflags-y += -mno-red-zone  cflags-y += -mcmodel=kernel  cflags-y += -pipe -cflags-$(CONFIG_REORDER) += -ffunction-sections +cflags-kernel-$(CONFIG_REORDER) += -ffunction-sections  # this makes reading assembly source easier, but produces worse code  # actually it makes the kernel smaller too.  
cflags-y += -fno-reorder-blocks @@ -55,6 +56,7 @@ cflags-y += $(call cc-option,-funit-at-a-time)  cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)  CFLAGS += $(cflags-y) +CFLAGS_KERNEL += $(cflags-kernel-y)  AFLAGS += -m64  head-y := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile index 43ee6c50c277..deb063e7762d 100644 --- a/arch/x86_64/boot/Makefile +++ b/arch/x86_64/boot/Makefile @@ -107,8 +107,13 @@ fdimage288: $(BOOTIMAGE) $(obj)/mtools.conf  isoimage: $(BOOTIMAGE)  	-rm -rf $(obj)/isoimage  	mkdir $(obj)/isoimage -	cp `echo /usr/lib*/syslinux/isolinux.bin | awk '{ print $1; }'` \ -		$(obj)/isoimage +	for i in lib lib64 share end ; do \ +		if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \ +			cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \ +			break ; \ +		fi ; \ +		if [ $$i = end ] ; then exit 1 ; fi ; \ +	done  	cp $(BOOTIMAGE) $(obj)/isoimage/linux  	echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg  	if [ -f '$(FDINITRD)' ] ; then \ diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index cf4b88c416dc..3755b2e394d0 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -77,11 +77,11 @@ static void gzip_release(void **);   */  static unsigned char *real_mode; /* Pointer to real-mode data */ -#define EXT_MEM_K   (*(unsigned short *)(real_mode + 0x2)) +#define RM_EXT_MEM_K   (*(unsigned short *)(real_mode + 0x2))  #ifndef STANDARD_MEMORY_BIOS_CALL -#define ALT_MEM_K   (*(unsigned long *)(real_mode + 0x1e0)) +#define RM_ALT_MEM_K   (*(unsigned long *)(real_mode + 0x1e0))  #endif -#define SCREEN_INFO (*(struct screen_info *)(real_mode+0)) +#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0))  extern unsigned char input_data[];  extern int input_len; @@ -92,9 +92,9 @@ static unsigned long output_ptr = 0;  static void *malloc(int size);  static void free(void *where); -  -void* memset(void* s, int c, unsigned n); -void* memcpy(void* dest, const void* src, unsigned n); + +static void *memset(void *s, int c, unsigned n); +static void *memcpy(void *dest, const void *src, unsigned n);  static void putstr(const char *); @@ -162,8 +162,8 @@ static void putstr(const char *s)  	int x,y,pos;  	char c; -	x = SCREEN_INFO.orig_x; -	y = SCREEN_INFO.orig_y; +	x = RM_SCREEN_INFO.orig_x; +	y = RM_SCREEN_INFO.orig_y;  	while ( ( c = *s++ ) != '\0' ) {  		if ( c == '\n' ) { @@ -184,8 +184,8 @@ static void putstr(const char *s)  		}  	} -	SCREEN_INFO.orig_x = x; -	SCREEN_INFO.orig_y = y; +	RM_SCREEN_INFO.orig_x = x; +	RM_SCREEN_INFO.orig_y = y;  	pos = (x + cols * y) * 2;	/* Update cursor position */  	outb_p(14, vidport); @@ -194,7 +194,7 @@ static void putstr(const char *s)  	outb_p(0xff & (pos >> 1), vidport+1);  } -void* memset(void* s, int c, unsigned n) +static void* memset(void* s, int c, unsigned n)  {  	int i;  	char *ss = (char*)s; @@ -203,7 +203,7 @@ void* memset(void* s, int c, unsigned n)  	return s;  } -void* memcpy(void* dest, const void* src, unsigned n) +static void* memcpy(void* dest, const void* src, unsigned n)  {  	int i;  	char *d = (char *)dest, *s = (char *)src; @@ -278,15 +278,15 @@ static void error(char *x)  	putstr(x);  	putstr("\n\n -- System halted"); -	while(1); +	while(1);	/* Halt */  } -void setup_normal_output_buffer(void) +static void setup_normal_output_buffer(void)  {  #ifdef STANDARD_MEMORY_BIOS_CALL -	if (EXT_MEM_K < 1024) error("Less than 2MB of memory"); 
+	if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory");  #else -	if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory"); +	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory");  #endif  	output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */  	free_mem_end_ptr = (long)real_mode; @@ -297,13 +297,13 @@ struct moveparams {  	uch *high_buffer_start; int hcount;  }; -void setup_output_buffer_if_we_run_high(struct moveparams *mv) +static void setup_output_buffer_if_we_run_high(struct moveparams *mv)  {  	high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE);  #ifdef STANDARD_MEMORY_BIOS_CALL -	if (EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); +	if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory");  #else -	if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); +	if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory");  #endif	  	mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START;  	low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX @@ -319,7 +319,7 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv)  	mv->high_buffer_start = high_buffer_start;  } -void close_output_buffer_if_we_run_high(struct moveparams *mv) +static void close_output_buffer_if_we_run_high(struct moveparams *mv)  {  	if (bytes_out > low_buffer_size) {  		mv->lcount = low_buffer_size; @@ -335,7 +335,7 @@ int decompress_kernel(struct moveparams *mv, void *rmode)  {  	real_mode = rmode; -	if (SCREEN_INFO.orig_video_mode == 7) { +	if (RM_SCREEN_INFO.orig_video_mode == 7) {  		vidmem = (char *) 0xb0000;  		vidport = 0x3b4;  	} else { @@ -343,8 +343,8 @@ int decompress_kernel(struct moveparams *mv, void *rmode)  		vidport = 0x3d4;  	} -	lines = SCREEN_INFO.orig_video_lines; -	cols = SCREEN_INFO.orig_video_cols; +	lines = RM_SCREEN_INFO.orig_video_lines; +	cols = RM_SCREEN_INFO.orig_video_cols;  	if (free_mem_ptr < 0x100000) setup_normal_output_buffer();  	else setup_output_buffer_if_we_run_high(mv); diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c index c44f5e2ec100..eae86691709a 100644 --- a/arch/x86_64/boot/tools/build.c +++ b/arch/x86_64/boot/tools/build.c @@ -149,10 +149,8 @@ int main(int argc, char ** argv)  	sz = sb.st_size;  	fprintf (stderr, "System is %d kB\n", sz/1024);  	sys_size = (sz + 15) / 16; -	/* 0x40000*16 = 4.0 MB, reasonable estimate for the current maximum */ -	if (sys_size > (is_big_kernel ? 0x40000 : DEF_SYSSIZE)) -		die("System is too big. Try using %smodules.", -			is_big_kernel ? "" : "bzImage or "); +	if (!is_big_kernel && sys_size > DEF_SYSSIZE) +		die("System is too big. 
Try using bzImage or modules.");  	while (sz > 0) {  		int l, n; diff --git a/arch/x86_64/boot/video.S b/arch/x86_64/boot/video.S index 32327bb37aff..2aa565c136e5 100644 --- a/arch/x86_64/boot/video.S +++ b/arch/x86_64/boot/video.S @@ -1929,6 +1929,7 @@ skip10:	movb	%ah, %al  	ret  store_edid: +#ifdef CONFIG_FIRMWARE_EDID  	pushw	%es				# just save all registers  	pushw	%ax  	pushw	%bx @@ -1946,6 +1947,22 @@ store_edid:  	rep  	stosl +	pushw   %es				# save ES +	xorw    %di, %di                        # Report Capability +	pushw   %di +	popw    %es                             # ES:DI must be 0:0 +	movw	$0x4f15, %ax +	xorw	%bx, %bx +	xorw	%cx, %cx +	int	$0x10 +	popw    %es                             # restore ES + +	cmpb    $0x00, %ah                      # call successful +	jne     no_edid + +	cmpb    $0x4f, %al                      # function supported +	jne     no_edid +  	movw	$0x4f15, %ax                    # do VBE/DDC  	movw	$0x01, %bx  	movw	$0x00, %cx @@ -1953,12 +1970,14 @@ store_edid:  	movw	$0x140, %di  	int	$0x10 +no_edid:  	popw	%di				# restore all registers  	popw	%dx  	popw	%cx  	popw	%bx  	popw	%ax  	popw	%es +#endif  	ret  # VIDEO_SELECT-only variables diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S index 483cbb23ab8d..26b40de4d0b0 100644 --- a/arch/x86_64/crypto/aes-x86_64-asm.S +++ b/arch/x86_64/crypto/aes-x86_64-asm.S @@ -15,6 +15,10 @@  .text +#include <asm/asm-offsets.h> + +#define BASE crypto_tfm_ctx_offset +  #define R1	%rax  #define R1E	%eax  #define R1X	%ax @@ -46,19 +50,19 @@  #define R10	%r10  #define R11	%r11 -#define prologue(FUNC,BASE,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ +#define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \  	.global	FUNC;			\  	.type	FUNC,@function;		\  	.align	8;			\  FUNC:	movq	r1,r2;			\  	movq	r3,r4;			\ -	leaq	BASE+52(r8),r9;		\ +	leaq	BASE+KEY+52(r8),r9;	\  	movq	r10,r11;		\  	movl	(r7),r5 ## E;		\  	movl	4(r7),r1 ## E;		\  	movl	8(r7),r6 ## E;		\  	movl	12(r7),r7 ## E;		\ -	movl	(r8),r10 ## E;		\ +	movl	BASE(r8),r10 ## E;	\  	xorl	-48(r9),r5 ## E;	\  	xorl	-44(r9),r1 ## E;	\  	xorl	-40(r9),r6 ## E;	\ @@ -128,8 +132,8 @@ FUNC:	movq	r1,r2;			\  	movl	r3 ## E,r1 ## E;	\  	movl	r4 ## E,r2 ## E; -#define entry(FUNC,BASE,B128,B192) \ -	prologue(FUNC,BASE,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) +#define entry(FUNC,KEY,B128,B192) \ +	prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)  #define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11) @@ -147,9 +151,9 @@ FUNC:	movq	r1,r2;			\  #define decrypt_final(TAB,OFFSET) \  	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) -/* void aes_encrypt(void *ctx, u8 *out, const u8 *in) */ +/* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ -	entry(aes_encrypt,0,enc128,enc192) +	entry(aes_enc_blk,0,enc128,enc192)  	encrypt_round(aes_ft_tab,-96)  	encrypt_round(aes_ft_tab,-80)  enc192:	encrypt_round(aes_ft_tab,-64) @@ -166,9 +170,9 @@ enc128:	encrypt_round(aes_ft_tab,-32)  	encrypt_final(aes_fl_tab,112)  	return -/* void aes_decrypt(void *ctx, u8 *out, const u8 *in) */ +/* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ -	entry(aes_decrypt,240,dec128,dec192) +	entry(aes_dec_blk,240,dec128,dec192)  	decrypt_round(aes_it_tab,-96)  	decrypt_round(aes_it_tab,-80)  dec192:	decrypt_round(aes_it_tab,-64) diff --git a/arch/x86_64/crypto/aes.c b/arch/x86_64/crypto/aes.c index 6f77e7700d32..68866fab37aa 100644 --- a/arch/x86_64/crypto/aes.c +++ b/arch/x86_64/crypto/aes.c @@ -227,10 +227,10 
@@ static void __init gen_tabs(void)  	t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t;	\  } -static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len, -		       u32 *flags) +static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, +		       unsigned int key_len, u32 *flags)  { -	struct aes_ctx *ctx = ctx_arg; +	struct aes_ctx *ctx = crypto_tfm_ctx(tfm);  	const __le32 *key = (const __le32 *)in_key;  	u32 i, j, t, u, v, w; @@ -283,8 +283,18 @@ static int aes_set_key(void *ctx_arg, const u8 *in_key, unsigned int key_len,  	return 0;  } -extern void aes_encrypt(void *ctx_arg, u8 *out, const u8 *in); -extern void aes_decrypt(void *ctx_arg, u8 *out, const u8 *in); +asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); +asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in); + +static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ +	aes_enc_blk(tfm, dst, src); +} + +static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ +	aes_dec_blk(tfm, dst, src); +}  static struct crypto_alg aes_alg = {  	.cra_name		=	"aes", diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 69db0c0721d1..e69d403949c8 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@  #  # Automatically generated make config: don't edit -# Linux kernel version: 2.6.17-rc1-git11 -# Sun Apr 16 07:22:36 2006 +# Linux kernel version: 2.6.17-git6 +# Sat Jun 24 00:52:28 2006  #  CONFIG_X86_64=y  CONFIG_64BIT=y @@ -42,7 +42,6 @@ CONFIG_IKCONFIG_PROC=y  # CONFIG_RELAY is not set  CONFIG_INITRAMFS_SOURCE=""  CONFIG_UID16=y -CONFIG_VM86=y  CONFIG_CC_OPTIMIZE_FOR_SIZE=y  # CONFIG_EMBEDDED is not set  CONFIG_KALLSYMS=y @@ -57,7 +56,6 @@ CONFIG_FUTEX=y  CONFIG_EPOLL=y  CONFIG_SHMEM=y  CONFIG_SLAB=y -CONFIG_DOUBLEFAULT=y  # CONFIG_TINY_SHMEM is not set  CONFIG_BASE_SMALL=0  # CONFIG_SLOB is not set @@ -144,7 +142,8 @@ CONFIG_NR_CPUS=32  CONFIG_HOTPLUG_CPU=y  CONFIG_HPET_TIMER=y  CONFIG_HPET_EMULATE_RTC=y -CONFIG_GART_IOMMU=y +CONFIG_IOMMU=y +# CONFIG_CALGARY_IOMMU is not set  CONFIG_SWIOTLB=y  CONFIG_X86_MCE=y  CONFIG_X86_MCE_INTEL=y @@ -158,6 +157,7 @@ CONFIG_HZ_250=y  # CONFIG_HZ_1000 is not set  CONFIG_HZ=250  # CONFIG_REORDER is not set +CONFIG_K8_NB=y  CONFIG_GENERIC_HARDIRQS=y  CONFIG_GENERIC_IRQ_PROBE=y  CONFIG_ISA_DMA_API=y @@ -293,6 +293,8 @@ CONFIG_IP_PNP_DHCP=y  # CONFIG_INET_IPCOMP is not set  # CONFIG_INET_XFRM_TUNNEL is not set  # CONFIG_INET_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set  CONFIG_INET_DIAG=y  CONFIG_INET_TCP_DIAG=y  # CONFIG_TCP_CONG_ADVANCED is not set @@ -305,7 +307,10 @@ CONFIG_IPV6=y  # CONFIG_INET6_IPCOMP is not set  # CONFIG_INET6_XFRM_TUNNEL is not set  # CONFIG_INET6_TUNNEL is not set +# CONFIG_INET6_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET6_XFRM_MODE_TUNNEL is not set  # CONFIG_IPV6_TUNNEL is not set +# CONFIG_NETWORK_SECMARK is not set  # CONFIG_NETFILTER is not set  # @@ -344,6 +349,7 @@ CONFIG_IPV6=y  # Network testing  #  # CONFIG_NET_PKTGEN is not set +# CONFIG_NET_TCPPROBE is not set  # CONFIG_HAMRADIO is not set  # CONFIG_IRDA is not set  # CONFIG_BT is not set @@ -360,6 +366,7 @@ CONFIG_STANDALONE=y  CONFIG_PREVENT_FIRMWARE_BUILD=y  CONFIG_FW_LOADER=y  # CONFIG_DEBUG_DRIVER is not set +# CONFIG_SYS_HYPERVISOR is not set  #  # Connector - unified userspace <-> kernelspace linker @@ -526,6 +533,7 @@ CONFIG_SCSI_ATA_PIIX=y  # CONFIG_SCSI_SATA_MV is not set  CONFIG_SCSI_SATA_NV=y  # 
CONFIG_SCSI_PDC_ADMA is not set +# CONFIG_SCSI_HPTIOP is not set  # CONFIG_SCSI_SATA_QSTOR is not set  # CONFIG_SCSI_SATA_PROMISE is not set  # CONFIG_SCSI_SATA_SX4 is not set @@ -591,10 +599,7 @@ CONFIG_IEEE1394=y  #  # Device Drivers  # - -# -# Texas Instruments PCILynx requires I2C -# +# CONFIG_IEEE1394_PCILYNX is not set  CONFIG_IEEE1394_OHCI1394=y  # @@ -645,7 +650,16 @@ CONFIG_VORTEX=y  #  # Tulip family network device support  # -# CONFIG_NET_TULIP is not set +CONFIG_NET_TULIP=y +# CONFIG_DE2104X is not set +CONFIG_TULIP=y +# CONFIG_TULIP_MWI is not set +# CONFIG_TULIP_MMIO is not set +# CONFIG_TULIP_NAPI is not set +# CONFIG_DE4X5 is not set +# CONFIG_WINBOND_840 is not set +# CONFIG_DM9102 is not set +# CONFIG_ULI526X is not set  # CONFIG_HP100 is not set  CONFIG_NET_PCI=y  # CONFIG_PCNET32 is not set @@ -697,6 +711,7 @@ CONFIG_TIGON3=y  # CONFIG_IXGB is not set  CONFIG_S2IO=m  # CONFIG_S2IO_NAPI is not set +# CONFIG_MYRI10GE is not set  #  # Token Ring devices @@ -887,7 +902,56 @@ CONFIG_HPET_MMAP=y  #  # I2C support  # -# CONFIG_I2C is not set +CONFIG_I2C=m +CONFIG_I2C_CHARDEV=m + +# +# I2C Algorithms +# +# CONFIG_I2C_ALGOBIT is not set +# CONFIG_I2C_ALGOPCF is not set +# CONFIG_I2C_ALGOPCA is not set + +# +# I2C Hardware Bus support +# +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI1563 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_AMD8111 is not set +# CONFIG_I2C_I801 is not set +# CONFIG_I2C_I810 is not set +# CONFIG_I2C_PIIX4 is not set +CONFIG_I2C_ISA=m +# CONFIG_I2C_NFORCE2 is not set +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_PROSAVAGE is not set +# CONFIG_I2C_SAVAGE4 is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set +# CONFIG_I2C_SIS96X is not set +# CONFIG_I2C_STUB is not set +# CONFIG_I2C_VIA is not set +# CONFIG_I2C_VIAPRO is not set +# CONFIG_I2C_VOODOO3 is not set +# CONFIG_I2C_PCA_ISA is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_SENSORS_DS1337 is not set +# CONFIG_SENSORS_DS1374 is not set +# CONFIG_SENSORS_EEPROM is not set +# CONFIG_SENSORS_PCF8574 is not set +# CONFIG_SENSORS_PCA9539 is not set +# CONFIG_SENSORS_PCF8591 is not set +# CONFIG_SENSORS_MAX6875 is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set  #  # SPI support @@ -898,14 +962,51 @@ CONFIG_HPET_MMAP=y  #  # Dallas's 1-wire bus  # -# CONFIG_W1 is not set  #  # Hardware Monitoring support  #  CONFIG_HWMON=y  # CONFIG_HWMON_VID is not set +# CONFIG_SENSORS_ABITUGURU is not set +# CONFIG_SENSORS_ADM1021 is not set +# CONFIG_SENSORS_ADM1025 is not set +# CONFIG_SENSORS_ADM1026 is not set +# CONFIG_SENSORS_ADM1031 is not set +# CONFIG_SENSORS_ADM9240 is not set +# CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATXP1 is not set +# CONFIG_SENSORS_DS1621 is not set  # CONFIG_SENSORS_F71805F is not set +# CONFIG_SENSORS_FSCHER is not set +# CONFIG_SENSORS_FSCPOS is not set +# CONFIG_SENSORS_GL518SM is not set +# CONFIG_SENSORS_GL520SM is not set +# CONFIG_SENSORS_IT87 is not set +# CONFIG_SENSORS_LM63 is not set +# CONFIG_SENSORS_LM75 is not set +# CONFIG_SENSORS_LM77 is not set +# CONFIG_SENSORS_LM78 is not set +# CONFIG_SENSORS_LM80 is not set +# CONFIG_SENSORS_LM83 is not set +# CONFIG_SENSORS_LM85 is not set +# CONFIG_SENSORS_LM87 is not set +# CONFIG_SENSORS_LM90 is not set +# CONFIG_SENSORS_LM92 is not set +# CONFIG_SENSORS_MAX1619 is not set +# CONFIG_SENSORS_PC87360 
is not set +# CONFIG_SENSORS_SIS5595 is not set +# CONFIG_SENSORS_SMSC47M1 is not set +# CONFIG_SENSORS_SMSC47M192 is not set +CONFIG_SENSORS_SMSC47B397=m +# CONFIG_SENSORS_VIA686A is not set +# CONFIG_SENSORS_VT8231 is not set +# CONFIG_SENSORS_W83781D is not set +# CONFIG_SENSORS_W83791D is not set +# CONFIG_SENSORS_W83792D is not set +# CONFIG_SENSORS_W83L785TS is not set +# CONFIG_SENSORS_W83627HF is not set +# CONFIG_SENSORS_W83627EHF is not set  # CONFIG_SENSORS_HDAPS is not set  # CONFIG_HWMON_DEBUG_CHIP is not set @@ -918,6 +1019,7 @@ CONFIG_HWMON=y  # Multimedia devices  #  # CONFIG_VIDEO_DEV is not set +CONFIG_VIDEO_V4L2=y  #  # Digital Video Broadcasting Devices @@ -953,28 +1055,17 @@ CONFIG_SOUND=y  # Open Sound System  #  CONFIG_SOUND_PRIME=y -CONFIG_OBSOLETE_OSS_DRIVER=y  # CONFIG_SOUND_BT878 is not set -# CONFIG_SOUND_CMPCI is not set  # CONFIG_SOUND_EMU10K1 is not set  # CONFIG_SOUND_FUSION is not set -# CONFIG_SOUND_CS4281 is not set -# CONFIG_SOUND_ES1370 is not set  # CONFIG_SOUND_ES1371 is not set -# CONFIG_SOUND_ESSSOLO1 is not set -# CONFIG_SOUND_MAESTRO is not set -# CONFIG_SOUND_MAESTRO3 is not set  CONFIG_SOUND_ICH=y -# CONFIG_SOUND_SONICVIBES is not set  # CONFIG_SOUND_TRIDENT is not set  # CONFIG_SOUND_MSNDCLAS is not set  # CONFIG_SOUND_MSNDPIN is not set  # CONFIG_SOUND_VIA82CXXX is not set  # CONFIG_SOUND_OSS is not set -# CONFIG_SOUND_ALI5455 is not set -# CONFIG_SOUND_FORTE is not set -# CONFIG_SOUND_RME96XX is not set -# CONFIG_SOUND_AD1980 is not set +# CONFIG_SOUND_TVMIXER is not set  #  # USB support @@ -1000,6 +1091,7 @@ CONFIG_USB_DEVICEFS=y  CONFIG_USB_EHCI_HCD=y  # CONFIG_USB_EHCI_SPLIT_ISO is not set  # CONFIG_USB_EHCI_ROOT_HUB_TT is not set +# CONFIG_USB_EHCI_TT_NEWSCHED is not set  # CONFIG_USB_ISP116X_HCD is not set  CONFIG_USB_OHCI_HCD=y  # CONFIG_USB_OHCI_BIG_ENDIAN is not set @@ -1089,10 +1181,12 @@ CONFIG_USB_MON=y  # CONFIG_USB_LEGOTOWER is not set  # CONFIG_USB_LCD is not set  # CONFIG_USB_LED is not set +# CONFIG_USB_CY7C63 is not set  # CONFIG_USB_CYTHERM is not set  # CONFIG_USB_PHIDGETKIT is not set  # CONFIG_USB_PHIDGETSERVO is not set  # CONFIG_USB_IDMOUSE is not set +# CONFIG_USB_APPLEDISPLAY is not set  # CONFIG_USB_SISUSBVGA is not set  # CONFIG_USB_LD is not set  # CONFIG_USB_TEST is not set @@ -1141,6 +1235,19 @@ CONFIG_USB_MON=y  # CONFIG_RTC_CLASS is not set  # +# DMA Engine support +# +# CONFIG_DMA_ENGINE is not set + +# +# DMA Clients +# + +# +# DMA Devices +# + +#  # Firmware Drivers  #  # CONFIG_EDD is not set @@ -1175,6 +1282,7 @@ CONFIG_FS_POSIX_ACL=y  # CONFIG_MINIX_FS is not set  # CONFIG_ROMFS_FS is not set  CONFIG_INOTIFY=y +CONFIG_INOTIFY_USER=y  # CONFIG_QUOTA is not set  CONFIG_DNOTIFY=y  CONFIG_AUTOFS_FS=y @@ -1331,7 +1439,8 @@ CONFIG_DETECT_SOFTLOCKUP=y  CONFIG_DEBUG_FS=y  # CONFIG_DEBUG_VM is not set  # CONFIG_FRAME_POINTER is not set -# CONFIG_UNWIND_INFO is not set +CONFIG_UNWIND_INFO=y +CONFIG_STACK_UNWIND=y  # CONFIG_FORCED_INLINING is not set  # CONFIG_RCU_TORTURE_TEST is not set  # CONFIG_DEBUG_RODATA is not set diff --git a/arch/x86_64/ia32/fpu32.c b/arch/x86_64/ia32/fpu32.c index 1c23095f1813..2c8209a3605a 100644 --- a/arch/x86_64/ia32/fpu32.c +++ b/arch/x86_64/ia32/fpu32.c @@ -2,7 +2,6 @@   * Copyright 2002 Andi Kleen, SuSE Labs.   * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes.   * This is used for ptrace, signals and coredumps in 32bit emulation. 
- * $Id: fpu32.c,v 1.1 2002/03/21 14:16:32 ak Exp $   */   #include <linux/sched.h> diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index e0a92439f634..25e5ca22204c 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -6,8 +6,6 @@   *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson   *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes   *  2000-12-*   x86-64 compatibility mode signal handling by Andi Kleen - *  - *  $Id: ia32_signal.c,v 1.22 2002/07/29 10:34:03 ak Exp $   */  #include <linux/sched.h> diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 4ec594ab1a98..c536fa98ea37 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -155,6 +155,7 @@ sysenter_tracesys:  	.previous  	jmp	sysenter_do_call  	CFI_ENDPROC +ENDPROC(ia32_sysenter_target)  /*   * 32bit SYSCALL instruction entry. @@ -178,7 +179,7 @@ sysenter_tracesys:   */ 	  ENTRY(ia32_cstar_target)  	CFI_STARTPROC32	simple -	CFI_DEF_CFA	rsp,0 +	CFI_DEF_CFA	rsp,PDA_STACKOFFSET  	CFI_REGISTER	rip,rcx  	/*CFI_REGISTER	rflags,r11*/  	swapgs @@ -249,6 +250,7 @@ cstar_tracesys:  	.quad 1b,ia32_badarg  	.previous  	jmp cstar_do_call +END(ia32_cstar_target)  ia32_badarg:  	movq $-EFAULT,%rax @@ -314,16 +316,13 @@ ia32_tracesys:  	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */  	RESTORE_REST  	jmp ia32_do_syscall +END(ia32_syscall)  ia32_badsys:  	movq $0,ORIG_RAX-ARGOFFSET(%rsp)  	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)  	jmp int_ret_from_sys_call -ni_syscall: -	movq %rax,%rdi -	jmp  sys32_ni_syscall			 -  quiet_ni_syscall:  	movq $-ENOSYS,%rax  	ret @@ -370,10 +369,10 @@ ENTRY(ia32_ptregs_common)  	RESTORE_REST  	jmp  ia32_sysret	/* misbalances the return cache */  	CFI_ENDPROC +END(ia32_ptregs_common)  	.section .rodata,"a"  	.align 8 -	.globl ia32_sys_call_table  ia32_sys_call_table:  	.quad sys_restart_syscall  	.quad sys_exit diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index 23a4515a73b4..a590b7a0d92d 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -7,8 +7,6 @@   *    * This allows to access 64bit processes too; but there is no way to see the extended    * register contents. - * - * $Id: ptrace32.c,v 1.16 2003/03/14 16:06:35 ak Exp $   */   #include <linux/kernel.h> @@ -27,6 +25,7 @@  #include <asm/debugreg.h>  #include <asm/i387.h>  #include <asm/fpu32.h> +#include <asm/ia32.h>  /*   * Determines which flags the user has access to [1 = access, 0 = no access]. 
@@ -199,6 +198,24 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val)  #undef R32 +static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) +{ +	int ret; +	compat_siginfo_t *si32 = (compat_siginfo_t *)compat_ptr(data); +	siginfo_t *si = compat_alloc_user_space(sizeof(siginfo_t)); +	if (request == PTRACE_SETSIGINFO) { +		ret = copy_siginfo_from_user32(si, si32); +		if (ret) +			return ret; +	} +	ret = sys_ptrace(request, pid, addr, (unsigned long)si); +	if (ret) +		return ret; +	if (request == PTRACE_GETSIGINFO) +		ret = copy_siginfo_to_user32(si32, si); +	return ret; +} +  asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)  {  	struct task_struct *child; @@ -208,9 +225,19 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)  	__u32 val;  	switch (request) {  -	default: +	case PTRACE_TRACEME: +	case PTRACE_ATTACH: +	case PTRACE_KILL: +	case PTRACE_CONT: +	case PTRACE_SINGLESTEP: +	case PTRACE_DETACH: +	case PTRACE_SYSCALL: +	case PTRACE_SETOPTIONS:  		return sys_ptrace(request, pid, addr, data);  +	default: +		return -EINVAL; +  	case PTRACE_PEEKTEXT:  	case PTRACE_PEEKDATA:  	case PTRACE_POKEDATA: @@ -225,10 +252,11 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)  	case PTRACE_GETFPXREGS:  	case PTRACE_GETEVENTMSG:  		break; -	}  -	if (request == PTRACE_TRACEME) -		return ptrace_traceme(); +	case PTRACE_SETSIGINFO: +	case PTRACE_GETSIGINFO: +		return ptrace32_siginfo(request, pid, addr, data); +	}  	child = ptrace_get_task_struct(pid);  	if (IS_ERR(child)) @@ -349,8 +377,7 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)  		break;  	default: -		ret = -EINVAL; -		break; +		BUG();  	}   out: diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index f182b20858e2..dc88154c412b 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -508,19 +508,6 @@ sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options)  	return compat_sys_wait4(pid, stat_addr, options, NULL);  } -int sys32_ni_syscall(int call) -{  -	struct task_struct *me = current; -	static char lastcomm[sizeof(me->comm)]; - -	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { -		printk(KERN_INFO "IA32 syscall %d from %s not implemented\n", -		       call, me->comm); -		strncpy(lastcomm, me->comm, sizeof(lastcomm)); -	}  -	return -ENOSYS;	        -}  -  /* 32-bit timeval and related flotsam.  
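The new ptrace32_siginfo() above follows the usual compat round-trip: stage a native siginfo_t on the user stack with compat_alloc_user_space(), convert 32-to-64 before the native call (PTRACE_SETSIGINFO) and 64-to-32 after it (PTRACE_GETSIGINFO). The GETSIGINFO direction in isolation, as a hedged sketch of the same pattern:

	siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
	long ret = sys_ptrace(PTRACE_GETSIGINFO, pid, addr, (unsigned long)si);

	if (!ret)
		ret = copy_siginfo_to_user32(compat_ptr(data), si);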
*/  asmlinkage long @@ -916,7 +903,7 @@ long sys32_vm86_warning(void)  	struct task_struct *me = current;  	static char lastcomm[sizeof(me->comm)];  	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { -		printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", +		compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n",  		       me->comm);  		strncpy(lastcomm, me->comm, sizeof(lastcomm));  	}  @@ -929,13 +916,3 @@ long sys32_lookup_dcookie(u32 addr_low, u32 addr_high,  	return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len);  } -static int __init ia32_init (void) -{ -	printk("IA32 emulation $Id: sys_ia32.c,v 1.32 2002/03/24 13:02:28 ak Exp $\n");   -	return 0; -} - -__initcall(ia32_init); - -extern unsigned long ia32_sys_call_table[]; -EXPORT_SYMBOL(ia32_sys_call_table); diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 059c88313f4e..aeb9c560be88 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y	:= process.o signal.o entry.o traps.o irq.o \  		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \  		x8664_ksyms.o i387.o syscall.o vsyscall.o \  		setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ -		pci-dma.o pci-nommu.o +		pci-dma.o pci-nommu.o alternative.o  obj-$(CONFIG_X86_MCE)         += mce.o  obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o @@ -28,11 +28,13 @@ obj-$(CONFIG_PM)		+= suspend.o  obj-$(CONFIG_SOFTWARE_SUSPEND)	+= suspend_asm.o  obj-$(CONFIG_CPU_FREQ)		+= cpufreq/  obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o -obj-$(CONFIG_GART_IOMMU)	+= pci-gart.o aperture.o +obj-$(CONFIG_IOMMU)		+= pci-gart.o aperture.o +obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary.o tce.o  obj-$(CONFIG_SWIOTLB)		+= pci-swiotlb.o  obj-$(CONFIG_KPROBES)		+= kprobes.o  obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer.o  obj-$(CONFIG_X86_VSMP)		+= vsmp.o +obj-$(CONFIG_K8_NB)		+= k8.o  obj-$(CONFIG_MODULES)		+= module.o @@ -49,3 +51,5 @@ intel_cacheinfo-y		+= ../../i386/kernel/cpu/intel_cacheinfo.o  quirks-y			+= ../../i386/kernel/quirks.o  i8237-y				+= ../../i386/kernel/i8237.o  msr-$(subst m,y,$(CONFIG_X86_MSR))  += ../../i386/kernel/msr.o +alternative-y			+= ../../i386/kernel/alternative.o + diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index 70b9d21ed675..a195ef06ec55 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -8,7 +8,6 @@   * because only the bootmem allocator can allocate 32+MB.    *    * Copyright 2002 Andi Kleen, SuSE Labs. - * $Id: aperture.c,v 1.7 2003/08/01 03:36:18 ak Exp $   */  #include <linux/config.h>  #include <linux/kernel.h> @@ -24,6 +23,7 @@  #include <asm/proto.h>  #include <asm/pci-direct.h>  #include <asm/dma.h> +#include <asm/k8.h>  int iommu_aperture;  int iommu_aperture_disabled __initdata = 0; @@ -37,8 +37,6 @@ int fix_aperture __initdata = 1;  /* This code runs before the PCI subsystem is initialized, so just     access the northbridge directly. 
*/ -#define NB_ID_3 (PCI_VENDOR_ID_AMD | (0x1103<<16)) -  static u32 __init allocate_aperture(void)   {  	pg_data_t *nd0 = NODE_DATA(0); @@ -68,20 +66,20 @@ static u32 __init allocate_aperture(void)  	return (u32)__pa(p);   } -static int __init aperture_valid(char *name, u64 aper_base, u32 aper_size)  +static int __init aperture_valid(u64 aper_base, u32 aper_size)  {   	if (!aper_base)   		return 0;  	if (aper_size < 64*1024*1024) {  -		printk("Aperture from %s too small (%d MB)\n", name, aper_size>>20);  +		printk("Aperture too small (%d MB)\n", aper_size>>20);  		return 0;  	}  	if (aper_base + aper_size >= 0xffffffff) {  -		printk("Aperture from %s beyond 4GB. Ignoring.\n",name); +		printk("Aperture beyond 4GB. Ignoring.\n");  		return 0;   	}  	if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { -		printk("Aperture from %s pointing to e820 RAM. Ignoring.\n",name); +		printk("Aperture pointing to e820 RAM. Ignoring.\n");  		return 0;   	}   	return 1; @@ -140,7 +138,7 @@ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order)  	printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n",   	       aper, 32 << *order, apsizereg); -	if (!aperture_valid("AGP bridge", aper, (32*1024*1024) << *order)) +	if (!aperture_valid(aper, (32*1024*1024) << *order))  	    return 0;  	return (u32)aper;   }  @@ -208,10 +206,10 @@ void __init iommu_hole_init(void)  	fix = 0;  	for (num = 24; num < 32; num++) {		 -		char name[30]; -		if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)  -			continue;	 +		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) +			continue; +		iommu_detected = 1;  		iommu_aperture = 1;   		aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7;  @@ -222,9 +220,7 @@ void __init iommu_hole_init(void)  		printk("CPU %d: aperture @ %Lx size %u MB\n", num-24,   		       aper_base, aper_size>>20); -		sprintf(name, "northbridge cpu %d", num-24);  - -		if (!aperture_valid(name, aper_base, aper_size)) {  +		if (!aperture_valid(aper_base, aper_size)) {  			fix = 1;   			break;   		} @@ -273,7 +269,7 @@ void __init iommu_hole_init(void)  	/* Fix up the north bridges */  	for (num = 24; num < 32; num++) { 		 -		if (read_pci_config(0, num, 3, 0x00) != NB_ID_3)  +		if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00)))  			continue;	  		/* Don't enable translation yet. That is done later.  diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 29ef99001e05..b2ead91df218 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -100,7 +100,7 @@ void clear_local_APIC(void)  	maxlvt = get_maxlvt();  	/* -	 * Masking an LVT entry on a P6 can trigger a local APIC error +	 * Masking an LVT entry can trigger a local APIC error  	 * if the vector is zero. Mask LVTERR first to prevent this.  	 */  	if (maxlvt >= 3) { @@ -851,7 +851,18 @@ void disable_APIC_timer(void)  		unsigned long v;  		v = apic_read(APIC_LVTT); -		apic_write(APIC_LVTT, v | APIC_LVT_MASKED); +		/* +		 * When an illegal vector value (0-15) is written to an LVT +		 * entry and delivery mode is Fixed, the APIC may signal an +		 * illegal vector error, with out regard to whether the mask +		 * bit is set or whether an interrupt is actually seen on input. +		 * +		 * Boot sequence might call this function when the LVTT has +		 * '0' vector value. So make sure vector field is set to +		 * valid value. 
+		 */ +		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); +		apic_write(APIC_LVTT, v);  	}  } @@ -909,15 +920,13 @@ int setup_profiling_timer(unsigned int multiplier)  	return -EINVAL;  } -#ifdef CONFIG_X86_MCE_AMD -void setup_threshold_lvt(unsigned long lvt_off) +void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, +			    unsigned char msg_type, unsigned char mask)  { -	unsigned int v = 0; -	unsigned long reg = (lvt_off << 4) + 0x500; -	v |= THRESHOLD_APIC_VECTOR; +	unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; +	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;  	apic_write(reg, v);  } -#endif /* CONFIG_X86_MCE_AMD */  #undef APIC_DIVISOR @@ -983,7 +992,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)  }  /* - * oem_force_hpet_timer -- force HPET mode for some boxes. + * apic_is_clustered_box() -- Check if we can expect good TSC   *   * Thus far, the major user of this is IBM's Summit2 series:   * @@ -991,7 +1000,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)   * multi-chassis. Use available data to take a good guess.   * If in doubt, go HPET.   */ -__cpuinit int oem_force_hpet_timer(void) +__cpuinit int apic_is_clustered_box(void)  {  	int i, clusters, zeros;  	unsigned id; @@ -1022,8 +1031,7 @@ __cpuinit int oem_force_hpet_timer(void)  	}  	/* -	 * If clusters > 2, then should be multi-chassis.  Return 1 for HPET. -	 * Else return 0 to use TSC. +	 * If clusters > 2, then should be multi-chassis.  	 * May have to revisit this when multi-core + hyperthreaded CPUs come  	 * out, but AFAIK this will work even for them.  	 */ diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c index 38834bbbae11..96687e2beb2c 100644 --- a/arch/x86_64/kernel/asm-offsets.c +++ b/arch/x86_64/kernel/asm-offsets.c @@ -4,6 +4,7 @@   * and format the required data.   */ +#include <linux/crypto.h>  #include <linux/sched.h>   #include <linux/stddef.h>  #include <linux/errno.h>  @@ -68,5 +69,7 @@ int main(void)  	DEFINE(pbe_next, offsetof(struct pbe, next));  	BLANK();  	DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); +	BLANK(); +	DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));  	return 0;  } diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 15beddc7fdf5..d8d5750d6106 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -111,14 +111,14 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)  	atomic_dec(&waiting_for_crash_ipi);  	/* Assume hlt works */  	for(;;) -		asm("hlt"); +		halt();  	return 1;  }  static void smp_send_nmi_allbutself(void)  { -	send_IPI_allbutself(APIC_DM_NMI); +	send_IPI_allbutself(NMI_VECTOR);  }  /* diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 1ef6028f721e..9e94d834624b 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -1,7 +1,6 @@  /*    * Handle the memory map.   * The functions here do the job until bootmem takes over. - * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $   *   *  Getting sanitize_e820_map() in sync with i386 version by applying change:   *  -  Provisions for empty E820 memory regions (reported by certain BIOSes). 
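The disable_APIC_timer() fix above generalizes to any LVT entry: when masking, keep (or install) a vector above 15 so the write cannot raise an illegal-vector APIC error. A minimal sketch under that assumption (the helper name is ours):

/* mask an LVT entry without tripping the illegal-vector check */
static void lvt_mask_safely(unsigned long lvt_reg, unsigned int vector)
{
	unsigned int v = apic_read(lvt_reg);

	v &= ~APIC_VECTOR_MASK;		/* the old vector may be 0 */
	v |= APIC_LVT_MASKED | vector;	/* valid vector plus mask bit */
	apic_write(lvt_reg, v);
}

Likewise, the value setup_APIC_extened_lvt() builds is (mask << 16) | (msg_type << 8) | vector, written at offset K8_APIC_EXT_LVT_BASE + (lvt_off << 4). The threshold code later in this patch calls it as setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, THRESHOLD_APIC_VECTOR, K8_APIC_EXT_INT_MSG_FIX, 0), i.e. fixed delivery, unmasked.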
@@ -621,6 +620,7 @@ void __init parse_memmapopt(char *p, char **from)  }  unsigned long pci_mem_start = 0xaeedbabe; +EXPORT_SYMBOL(pci_mem_start);  /*   * Search for the biggest gap in the low 32 bits of the e820 diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 586b34c00c48..7290e72b9a34 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -154,6 +154,7 @@ rff_trace:  	GET_THREAD_INFO(%rcx)	  	jmp rff_action  	CFI_ENDPROC +END(ret_from_fork)  /*   * System call entry. Upto 6 arguments in registers are supported. @@ -188,7 +189,7 @@ rff_trace:  ENTRY(system_call)  	CFI_STARTPROC	simple -	CFI_DEF_CFA	rsp,0 +	CFI_DEF_CFA	rsp,PDA_STACKOFFSET  	CFI_REGISTER	rip,rcx  	/*CFI_REGISTER	rflags,r11*/  	swapgs @@ -285,6 +286,7 @@ tracesys:  	/* Use IRET because user could have changed frame */  	jmp int_ret_from_sys_call  	CFI_ENDPROC +END(system_call)  /*    * Syscall return path ending with IRET. @@ -364,6 +366,7 @@ int_restore_rest:  	cli  	jmp int_with_check  	CFI_ENDPROC +END(int_ret_from_sys_call)  /*    * Certain special system calls that need to save a complete full stack frame. @@ -375,6 +378,7 @@ int_restore_rest:  	leaq	\func(%rip),%rax  	leaq    -ARGOFFSET+8(%rsp),\arg /* 8 for return address */  	jmp	ptregscall_common +END(\label)  	.endm  	CFI_STARTPROC @@ -404,6 +408,7 @@ ENTRY(ptregscall_common)  	CFI_REL_OFFSET rip, 0  	ret  	CFI_ENDPROC +END(ptregscall_common)  ENTRY(stub_execve)  	CFI_STARTPROC @@ -418,6 +423,7 @@ ENTRY(stub_execve)  	RESTORE_REST  	jmp int_ret_from_sys_call  	CFI_ENDPROC +END(stub_execve)  /*   * sigreturn is special because it needs to restore all registers on return. @@ -435,6 +441,7 @@ ENTRY(stub_rt_sigreturn)  	RESTORE_REST  	jmp int_ret_from_sys_call  	CFI_ENDPROC +END(stub_rt_sigreturn)  /*   * initial frame state for interrupts and exceptions @@ -466,29 +473,18 @@ ENTRY(stub_rt_sigreturn)  /* 0(%rsp): interrupt number */   	.macro interrupt func  	cld -#ifdef CONFIG_DEBUG_INFO -	SAVE_ALL	 -	movq %rsp,%rdi -	/* -	 * Setup a stack frame pointer.  This allows gdb to trace -	 * back to the original stack. -	 */ -	movq %rsp,%rbp -	CFI_DEF_CFA_REGISTER	rbp -#else		  	SAVE_ARGS  	leaq -ARGOFFSET(%rsp),%rdi	# arg1 for handler -#endif	 +	pushq %rbp +	CFI_ADJUST_CFA_OFFSET	8 +	CFI_REL_OFFSET		rbp, 0 +	movq %rsp,%rbp +	CFI_DEF_CFA_REGISTER	rbp  	testl $3,CS(%rdi)  	je 1f  	swapgs	  1:	incl	%gs:pda_irqcount	# RED-PEN should check preempt count -	movq %gs:pda_irqstackptr,%rax -	cmoveq %rax,%rsp /*todo This needs CFI annotation! */ -	pushq %rdi			# save old stack	 -#ifndef CONFIG_DEBUG_INFO -	CFI_ADJUST_CFA_OFFSET	8 -#endif +	cmoveq %gs:pda_irqstackptr,%rsp  	call \func  	.endm @@ -497,17 +493,11 @@ ENTRY(common_interrupt)  	interrupt do_IRQ  	/* 0(%rsp): oldrsp-ARGOFFSET */  ret_from_intr: -	popq  %rdi -#ifndef CONFIG_DEBUG_INFO -	CFI_ADJUST_CFA_OFFSET	-8 -#endif  	cli	  	decl %gs:pda_irqcount -#ifdef CONFIG_DEBUG_INFO -	movq RBP(%rdi),%rbp +	leaveq  	CFI_DEF_CFA_REGISTER	rsp -#endif -	leaq ARGOFFSET(%rdi),%rsp /*todo This needs CFI annotation! */ +	CFI_ADJUST_CFA_OFFSET	-8  exit_intr:  	GET_THREAD_INFO(%rcx)  	testl $3,CS-ARGOFFSET(%rsp) @@ -589,7 +579,9 @@ retint_kernel:  	call preempt_schedule_irq  	jmp exit_intr  #endif	 +  	CFI_ENDPROC +END(common_interrupt)  /*   * APIC interrupts. 
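A note on the reworked interrupt macro above: pda->irqcount starts at -1 (it is initialized that way in setup64.c), so the incl leaves zero, and sets ZF, only for the outermost interrupt; the cmoveq then switches to the per-CPU IRQ stack exactly once, and nested interrupts stay on the stack they arrived on. The same logic in C, as a sketch:

#include <asm/pda.h>

/* C rendering of the incl %gs:pda_irqcount / cmoveq pair above */
static unsigned long irq_stack_select(struct x8664_pda *pda,
				      unsigned long sp)
{
	if (++pda->irqcount == 0)	/* outermost interrupt only */
		sp = (unsigned long)pda->irqstackptr;
	return sp;			/* nested: keep current stack */
}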
@@ -605,17 +597,21 @@ retint_kernel:  ENTRY(thermal_interrupt)  	apicinterrupt THERMAL_APIC_VECTOR,smp_thermal_interrupt +END(thermal_interrupt)  ENTRY(threshold_interrupt)  	apicinterrupt THRESHOLD_APIC_VECTOR,mce_threshold_interrupt +END(threshold_interrupt)  #ifdef CONFIG_SMP	  ENTRY(reschedule_interrupt)  	apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt +END(reschedule_interrupt)  	.macro INVALIDATE_ENTRY num  ENTRY(invalidate_interrupt\num)  	apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt	 +END(invalidate_interrupt\num)  	.endm  	INVALIDATE_ENTRY 0 @@ -629,17 +625,21 @@ ENTRY(invalidate_interrupt\num)  ENTRY(call_function_interrupt)  	apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt +END(call_function_interrupt)  #endif  #ifdef CONFIG_X86_LOCAL_APIC	  ENTRY(apic_timer_interrupt)  	apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt +END(apic_timer_interrupt)  ENTRY(error_interrupt)  	apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt +END(error_interrupt)  ENTRY(spurious_interrupt)  	apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt +END(spurious_interrupt)  #endif  /* @@ -777,6 +777,7 @@ error_kernelspace:  	cmpq $gs_change,RIP(%rsp)          je   error_swapgs  	jmp  error_sti +END(error_entry)         /* Reload gs selector with exception handling */         /* edi:  new selector */  @@ -794,6 +795,7 @@ gs_change:  	CFI_ADJUST_CFA_OFFSET -8          ret  	CFI_ENDPROC +ENDPROC(load_gs_index)          .section __ex_table,"a"          .align 8 @@ -847,7 +849,7 @@ ENTRY(kernel_thread)  	UNFAKE_STACK_FRAME  	ret  	CFI_ENDPROC - +ENDPROC(kernel_thread)  child_rip:  	/* @@ -860,6 +862,7 @@ child_rip:  	# exit  	xorl %edi, %edi  	call do_exit +ENDPROC(child_rip)  /*   * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 
@@ -889,19 +892,24 @@ ENTRY(execve)  	UNFAKE_STACK_FRAME  	ret  	CFI_ENDPROC +ENDPROC(execve)  KPROBE_ENTRY(page_fault)  	errorentry do_page_fault +END(page_fault)  	.previous .text  ENTRY(coprocessor_error)  	zeroentry do_coprocessor_error +END(coprocessor_error)  ENTRY(simd_coprocessor_error)  	zeroentry do_simd_coprocessor_error	 +END(simd_coprocessor_error)  ENTRY(device_not_available)  	zeroentry math_state_restore +END(device_not_available)  	/* runs on exception stack */  KPROBE_ENTRY(debug) @@ -911,6 +919,7 @@ KPROBE_ENTRY(debug)  	paranoidentry do_debug, DEBUG_STACK  	jmp paranoid_exit  	CFI_ENDPROC +END(debug)  	.previous .text  	/* runs on exception stack */	 @@ -961,6 +970,7 @@ paranoid_schedule:  	cli  	jmp paranoid_userspace  	CFI_ENDPROC +END(nmi)  	.previous .text  KPROBE_ENTRY(int3) @@ -970,22 +980,28 @@ KPROBE_ENTRY(int3)   	paranoidentry do_int3, DEBUG_STACK   	jmp paranoid_exit   	CFI_ENDPROC +END(int3)  	.previous .text  ENTRY(overflow)  	zeroentry do_overflow +END(overflow)  ENTRY(bounds)  	zeroentry do_bounds +END(bounds)  ENTRY(invalid_op)  	zeroentry do_invalid_op	 +END(invalid_op)  ENTRY(coprocessor_segment_overrun)  	zeroentry do_coprocessor_segment_overrun +END(coprocessor_segment_overrun)  ENTRY(reserved)  	zeroentry do_reserved +END(reserved)  	/* runs on exception stack */  ENTRY(double_fault) @@ -993,12 +1009,15 @@ ENTRY(double_fault)  	paranoidentry do_double_fault  	jmp paranoid_exit  	CFI_ENDPROC +END(double_fault)  ENTRY(invalid_TSS)  	errorentry do_invalid_TSS +END(invalid_TSS)  ENTRY(segment_not_present)  	errorentry do_segment_not_present +END(segment_not_present)  	/* runs on exception stack */  ENTRY(stack_segment) @@ -1006,19 +1025,24 @@ ENTRY(stack_segment)  	paranoidentry do_stack_segment  	jmp paranoid_exit  	CFI_ENDPROC +END(stack_segment)  KPROBE_ENTRY(general_protection)  	errorentry do_general_protection +END(general_protection)  	.previous .text  ENTRY(alignment_check)  	errorentry do_alignment_check +END(alignment_check)  ENTRY(divide_error)  	zeroentry do_divide_error +END(divide_error)  ENTRY(spurious_interrupt_bug)  	zeroentry do_spurious_interrupt_bug +END(spurious_interrupt_bug)  #ifdef CONFIG_X86_MCE  	/* runs on exception stack */ @@ -1029,6 +1053,7 @@ ENTRY(machine_check)  	paranoidentry do_machine_check  	jmp paranoid_exit  	CFI_ENDPROC +END(machine_check)  #endif  ENTRY(call_softirq) @@ -1046,3 +1071,37 @@ ENTRY(call_softirq)  	decl %gs:pda_irqcount  	ret  	CFI_ENDPROC +ENDPROC(call_softirq) + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) +	CFI_STARTPROC +	movq	%r15, R15(%rdi) +	movq	%r14, R14(%rdi) +	xchgq	%rsi, %rdx +	movq	%r13, R13(%rdi) +	movq	%r12, R12(%rdi) +	xorl	%eax, %eax +	movq	%rbp, RBP(%rdi) +	movq	%rbx, RBX(%rdi) +	movq	(%rsp), %rcx +	movq	%rax, R11(%rdi) +	movq	%rax, R10(%rdi) +	movq	%rax, R9(%rdi) +	movq	%rax, R8(%rdi) +	movq	%rax, RAX(%rdi) +	movq	%rax, RCX(%rdi) +	movq	%rax, RDX(%rdi) +	movq	%rax, RSI(%rdi) +	movq	%rax, RDI(%rdi) +	movq	%rax, ORIG_RAX(%rdi) +	movq	%rcx, RIP(%rdi) +	leaq	8(%rsp), %rcx +	movq	$__KERNEL_CS, CS(%rdi) +	movq	%rax, EFLAGS(%rdi) +	movq	%rcx, RSP(%rdi) +	movq	$__KERNEL_DS, SS(%rdi) +	jmpq	*%rdx +	CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 1a2ab825be98..21c7066e236a 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -78,22 +78,29 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)  static void flat_send_IPI_allbutself(int 
vector)  { -#ifndef CONFIG_HOTPLUG_CPU -	if (((num_online_cpus()) - 1) >= 1) -		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); +#ifdef	CONFIG_HOTPLUG_CPU +	int hotplug = 1;  #else -	cpumask_t allbutme = cpu_online_map; +	int hotplug = 0; +#endif +	if (hotplug || vector == NMI_VECTOR) { +		cpumask_t allbutme = cpu_online_map; -	cpu_clear(smp_processor_id(), allbutme); +		cpu_clear(smp_processor_id(), allbutme); -	if (!cpus_empty(allbutme)) -		flat_send_IPI_mask(allbutme, vector); -#endif +		if (!cpus_empty(allbutme)) +			flat_send_IPI_mask(allbutme, vector); +	} else if (num_online_cpus() > 1) { +		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL); +	}  }  static void flat_send_IPI_all(int vector)  { -	__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +	if (vector == NMI_VECTOR) +		flat_send_IPI_mask(cpu_online_map, vector); +	else +		__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);  }  static int flat_apic_id_registered(void) @@ -108,10 +115,7 @@ static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)  static unsigned int phys_pkg_id(int index_msb)  { -	u32 ebx; - -	ebx = cpuid_ebx(1); -	return ((ebx >> 24) & 0xFF) >> index_msb; +	return hard_smp_processor_id() >> index_msb;  }  struct genapic apic_flat =  { diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index cea20a66c150..e6a71c9556d9 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -2,8 +2,6 @@   *  linux/arch/x86_64/kernel/head64.c -- prepare to run common code   *   *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE - * - *  $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $   */  #include <linux/init.h> diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 573990a6d668..86b2c1e197aa 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -44,11 +44,11 @@  	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \  	BI(x,c) BI(x,d) BI(x,e) BI(x,f) -#define BUILD_14_IRQS(x) \ +#define BUILD_15_IRQS(x) \  	BI(x,0) BI(x,1) BI(x,2) BI(x,3) \  	BI(x,4) BI(x,5) BI(x,6) BI(x,7) \  	BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ -	BI(x,c) BI(x,d) +	BI(x,c) BI(x,d) BI(x,e)  /*   * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: @@ -73,13 +73,13 @@ BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)  BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)  #ifdef CONFIG_PCI_MSI -	BUILD_14_IRQS(0xe) +	BUILD_15_IRQS(0xe)  #endif  #endif  #undef BUILD_16_IRQS -#undef BUILD_14_IRQS +#undef BUILD_15_IRQS  #undef BI @@ -92,11 +92,11 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd)  	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \  	IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) -#define IRQLIST_14(x) \ +#define IRQLIST_15(x) \  	IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \  	IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \  	IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ -	IRQ(x,c), IRQ(x,d) +	IRQ(x,c), IRQ(x,d), IRQ(x,e)  void (*interrupt[NR_IRQS])(void) = {  	IRQLIST_16(0x0), @@ -108,7 +108,7 @@ void (*interrupt[NR_IRQS])(void) = {  	IRQLIST_16(0xc), IRQLIST_16(0xd)  #ifdef CONFIG_PCI_MSI -	, IRQLIST_14(0xe) +	, IRQLIST_15(0xe)  #endif  #endif diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 9cc7031b7151..c768d8a036d0 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -41,6 +41,7 @@  #include <asm/mach_apic.h>  #include <asm/acpi.h>  #include <asm/dma.h> +#include <asm/nmi.h>  #define __apicdebuginit  __init @@ -56,6 +57,7 @@ int timer_over_8254 __initdata = 0;  
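Two details in the genapic_flat.c hunk above are worth spelling out. First, NMI IPIs now always go out via the per-CPU mask rather than an APIC shortcut; the crash path earlier in this patch was changed to pass NMI_VECTOR for exactly this reason. Second, phys_pkg_id() now derives the package from hard_smp_processor_id() (the local APIC id) instead of re-executing CPUID. A worked example of the latter, assuming two cores per package (index_msb == 1):

/* hypothetical helper mirroring the new phys_pkg_id() */
static unsigned int pkg_of(unsigned int apicid, int index_msb)
{
	return apicid >> index_msb;
}

/* pkg_of(0, 1) == 0, pkg_of(1, 1) == 0,
 * pkg_of(2, 1) == 1, pkg_of(3, 1) == 1 */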
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };  static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock);  /*   * # of IRQ routing registers @@ -317,7 +319,7 @@ void __init check_ioapic(void)  				vendor &= 0xffff;  				switch (vendor) {   				case PCI_VENDOR_ID_VIA: -#ifdef CONFIG_GART_IOMMU +#ifdef CONFIG_IOMMU  					if ((end_pfn > MAX_DMA32_PFN ||  					     force_iommu) &&  					    !iommu_aperture_allowed) { @@ -834,10 +836,17 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };  int assign_irq_vector(int irq)  {  	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; +	unsigned long flags; +	int vector;  	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); -	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) + +	spin_lock_irqsave(&vector_lock, flags); + +	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { +		spin_unlock_irqrestore(&vector_lock, flags);  		return IO_APIC_VECTOR(irq); +	}  next:  	current_vector += 8;  	if (current_vector == IA32_SYSCALL_VECTOR) @@ -849,11 +858,14 @@ next:  		current_vector = FIRST_DEVICE_VECTOR + offset;  	} -	vector_irq[current_vector] = irq; +	vector = current_vector; +	vector_irq[vector] = irq;  	if (irq != AUTO_ASSIGN) -		IO_APIC_VECTOR(irq) = current_vector; +		IO_APIC_VECTOR(irq) = vector; + +	spin_unlock_irqrestore(&vector_lock, flags); -	return current_vector; +	return vector;  }  extern void (*interrupt[NR_IRQS])(void); @@ -866,21 +878,14 @@ static struct hw_interrupt_type ioapic_edge_type;  static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)  { -	if (use_pci_vector() && !platform_legacy_irq(irq)) { -		if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || -				trigger == IOAPIC_LEVEL) -			irq_desc[vector].handler = &ioapic_level_type; -		else -			irq_desc[vector].handler = &ioapic_edge_type; -		set_intr_gate(vector, interrupt[vector]); -	} else	{ -		if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || -				trigger == IOAPIC_LEVEL) -			irq_desc[irq].handler = &ioapic_level_type; -		else -			irq_desc[irq].handler = &ioapic_edge_type; -		set_intr_gate(vector, interrupt[irq]); -	} +	unsigned idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; + +	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || +			trigger == IOAPIC_LEVEL) +		irq_desc[idx].handler = &ioapic_level_type; +	else +		irq_desc[idx].handler = &ioapic_edge_type; +	set_intr_gate(vector, interrupt[idx]);  }  static void __init setup_IO_APIC_irqs(void) diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index d8bd0b345b1e..59518d4d4358 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -26,6 +26,30 @@ atomic_t irq_mis_count;  #endif  #endif +#ifdef CONFIG_DEBUG_STACKOVERFLOW +/* + * Probabilistic stack overflow check: + * + * Only check the stack in process context, because everything else + * runs on the big interrupt stacks. Checking reliably is too expensive, + * so we just check from interrupts. 
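The assign_irq_vector() change earlier in this hunk is the standard fix for a check-then-allocate race: the lookup of an existing vector and the claim of a new one now happen under one spinlock, so two CPUs can no longer be handed the same vector. Reduced to a sketch with illustrative names:

#include <linux/spinlock.h>

#define NR_SLOTS 16			/* illustrative */

static DEFINE_SPINLOCK(slot_lock);
static int slot_of[NR_SLOTS];
static int next_slot;

static int assign_slot(int id)
{
	unsigned long flags;
	int slot;

	spin_lock_irqsave(&slot_lock, flags);
	if (slot_of[id] > 0)		/* already assigned */
		slot = slot_of[id];
	else				/* claim atomically */
		slot = slot_of[id] = ++next_slot;
	spin_unlock_irqrestore(&slot_lock, flags);
	return slot;
}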
+ */
+static inline void stack_overflow_check(struct pt_regs *regs)
+{
+	u64 curbase = (u64) current->thread_info;
+	static unsigned long warned = -60*HZ;
+
+	if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE &&
+	    regs->rsp <  curbase + sizeof(struct thread_info) + 128 &&
+	    time_after(jiffies, warned + 60*HZ)) {
+		printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n",
+		       current->comm, curbase, regs->rsp);
+		show_stack(NULL,NULL);
+		warned = jiffies;
+	}
+}
+#endif
+
 /*
  * Generic, controller-independent functions:
  */
@@ -39,7 +63,7 @@ int show_interrupts(struct seq_file *p, void *v)
 	if (i == 0) {
 		seq_printf(p, "           ");
 		for_each_online_cpu(j)
-			seq_printf(p, "CPU%d       ",j);
+			seq_printf(p, "CPU%-8d",j);
 		seq_putc(p, '\n');
 	}
@@ -96,7 +120,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 	exit_idle();
 	irq_enter();
-
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	stack_overflow_check(regs);
+#endif
 	__do_IRQ(irq, regs);
 	irq_exit();
diff --git a/arch/x86_64/kernel/k8.c b/arch/x86_64/kernel/k8.c
new file mode 100644
index 000000000000..6416682d33d0
--- /dev/null
+++ b/arch/x86_64/kernel/k8.c
@@ -0,0 +1,118 @@
+/*
+ * Shared support code for AMD K8 northbridges and derivatives.
+ * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
+ */
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <asm/k8.h>
+
+int num_k8_northbridges;
+EXPORT_SYMBOL(num_k8_northbridges);
+
+static u32 *flush_words;
+
+struct pci_device_id k8_nb_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
+	{}
+};
+EXPORT_SYMBOL(k8_nb_ids);
+
+struct pci_dev **k8_northbridges;
+EXPORT_SYMBOL(k8_northbridges);
+
+static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+{
+	do {
+		dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+		if (!dev)
+			break;
+	} while (!pci_match_id(&k8_nb_ids[0], dev));
+	return dev;
+}
+
+int cache_k8_northbridges(void)
+{
+	int i;
+	struct pci_dev *dev;
+	if (num_k8_northbridges)
+		return 0;
+
+	num_k8_northbridges = 0;
+	dev = NULL;
+	while ((dev = next_k8_northbridge(dev)) != NULL)
+		num_k8_northbridges++;
+
+	k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
+				  GFP_KERNEL);
+	if (!k8_northbridges)
+		return -ENOMEM;
+
+	flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
+	if (!flush_words) {
+		kfree(k8_northbridges);
+		return -ENOMEM;
+	}
+
+	dev = NULL;
+	i = 0;
+	while ((dev = next_k8_northbridge(dev)) != NULL) {
+		k8_northbridges[i] = dev;
+		pci_read_config_dword(dev, 0x9c, &flush_words[i]);
+		i++;
+	}
+	k8_northbridges[i] = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+
+/* Ignores subdevice/subvendor but as far as I can figure out
+   they're useless anyway */
+int __init early_is_k8_nb(u32 device)
+{
+	struct pci_device_id *id;
+	u32 vendor = device & 0xffff;
+	device >>= 16;
+	for (id = k8_nb_ids; id->vendor; id++)
+		if (vendor == id->vendor && device == id->device)
+			return 1;
+	return 0;
+}
+
+void k8_flush_garts(void)
+{
+	int flushed, i;
+	unsigned long flags;
+	static DEFINE_SPINLOCK(gart_lock);
+
+	/* Avoid races between AGP and IOMMU. In theory it's not needed
+	   but I'm not sure if the hardware won't lose flush requests
+	   when another is pending. This whole thing is so expensive anyway
+	   that it doesn't matter to serialize more.
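Usage of the new k8.c interface, as a caller such as the GART code would consume it: populate the cache once, then walk either num_k8_northbridges entries or the NULL terminator. A sketch (the function name is illustrative):

#include <linux/pci.h>
#include <asm/k8.h>

static int __init k8_nb_report(void)
{
	int i;

	if (cache_k8_northbridges() < 0)
		return -ENOMEM;

	for (i = 0; i < num_k8_northbridges; i++)
		printk(KERN_DEBUG "K8 NB %d: %s\n", i,
		       pci_name(k8_northbridges[i]));
	return 0;
}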
-AK */ +	spin_lock_irqsave(&gart_lock, flags); +	flushed = 0; +	for (i = 0; i < num_k8_northbridges; i++) { +		pci_write_config_dword(k8_northbridges[i], 0x9c, +				       flush_words[i]|1); +		flushed++; +	} +	for (i = 0; i < num_k8_northbridges; i++) { +		u32 w; +		/* Make sure the hardware actually executed the flush*/ +		for (;;) { +			pci_read_config_dword(k8_northbridges[i], +					      0x9c, &w); +			if (!(w & 1)) +				break; +			cpu_relax(); +		} +	} +	spin_unlock_irqrestore(&gart_lock, flags); +	if (!flushed) +		printk("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(k8_flush_garts); + diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index c69fc43cee7b..acd5816b1a6f 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -562,7 +562,7 @@ static struct sysdev_class mce_sysclass = {  	set_kset_name("machinecheck"),  }; -static DEFINE_PER_CPU(struct sys_device, device_mce); +DEFINE_PER_CPU(struct sys_device, device_mce);  /* Why are there no generic functions for this? */  #define ACCESSOR(name, var, start) \ diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index d13b241ad094..335200aa2737 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -1,5 +1,5 @@  /* - *  (c) 2005 Advanced Micro Devices, Inc. + *  (c) 2005, 2006 Advanced Micro Devices, Inc.   *  Your use of this code is subject to the terms and conditions of the   *  GNU general public license version 2. See "COPYING" or   *  http://www.gnu.org/licenses/gpl.html @@ -8,9 +8,10 @@   *   *  Support : jacob.shin@amd.com   * - *  MC4_MISC0 DRAM ECC Error Threshold available under AMD K8 Rev F. - *  MC4_MISC0 exists per physical processor. + *  April 2006 + *     - added support for AMD Family 0x10 processors   * + *  All MC4_MISCi registers are shared between multi-cores   */  #include <linux/cpu.h> @@ -29,32 +30,45 @@  #include <asm/percpu.h>  #include <asm/idle.h> -#define PFX "mce_threshold: " -#define VERSION "version 1.00.9" -#define NR_BANKS 5 -#define THRESHOLD_MAX 0xFFF -#define INT_TYPE_APIC 0x00020000 -#define MASK_VALID_HI 0x80000000 -#define MASK_LVTOFF_HI 0x00F00000 -#define MASK_COUNT_EN_HI 0x00080000 -#define MASK_INT_TYPE_HI 0x00060000 -#define MASK_OVERFLOW_HI 0x00010000 +#define PFX               "mce_threshold: " +#define VERSION           "version 1.1.1" +#define NR_BANKS          6 +#define NR_BLOCKS         9 +#define THRESHOLD_MAX     0xFFF +#define INT_TYPE_APIC     0x00020000 +#define MASK_VALID_HI     0x80000000 +#define MASK_LVTOFF_HI    0x00F00000 +#define MASK_COUNT_EN_HI  0x00080000 +#define MASK_INT_TYPE_HI  0x00060000 +#define MASK_OVERFLOW_HI  0x00010000  #define MASK_ERR_COUNT_HI 0x00000FFF -#define MASK_OVERFLOW 0x0001000000000000L +#define MASK_BLKPTR_LO    0xFF000000 +#define MCG_XBLK_ADDR     0xC0000400 -struct threshold_bank { +struct threshold_block { +	unsigned int block; +	unsigned int bank;  	unsigned int cpu; -	u8 bank; -	u8 interrupt_enable; +	u32 address; +	u16 interrupt_enable;  	u16 threshold_limit;  	struct kobject kobj; +	struct list_head miscj;  }; -static struct threshold_bank threshold_defaults = { +/* defaults used early on boot */ +static struct threshold_block threshold_defaults = {  	.interrupt_enable = 0,  	.threshold_limit = THRESHOLD_MAX,  }; +struct threshold_bank { +	struct kobject kobj; +	struct threshold_block *blocks; +	cpumask_t cpus; +}; +static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +  #ifdef CONFIG_SMP  static unsigned char shared_bank[NR_BANKS] = {  	0, 
0, 0, 0, 1 @@ -68,12 +82,12 @@ static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */   */  /* must be called with correct cpu affinity */ -static void threshold_restart_bank(struct threshold_bank *b, +static void threshold_restart_bank(struct threshold_block *b,  				   int reset, u16 old_limit)  {  	u32 mci_misc_hi, mci_misc_lo; -	rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); +	rdmsr(b->address, mci_misc_lo, mci_misc_hi);  	if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))  		reset = 1;	/* limit cannot be lower than err count */ @@ -94,35 +108,57 @@ static void threshold_restart_bank(struct threshold_bank *b,  	    (mci_misc_hi &= ~MASK_INT_TYPE_HI);  	mci_misc_hi |= MASK_COUNT_EN_HI; -	wrmsr(MSR_IA32_MC0_MISC + b->bank * 4, mci_misc_lo, mci_misc_hi); +	wrmsr(b->address, mci_misc_lo, mci_misc_hi);  } +/* cpu init entry point, called from mce.c with preempt off */  void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)  { -	int bank; -	u32 mci_misc_lo, mci_misc_hi; +	unsigned int bank, block;  	unsigned int cpu = smp_processor_id(); +	u32 low = 0, high = 0, address = 0;  	for (bank = 0; bank < NR_BANKS; ++bank) { -		rdmsr(MSR_IA32_MC0_MISC + bank * 4, mci_misc_lo, mci_misc_hi); +		for (block = 0; block < NR_BLOCKS; ++block) { +			if (block == 0) +				address = MSR_IA32_MC0_MISC + bank * 4; +			else if (block == 1) +				address = MCG_XBLK_ADDR +					+ ((low & MASK_BLKPTR_LO) >> 21); +			else +				++address; + +			if (rdmsr_safe(address, &low, &high)) +				continue; -		/* !valid, !counter present, bios locked */ -		if (!(mci_misc_hi & MASK_VALID_HI) || -		    !(mci_misc_hi & MASK_VALID_HI >> 1) || -		    (mci_misc_hi & MASK_VALID_HI >> 2)) -			continue; +			if (!(high & MASK_VALID_HI)) { +				if (block) +					continue; +				else +					break; +			} -		per_cpu(bank_map, cpu) |= (1 << bank); +			if (!(high & MASK_VALID_HI >> 1)  || +			     (high & MASK_VALID_HI >> 2)) +				continue; +			if (!block) +				per_cpu(bank_map, cpu) |= (1 << bank);  #ifdef CONFIG_SMP -		if (shared_bank[bank] && cpu_core_id[cpu]) -			continue; +			if (shared_bank[bank] && c->cpu_core_id) +				break;  #endif +			high &= ~MASK_LVTOFF_HI; +			high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; +			wrmsr(address, low, high); -		setup_threshold_lvt((mci_misc_hi & MASK_LVTOFF_HI) >> 20); -		threshold_defaults.cpu = cpu; -		threshold_defaults.bank = bank; -		threshold_restart_bank(&threshold_defaults, 0, 0); +			setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, +					       THRESHOLD_APIC_VECTOR, +					       K8_APIC_EXT_INT_MSG_FIX, 0); + +			threshold_defaults.address = address; +			threshold_restart_bank(&threshold_defaults, 0, 0); +		}  	}  } @@ -137,8 +173,9 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)   */  asmlinkage void mce_threshold_interrupt(void)  { -	int bank; +	unsigned int bank, block;  	struct mce m; +	u32 low = 0, high = 0, address = 0;  	ack_APIC_irq();  	exit_idle(); @@ -150,15 +187,42 @@ asmlinkage void mce_threshold_interrupt(void)  	/* assume first bank caused it */  	for (bank = 0; bank < NR_BANKS; ++bank) { -		m.bank = MCE_THRESHOLD_BASE + bank; -		rdmsrl(MSR_IA32_MC0_MISC + bank * 4, m.misc); +		for (block = 0; block < NR_BLOCKS; ++block) { +			if (block == 0) +				address = MSR_IA32_MC0_MISC + bank * 4; +			else if (block == 1) +				address = MCG_XBLK_ADDR +					+ ((low & MASK_BLKPTR_LO) >> 21); +			else +				++address; + +			if (rdmsr_safe(address, &low, &high)) +				continue; -		if (m.misc & MASK_OVERFLOW) { -			mce_log(&m); -		
	goto out; +			if (!(high & MASK_VALID_HI)) { +				if (block) +					continue; +				else +					break; +			} + +			if (!(high & MASK_VALID_HI >> 1)  || +			     (high & MASK_VALID_HI >> 2)) +				continue; + +			if (high & MASK_OVERFLOW_HI) { +				rdmsrl(address, m.misc); +				rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, +				       m.status); +				m.bank = K8_MCE_THRESHOLD_BASE +				       + bank * NR_BLOCKS +				       + block; +				mce_log(&m); +				goto out; +			}  		}  	} -      out: +out:  	irq_exit();  } @@ -166,20 +230,12 @@ asmlinkage void mce_threshold_interrupt(void)   * Sysfs Interface   */ -static struct sysdev_class threshold_sysclass = { -	set_kset_name("threshold"), -}; - -static DEFINE_PER_CPU(struct sys_device, device_threshold); -  struct threshold_attr { -        struct attribute attr; -        ssize_t(*show) (struct threshold_bank *, char *); -        ssize_t(*store) (struct threshold_bank *, const char *, size_t count); +	struct attribute attr; +	ssize_t(*show) (struct threshold_block *, char *); +	ssize_t(*store) (struct threshold_block *, const char *, size_t count);  }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); -  static cpumask_t affinity_set(unsigned int cpu)  {  	cpumask_t oldmask = current->cpus_allowed; @@ -194,15 +250,15 @@ static void affinity_restore(cpumask_t oldmask)  	set_cpus_allowed(current, oldmask);  } -#define SHOW_FIELDS(name) \ -        static ssize_t show_ ## name(struct threshold_bank * b, char *buf) \ -        { \ -                return sprintf(buf, "%lx\n", (unsigned long) b->name); \ -        } +#define SHOW_FIELDS(name)                                           \ +static ssize_t show_ ## name(struct threshold_block * b, char *buf) \ +{                                                                   \ +        return sprintf(buf, "%lx\n", (unsigned long) b->name);      \ +}  SHOW_FIELDS(interrupt_enable)  SHOW_FIELDS(threshold_limit) -static ssize_t store_interrupt_enable(struct threshold_bank *b, +static ssize_t store_interrupt_enable(struct threshold_block *b,  				      const char *buf, size_t count)  {  	char *end; @@ -219,7 +275,7 @@ static ssize_t store_interrupt_enable(struct threshold_bank *b,  	return end - buf;  } -static ssize_t store_threshold_limit(struct threshold_bank *b, +static ssize_t store_threshold_limit(struct threshold_block *b,  				     const char *buf, size_t count)  {  	char *end; @@ -242,18 +298,18 @@ static ssize_t store_threshold_limit(struct threshold_bank *b,  	return end - buf;  } -static ssize_t show_error_count(struct threshold_bank *b, char *buf) +static ssize_t show_error_count(struct threshold_block *b, char *buf)  {  	u32 high, low;  	cpumask_t oldmask;  	oldmask = affinity_set(b->cpu); -	rdmsr(MSR_IA32_MC0_MISC + b->bank * 4, low, high); /* ignore low 32 */ +	rdmsr(b->address, low, high);  	affinity_restore(oldmask);  	return sprintf(buf, "%x\n",  		       (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));  } -static ssize_t store_error_count(struct threshold_bank *b, +static ssize_t store_error_count(struct threshold_block *b,  				 const char *buf, size_t count)  {  	cpumask_t oldmask; @@ -269,13 +325,13 @@ static ssize_t store_error_count(struct threshold_bank *b,          .store = _store,                                      \  }; -#define ATTR_FIELDS(name) \ -        static struct threshold_attr name = \ +#define RW_ATTR(name)                                           \ +static struct threshold_attr name =                             \          THRESHOLD_ATTR(name, 
0644, show_## name, store_## name) -ATTR_FIELDS(interrupt_enable); -ATTR_FIELDS(threshold_limit); -ATTR_FIELDS(error_count); +RW_ATTR(interrupt_enable); +RW_ATTR(threshold_limit); +RW_ATTR(error_count);  static struct attribute *default_attrs[] = {  	&interrupt_enable.attr, @@ -284,12 +340,12 @@ static struct attribute *default_attrs[] = {  	NULL  }; -#define to_bank(k) container_of(k,struct threshold_bank,kobj) -#define to_attr(a) container_of(a,struct threshold_attr,attr) +#define to_block(k) container_of(k, struct threshold_block, kobj) +#define to_attr(a) container_of(a, struct threshold_attr, attr)  static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)  { -	struct threshold_bank *b = to_bank(kobj); +	struct threshold_block *b = to_block(kobj);  	struct threshold_attr *a = to_attr(attr);  	ssize_t ret;  	ret = a->show ? a->show(b, buf) : -EIO; @@ -299,7 +355,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)  static ssize_t store(struct kobject *kobj, struct attribute *attr,  		     const char *buf, size_t count)  { -	struct threshold_bank *b = to_bank(kobj); +	struct threshold_block *b = to_block(kobj);  	struct threshold_attr *a = to_attr(attr);  	ssize_t ret;  	ret = a->store ? a->store(b, buf, count) : -EIO; @@ -316,69 +372,174 @@ static struct kobj_type threshold_ktype = {  	.default_attrs = default_attrs,  }; +static __cpuinit int allocate_threshold_blocks(unsigned int cpu, +					       unsigned int bank, +					       unsigned int block, +					       u32 address) +{ +	int err; +	u32 low, high; +	struct threshold_block *b = NULL; + +	if ((bank >= NR_BANKS) || (block >= NR_BLOCKS)) +		return 0; + +	if (rdmsr_safe(address, &low, &high)) +		goto recurse; + +	if (!(high & MASK_VALID_HI)) { +		if (block) +			goto recurse; +		else +			return 0; +	} + +	if (!(high & MASK_VALID_HI >> 1)  || +	     (high & MASK_VALID_HI >> 2)) +		goto recurse; + +	b = kzalloc(sizeof(struct threshold_block), GFP_KERNEL); +	if (!b) +		return -ENOMEM; +	memset(b, 0, sizeof(struct threshold_block)); + +	b->block = block; +	b->bank = bank; +	b->cpu = cpu; +	b->address = address; +	b->interrupt_enable = 0; +	b->threshold_limit = THRESHOLD_MAX; + +	INIT_LIST_HEAD(&b->miscj); + +	if (per_cpu(threshold_banks, cpu)[bank]->blocks) +		list_add(&b->miscj, +			 &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj); +	else +		per_cpu(threshold_banks, cpu)[bank]->blocks = b; + +	kobject_set_name(&b->kobj, "misc%i", block); +	b->kobj.parent = &per_cpu(threshold_banks, cpu)[bank]->kobj; +	b->kobj.ktype = &threshold_ktype; +	err = kobject_register(&b->kobj); +	if (err) +		goto out_free; +recurse: +	if (!block) { +		address = (low & MASK_BLKPTR_LO) >> 21; +		if (!address) +			return 0; +		address += MCG_XBLK_ADDR; +	} else +		++address; + +	err = allocate_threshold_blocks(cpu, bank, ++block, address); +	if (err) +		goto out_free; + +	return err; + +out_free: +	if (b) { +		kobject_unregister(&b->kobj); +		kfree(b); +	} +	return err; +} +  /* symlinks sibling shared banks to first core.  first core owns dir/files. 
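allocate_threshold_blocks() above walks the same MSR chain as mce_amd_feature_init() and the interrupt handler: block 0 lives at MC<bank>_MISC, its BLKPTR field points into the extended MSR space at MCG_XBLK_ADDR, and any further blocks are consecutive MSRs. The address step, factored into a sketch:

/* illustrative helper; returns 0 when no further block exists */
static u32 next_threshold_msr(unsigned int block, u32 address, u32 low)
{
	if (block == 0) {
		u32 blkptr = (low & MASK_BLKPTR_LO) >> 21;
		return blkptr ? MCG_XBLK_ADDR + blkptr : 0;
	}
	return address + 1;		/* blocks 1..n are contiguous */
}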
*/ -static __cpuinit int threshold_create_bank(unsigned int cpu, int bank) +static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)  { -	int err = 0; +	int i, err = 0;  	struct threshold_bank *b = NULL; +	cpumask_t oldmask = CPU_MASK_NONE; +	char name[32]; + +	sprintf(name, "threshold_bank%i", bank);  #ifdef CONFIG_SMP -	if (cpu_core_id[cpu] && shared_bank[bank]) {	/* symlink */ -		char name[16]; -		unsigned lcpu = first_cpu(cpu_core_map[cpu]); -		if (cpu_core_id[lcpu]) -			goto out;	/* first core not up yet */ +	if (cpu_data[cpu].cpu_core_id && shared_bank[bank]) {	/* symlink */ +		i = first_cpu(cpu_core_map[cpu]); + +		/* first core not up yet */ +		if (cpu_data[i].cpu_core_id) +			goto out; + +		/* already linked */ +		if (per_cpu(threshold_banks, cpu)[bank]) +			goto out; + +		b = per_cpu(threshold_banks, i)[bank]; -		b = per_cpu(threshold_banks, lcpu)[bank];  		if (!b)  			goto out; -		sprintf(name, "bank%i", bank); -		err = sysfs_create_link(&per_cpu(device_threshold, cpu).kobj, + +		err = sysfs_create_link(&per_cpu(device_mce, cpu).kobj,  					&b->kobj, name);  		if (err)  			goto out; + +		b->cpus = cpu_core_map[cpu];  		per_cpu(threshold_banks, cpu)[bank] = b;  		goto out;  	}  #endif -	b = kmalloc(sizeof(struct threshold_bank), GFP_KERNEL); +	b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);  	if (!b) {  		err = -ENOMEM;  		goto out;  	}  	memset(b, 0, sizeof(struct threshold_bank)); -	b->cpu = cpu; -	b->bank = bank; -	b->interrupt_enable = 0; -	b->threshold_limit = THRESHOLD_MAX; -	kobject_set_name(&b->kobj, "bank%i", bank); -	b->kobj.parent = &per_cpu(device_threshold, cpu).kobj; -	b->kobj.ktype = &threshold_ktype; - +	kobject_set_name(&b->kobj, "threshold_bank%i", bank); +	b->kobj.parent = &per_cpu(device_mce, cpu).kobj; +#ifndef CONFIG_SMP +	b->cpus = CPU_MASK_ALL; +#else +	b->cpus = cpu_core_map[cpu]; +#endif  	err = kobject_register(&b->kobj); -	if (err) { -		kfree(b); -		goto out; -	} +	if (err) +		goto out_free; +  	per_cpu(threshold_banks, cpu)[bank] = b; -      out: + +	oldmask = affinity_set(cpu); +	err = allocate_threshold_blocks(cpu, bank, 0, +					MSR_IA32_MC0_MISC + bank * 4); +	affinity_restore(oldmask); + +	if (err) +		goto out_free; + +	for_each_cpu_mask(i, b->cpus) { +		if (i == cpu) +			continue; + +		err = sysfs_create_link(&per_cpu(device_mce, i).kobj, +					&b->kobj, name); +		if (err) +			goto out; + +		per_cpu(threshold_banks, i)[bank] = b; +	} + +	goto out; + +out_free: +	per_cpu(threshold_banks, cpu)[bank] = NULL; +	kfree(b); +out:  	return err;  }  /* create dir/files for all valid threshold banks */  static __cpuinit int threshold_create_device(unsigned int cpu)  { -	int bank; +	unsigned int bank;  	int err = 0; -	per_cpu(device_threshold, cpu).id = cpu; -	per_cpu(device_threshold, cpu).cls = &threshold_sysclass; -	err = sysdev_register(&per_cpu(device_threshold, cpu)); -	if (err) -		goto out; -  	for (bank = 0; bank < NR_BANKS; ++bank) {  		if (!(per_cpu(bank_map, cpu) & 1 << bank))  			continue; @@ -386,7 +547,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu)  		if (err)  			goto out;  	} -      out: +out:  	return err;  } @@ -397,92 +558,85 @@ static __cpuinit int threshold_create_device(unsigned int cpu)   *   of shared sysfs dir/files, and rest of the cores will be symlinked to it.   
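Concretely, for a bank 4 shared by the two cores of a package, the result should look roughly like this under sysfs (paths assume the "machinecheck" sysdev class from mce.c; illustrative):

/sys/devices/system/machinecheck/machinecheck0/threshold_bank4/misc0/interrupt_enable
/sys/devices/system/machinecheck/machinecheck0/threshold_bank4/misc0/threshold_limit
/sys/devices/system/machinecheck/machinecheck0/threshold_bank4/misc0/error_count
/sys/devices/system/machinecheck/machinecheck1/threshold_bank4 -> .../machinecheck0/threshold_bank4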
 */
-/* cpu hotplug call removes all symlinks before first core dies */
+static __cpuinit void deallocate_threshold_block(unsigned int cpu,
+						 unsigned int bank)
+{
+	struct threshold_block *pos = NULL;
+	struct threshold_block *tmp = NULL;
+	struct threshold_bank *head = per_cpu(threshold_banks, cpu)[bank];
+
+	if (!head)
+		return;
+
+	list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
+		kobject_unregister(&pos->kobj);
+		list_del(&pos->miscj);
+		kfree(pos);
+	}
+
+	kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
+	per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
+}
+
 static __cpuinit void threshold_remove_bank(unsigned int cpu, int bank)
 {
+	int i = 0;
 	struct threshold_bank *b;
-	char name[16];
+	char name[32];
 
 	b = per_cpu(threshold_banks, cpu)[bank];
+
 	if (!b)
 		return;
-	if (shared_bank[bank] && atomic_read(&b->kobj.kref.refcount) > 2) {
-		sprintf(name, "bank%i", bank);
-		sysfs_remove_link(&per_cpu(device_threshold, cpu).kobj, name);
-		per_cpu(threshold_banks, cpu)[bank] = NULL;
-	} else {
-		kobject_unregister(&b->kobj);
-		kfree(per_cpu(threshold_banks, cpu)[bank]);
+
+	if (!b->blocks)
+		goto free_out;
+
+	sprintf(name, "threshold_bank%i", bank);
+
+	/* sibling symlink */
+	if (shared_bank[bank] && b->blocks->cpu != cpu) {
+		sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name);
+		per_cpu(threshold_banks, cpu)[bank] = NULL;
+		return;
+	}
+
+	/* remove all sibling symlinks before unregistering */
+	for_each_cpu_mask(i, b->cpus) {
+		if (i == cpu)
+			continue;
+
+		sysfs_remove_link(&per_cpu(device_mce, i).kobj, name);
+		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
+
+	deallocate_threshold_block(cpu, bank);
+
+free_out:
+	kobject_unregister(&b->kobj);
+	kfree(b);
+	per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
 
 static __cpuinit void threshold_remove_device(unsigned int cpu)
 {
-	int bank;
+	unsigned int bank;
 
 	for (bank = 0; bank < NR_BANKS; ++bank) {
 		if (!(per_cpu(bank_map, cpu) & 1 << bank))
 			continue;
 		threshold_remove_bank(cpu, bank);
 	}
-	sysdev_unregister(&per_cpu(device_threshold, cpu));
 }
 
-/* link all existing siblings when first core comes up */
-static __cpuinit int threshold_create_symlinks(unsigned int cpu)
-{
-	int bank, err = 0;
-	unsigned int lcpu = 0;
-
-	if (cpu_core_id[cpu])
-		return 0;
-	for_each_cpu_mask(lcpu, cpu_core_map[cpu]) {
-		if (lcpu == cpu)
-			continue;
-		for (bank = 0; bank < NR_BANKS; ++bank) {
-			if (!(per_cpu(bank_map, cpu) & 1 << bank))
-				continue;
-			if (!shared_bank[bank])
-				continue;
-			err = threshold_create_bank(lcpu, bank);
-		}
-	}
-	return err;
-}
-
-/* remove all symlinks before first core dies.
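The hotplug handling below reduces to the standard pattern: set up state for CPUs already online at init time, then register a notifier for CPUs that come and go later. Skeleton form, with hypothetical mysub_* names standing in for the threshold functions:

#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

static void mysub_create(unsigned int cpu) { /* allocate per-cpu state */ }
static void mysub_remove(unsigned int cpu) { /* free it again */ }

static int __cpuinit mysub_cpu_callback(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		mysub_create(cpu);
		break;
	case CPU_DEAD:
		mysub_remove(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mysub_nb __cpuinitdata = {
	.notifier_call = mysub_cpu_callback,
};

static int __init mysub_init(void)
{
	unsigned int cpu;

	for_each_online_cpu(cpu)	/* CPUs up before the notifier */
		mysub_create(cpu);
	register_cpu_notifier(&mysub_nb);
	return 0;
}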
*/ -static __cpuinit void threshold_remove_symlinks(unsigned int cpu) -{ -	int bank; -	unsigned int lcpu = 0; -	if (cpu_core_id[cpu]) -		return; -	for_each_cpu_mask(lcpu, cpu_core_map[cpu]) { -		if (lcpu == cpu) -			continue; -		for (bank = 0; bank < NR_BANKS; ++bank) { -			if (!(per_cpu(bank_map, cpu) & 1 << bank)) -				continue; -			if (!shared_bank[bank]) -				continue; -			threshold_remove_bank(lcpu, bank); -		} -	} -}  #else /* !CONFIG_HOTPLUG_CPU */ -static __cpuinit void threshold_create_symlinks(unsigned int cpu) -{ -} -static __cpuinit void threshold_remove_symlinks(unsigned int cpu) -{ -}  static void threshold_remove_device(unsigned int cpu)  {  }  #endif  /* get notified when a cpu comes on/off */ -static int threshold_cpu_callback(struct notifier_block *nfb, +static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,  					    unsigned long action, void *hcpu)  {  	/* cpu was unsigned int to begin with */ @@ -494,13 +648,6 @@ static int threshold_cpu_callback(struct notifier_block *nfb,  	switch (action) {  	case CPU_ONLINE:  		threshold_create_device(cpu); -		threshold_create_symlinks(cpu); -		break; -	case CPU_DOWN_PREPARE: -		threshold_remove_symlinks(cpu); -		break; -	case CPU_DOWN_FAILED: -		threshold_create_symlinks(cpu);  		break;  	case CPU_DEAD:  		threshold_remove_device(cpu); @@ -512,29 +659,22 @@ static int threshold_cpu_callback(struct notifier_block *nfb,  	return NOTIFY_OK;  } -static struct notifier_block threshold_cpu_notifier = { +static struct notifier_block threshold_cpu_notifier __cpuinitdata = {  	.notifier_call = threshold_cpu_callback,  };  static __init int threshold_init_device(void)  { -	int err; -	int lcpu = 0; - -	err = sysdev_class_register(&threshold_sysclass); -	if (err) -		goto out; +	unsigned lcpu = 0;  	/* to hit CPUs online before the notifier is up */  	for_each_online_cpu(lcpu) { -		err = threshold_create_device(lcpu); +		int err = threshold_create_device(lcpu);  		if (err) -			goto out; +			return err;  	}  	register_cpu_notifier(&threshold_cpu_notifier); - -      out: -	return err; +	return 0;  }  device_initcall(threshold_init_device); diff --git a/arch/x86_64/kernel/module.c b/arch/x86_64/kernel/module.c index bac195c74bcc..9d0958ff547f 100644 --- a/arch/x86_64/kernel/module.c +++ b/arch/x86_64/kernel/module.c @@ -145,26 +145,38 @@ int apply_relocate(Elf_Shdr *sechdrs,  	return -ENOSYS;  }  -extern void apply_alternatives(void *start, void *end);  -  int module_finalize(const Elf_Ehdr *hdr, -		    const Elf_Shdr *sechdrs, -		    struct module *me) +                    const Elf_Shdr *sechdrs, +                    struct module *me)  { -	const Elf_Shdr *s; +	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;  	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; -	/* look for .altinstructions to patch */  -	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {  -		void *seg; 		 -		if (strcmp(".altinstructions", secstrings + s->sh_name)) -			continue; -		seg = (void *)s->sh_addr;  -		apply_alternatives(seg, seg + s->sh_size);  -	} 	 +	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { +		if (!strcmp(".text", secstrings + s->sh_name)) +			text = s; +		if (!strcmp(".altinstructions", secstrings + s->sh_name)) +			alt = s; +		if (!strcmp(".smp_locks", secstrings + s->sh_name)) +			locks= s; +	} + +	if (alt) { +		/* patch .altinstructions */ +		void *aseg = (void *)alt->sh_addr; +		apply_alternatives(aseg, aseg + alt->sh_size); +	} +	if (locks && text) { +		void *lseg = (void *)locks->sh_addr; +		void 
*tseg = (void *)text->sh_addr;
+		alternatives_smp_module_add(me, me->name,
+					    lseg, lseg + locks->sh_size,
+					    tseg, tseg + text->sh_size);
+	}
 	return 0;
 }
 
 void module_arch_cleanup(struct module *mod)
 {
+	alternatives_smp_module_del(mod);
 }
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 4e6357fe0ec3..399489c93132 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -15,11 +15,7 @@
 #include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/smp_lock.h>
 #include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
 #include <linux/nmi.h>
@@ -27,14 +23,11 @@
 #include <linux/kprobes.h>
 
 #include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
 #include <asm/nmi.h>
-#include <asm/msr.h>
 #include <asm/proto.h>
 #include <asm/kdebug.h>
-#include <asm/local.h>
 #include <asm/mce.h>
+#include <asm/intel_arch_perfmon.h>
 
 /*
  * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
@@ -74,6 +67,9 @@ static unsigned int nmi_p4_cccr_val;
 #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
 #define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
 
+#define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
 #define MSR_P4_MISC_ENABLE	0x1A0
 #define MSR_P4_MISC_ENABLE_PERF_AVAIL	(1<<7)
 #define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL	(1<<12)
@@ -105,7 +101,10 @@ static __cpuinit inline int nmi_known_cpu(void)
 	case X86_VENDOR_AMD:
 		return boot_cpu_data.x86 == 15;
 	case X86_VENDOR_INTEL:
-		return boot_cpu_data.x86 == 15;
+		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+			return 1;
+		else
+			return (boot_cpu_data.x86 == 15);
 	}
 	return 0;
 }
@@ -211,6 +210,8 @@ int __init setup_nmi_watchdog(char *str)
 
 __setup("nmi_watchdog=", setup_nmi_watchdog);
 
+static void disable_intel_arch_watchdog(void);
+
 static void disable_lapic_nmi_watchdog(void)
 {
 	if (nmi_active <= 0)
@@ -223,6 +224,8 @@ static void disable_lapic_nmi_watchdog(void)
 		if (boot_cpu_data.x86 == 15) {
 			wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
 			wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
+		} else if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+			disable_intel_arch_watchdog();
 		}
 		break;
 	}
@@ -375,6 +378,53 @@ static void setup_k7_watchdog(void)
 	wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
 }
 
+static void disable_intel_arch_watchdog(void)
+{
+	unsigned ebx;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
+	 */
+	ebx = cpuid_ebx(10);
+	if (!(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+		wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, 0, 0);
+}
+
+static int setup_intel_arch_watchdog(void)
+{
+	unsigned int evntsel;
+	unsigned ebx;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Unhalted Core Cycles Event or not.
+	 * NOTE: Corresponding bit = 0 in ebx indicates event present.
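The convention checked here is inverted from what one might expect: CPUID leaf 10 (0xA) describes the architectural perfmon, and a set bit in EBX means the corresponding event is *not* available. Availability is therefore the negation, as a sketch:

/* true if the Unhalted Core Cycles event can be used */
static int arch_perfmon_has_core_cycles(void)
{
	unsigned int ebx = cpuid_ebx(10);

	return !(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT);
}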
+	 */ +	ebx = cpuid_ebx(10); +	if ((ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) +		return 0; + +	nmi_perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + +	clear_msr_range(MSR_ARCH_PERFMON_EVENTSEL0, 2); +	clear_msr_range(MSR_ARCH_PERFMON_PERFCTR0, 2); + +	evntsel = ARCH_PERFMON_EVENTSEL_INT +		| ARCH_PERFMON_EVENTSEL_OS +		| ARCH_PERFMON_EVENTSEL_USR +		| ARCH_PERFMON_NMI_EVENT_SEL +		| ARCH_PERFMON_NMI_EVENT_UMASK; + +	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); +	wrmsrl(MSR_ARCH_PERFMON_PERFCTR0, -((u64)cpu_khz * 1000 / nmi_hz)); +	apic_write(APIC_LVTPC, APIC_DM_NMI); +	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; +	wrmsr(MSR_ARCH_PERFMON_EVENTSEL0, evntsel, 0); +	return 1; +} +  static int setup_p4_watchdog(void)  { @@ -428,10 +478,16 @@ void setup_apic_nmi_watchdog(void)  		setup_k7_watchdog();  		break;  	case X86_VENDOR_INTEL: -		if (boot_cpu_data.x86 != 15) -			return; -		if (!setup_p4_watchdog()) +		if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { +			if (!setup_intel_arch_watchdog()) +				return; +		} else if (boot_cpu_data.x86 == 15) { +			if (!setup_p4_watchdog()) +				return; +		} else {  			return; +		} +  		break;  	default: @@ -516,7 +572,14 @@ void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)   			 */   			wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);   			apic_write(APIC_LVTPC, APIC_DM_NMI); - 		} + 		} else if (nmi_perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { +			/* +			 * For Intel based architectural perfmon +			 * - LVTPC is masked on interrupt and must be +			 *   unmasked by the LVTPC handler. +			 */ +			apic_write(APIC_LVTPC, APIC_DM_NMI); +		}  		wrmsrl(nmi_perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz));  	}  } diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c new file mode 100644 index 000000000000..d91cb843f54d --- /dev/null +++ b/arch/x86_64/kernel/pci-calgary.c @@ -0,0 +1,1018 @@ +/* + * Derived from arch/powerpc/kernel/iommu.c + * + * Copyright (C) 2006 Jon Mason <jdmason@us.ibm.com>, IBM Corporation + * Copyright (C) 2006 Muli Ben-Yehuda <muli@il.ibm.com>, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/dma-mapping.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/pci_ids.h> +#include <linux/pci.h> +#include <linux/delay.h> +#include <asm/proto.h> +#include <asm/calgary.h> +#include <asm/tce.h> +#include <asm/pci-direct.h> +#include <asm/system.h> +#include <asm/dma.h> + +#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 +#define PCI_VENDOR_DEVICE_ID_CALGARY \ +	(PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16) + +/* we need these for register space address calculation */ +#define START_ADDRESS           0xfe000000 +#define CHASSIS_BASE            0 +#define ONE_BASED_CHASSIS_NUM   1 + +/* register offsets inside the host bridge space */ +#define PHB_CSR_OFFSET		0x0110 +#define PHB_PLSSR_OFFSET	0x0120 +#define PHB_CONFIG_RW_OFFSET	0x0160 +#define PHB_IOBASE_BAR_LOW	0x0170 +#define PHB_IOBASE_BAR_HIGH	0x0180 +#define PHB_MEM_1_LOW		0x0190 +#define PHB_MEM_1_HIGH		0x01A0 +#define PHB_IO_ADDR_SIZE	0x01B0 +#define PHB_MEM_1_SIZE		0x01C0 +#define PHB_MEM_ST_OFFSET	0x01D0 +#define PHB_AER_OFFSET		0x0200 +#define PHB_CONFIG_0_HIGH	0x0220 +#define PHB_CONFIG_0_LOW	0x0230 +#define PHB_CONFIG_0_END	0x0240 +#define PHB_MEM_2_LOW		0x02B0 +#define PHB_MEM_2_HIGH		0x02C0 +#define PHB_MEM_2_SIZE_HIGH	0x02D0 +#define PHB_MEM_2_SIZE_LOW	0x02E0 +#define PHB_DOSHOLE_OFFSET	0x08E0 + +/* PHB_CONFIG_RW */ +#define PHB_TCE_ENABLE		0x20000000 +#define PHB_SLOT_DISABLE	0x1C000000 +#define PHB_DAC_DISABLE		0x01000000 +#define PHB_MEM2_ENABLE		0x00400000 +#define PHB_MCSR_ENABLE		0x00100000 +/* TAR (Table Address Register) */ +#define TAR_SW_BITS		0x0000ffffffff800fUL +#define TAR_VALID		0x0000000000000008UL +/* CSR (Channel/DMA Status Register) */ +#define CSR_AGENT_MASK		0xffe0ffff + +#define MAX_NUM_OF_PHBS		8 /* how many PHBs in total? */ +#define MAX_PHB_BUS_NUM		(MAX_NUM_OF_PHBS * 2) /* max dev->bus->number */ +#define PHBS_PER_CALGARY	4 + +/* register offsets in Calgary's internal register space */ +static const unsigned long tar_offsets[] = { +	0x0580 /* TAR0 */, +	0x0588 /* TAR1 */, +	0x0590 /* TAR2 */, +	0x0598 /* TAR3 */ +}; + +static const unsigned long split_queue_offsets[] = { +	0x4870 /* SPLIT QUEUE 0 */, +	0x5870 /* SPLIT QUEUE 1 */, +	0x6870 /* SPLIT QUEUE 2 */, +	0x7870 /* SPLIT QUEUE 3 */ +}; + +static const unsigned long phb_offsets[] = { +	0x8000 /* PHB0 */, +	0x9000 /* PHB1 */, +	0xA000 /* PHB2 */, +	0xB000 /* PHB3 */ +}; + +void* tce_table_kva[MAX_NUM_OF_PHBS * MAX_NUMNODES]; +unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; +static int translate_empty_slots __read_mostly = 0; +static int calgary_detected __read_mostly = 0; + +/* + * the bitmap of PHBs the user requested that we disable + * translation on. 
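All of these offsets are consumed through a single addressing scheme: take the chip's mapped base (bbar), OR in the per-PHB window from phb_offsets[], then OR in the register offset. That is what calgary_reg() and phb_offset(), defined further down, implement. For instance, to read a PHB's status register:

/* illustrative helper built on calgary_reg()/phb_offset() below */
static u32 read_phb_csr(void __iomem *bbar, unsigned char busnum)
{
	void __iomem *target =
		calgary_reg(bbar, phb_offset(busnum) | PHB_CSR_OFFSET);

	return readl(target);
}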
+ */ +static DECLARE_BITMAP(translation_disabled, MAX_NUMNODES * MAX_PHB_BUS_NUM); + +static void tce_cache_blast(struct iommu_table *tbl); + +/* enable this to stress test the chip's TCE cache */ +#ifdef CONFIG_IOMMU_DEBUG +static inline void tce_cache_blast_stress(struct iommu_table *tbl) +{ +	tce_cache_blast(tbl); +} +#else +static inline void tce_cache_blast_stress(struct iommu_table *tbl) +{ +} +#endif /* BLAST_TCE_CACHE_ON_UNMAP */ + +static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) +{ +	unsigned int npages; + +	npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); +	npages >>= PAGE_SHIFT; + +	return npages; +} + +static inline int translate_phb(struct pci_dev* dev) +{ +	int disabled = test_bit(dev->bus->number, translation_disabled); +	return !disabled; +} + +static void iommu_range_reserve(struct iommu_table *tbl, +        unsigned long start_addr, unsigned int npages) +{ +	unsigned long index; +	unsigned long end; + +	index = start_addr >> PAGE_SHIFT; + +	/* bail out if we're asked to reserve a region we don't cover */ +	if (index >= tbl->it_size) +		return; + +	end = index + npages; +	if (end > tbl->it_size) /* don't go off the table */ +		end = tbl->it_size; + +	while (index < end) { +		if (test_bit(index, tbl->it_map)) +			printk(KERN_ERR "Calgary: entry already allocated at " +			       "0x%lx tbl %p dma 0x%lx npages %u\n", +			       index, tbl, start_addr, npages); +		++index; +	} +	set_bit_string(tbl->it_map, start_addr >> PAGE_SHIFT, npages); +} + +static unsigned long iommu_range_alloc(struct iommu_table *tbl, +	unsigned int npages) +{ +	unsigned long offset; + +	BUG_ON(npages == 0); + +	offset = find_next_zero_string(tbl->it_map, tbl->it_hint, +				       tbl->it_size, npages); +	if (offset == ~0UL) { +		tce_cache_blast(tbl); +		offset = find_next_zero_string(tbl->it_map, 0, +					       tbl->it_size, npages); +		if (offset == ~0UL) { +			printk(KERN_WARNING "Calgary: IOMMU full.\n"); +			if (panic_on_overflow) +				panic("Calgary: fix the allocator.\n"); +			else +				return bad_dma_address; +		} +	} + +	set_bit_string(tbl->it_map, offset, npages); +	tbl->it_hint = offset + npages; +	BUG_ON(tbl->it_hint > tbl->it_size); + +	return offset; +} + +static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, +	unsigned int npages, int direction) +{ +	unsigned long entry, flags; +	dma_addr_t ret = bad_dma_address; + +	spin_lock_irqsave(&tbl->it_lock, flags); + +	entry = iommu_range_alloc(tbl, npages); + +	if (unlikely(entry == bad_dma_address)) +		goto error; + +	/* set the return dma address */ +	ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); + +	/* put the TCEs in the HW table */ +	tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, +		  direction); + +	spin_unlock_irqrestore(&tbl->it_lock, flags); + +	return ret; + +error: +	spin_unlock_irqrestore(&tbl->it_lock, flags); +	printk(KERN_WARNING "Calgary: failed to allocate %u pages in " +	       "iommu %p\n", npages, tbl); +	return bad_dma_address; +} + +static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, +	unsigned int npages) +{ +	unsigned long entry; +	unsigned long i; + +	entry = dma_addr >> PAGE_SHIFT; + +	BUG_ON(entry + npages > tbl->it_size); + +	tce_free(tbl, entry, npages); + +	for (i = 0; i < npages; ++i) { +		if (!test_bit(entry + i, tbl->it_map)) +			printk(KERN_ERR "Calgary: bit is off at 0x%lx " +			       "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", +			       entry + i, tbl, dma_addr, entry, npages); +	} + +	
+	__clear_bit_string(tbl->it_map, entry, npages);
+
+	tce_cache_blast_stress(tbl);
+}
+
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+	unsigned int npages)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&tbl->it_lock, flags);
+
+	__iommu_free(tbl, dma_addr, npages);
+
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+}
+
+static void __calgary_unmap_sg(struct iommu_table *tbl,
+	struct scatterlist *sglist, int nelems, int direction)
+{
+	while (nelems--) {
+		unsigned int npages;
+		dma_addr_t dma = sglist->dma_address;
+		unsigned int dmalen = sglist->dma_length;
+
+		if (dmalen == 0)
+			break;
+
+		npages = num_dma_pages(dma, dmalen);
+		__iommu_free(tbl, dma, npages);
+		sglist++;
+	}
+}
+
+void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist,
+		      int nelems, int direction)
+{
+	unsigned long flags;
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+
+	if (!translate_phb(to_pci_dev(dev)))
+		return;
+
+	spin_lock_irqsave(&tbl->it_lock, flags);
+
+	__calgary_unmap_sg(tbl, sglist, nelems, direction);
+
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+}
+
+static int calgary_nontranslate_map_sg(struct device* dev,
+	struct scatterlist *sg, int nelems, int direction)
+{
+	int i;
+
+	for (i = 0; i < nelems; i++) {
+		struct scatterlist *s = &sg[i];
+		BUG_ON(!s->page);
+		s->dma_address = virt_to_bus(page_address(s->page) + s->offset);
+		s->dma_length = s->length;
+	}
+	return nelems;
+}
+
+int calgary_map_sg(struct device *dev, struct scatterlist *sg,
+	int nelems, int direction)
+{
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+	unsigned long flags;
+	unsigned long vaddr;
+	unsigned int npages;
+	unsigned long entry;
+	int i;
+
+	if (!translate_phb(to_pci_dev(dev)))
+		return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
+
+	spin_lock_irqsave(&tbl->it_lock, flags);
+
+	for (i = 0; i < nelems; i++) {
+		struct scatterlist *s = &sg[i];
+		BUG_ON(!s->page);
+
+		vaddr = (unsigned long)page_address(s->page) + s->offset;
+		npages = num_dma_pages(vaddr, s->length);
+
+		entry = iommu_range_alloc(tbl, npages);
+		if (entry == bad_dma_address) {
+			/* makes sure unmap knows to stop */
+			s->dma_length = 0;
+			goto error;
+		}
+
+		s->dma_address = (entry << PAGE_SHIFT) | s->offset;
+
+		/* insert into HW table */
+		tce_build(tbl, entry, npages, vaddr & PAGE_MASK,
+			  direction);
+
+		s->dma_length = s->length;
+	}
+
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+
+	return nelems;
+error:
+	__calgary_unmap_sg(tbl, sg, nelems, direction);
+	for (i = 0; i < nelems; i++) {
+		sg[i].dma_address = bad_dma_address;
+		sg[i].dma_length = 0;
+	}
+	spin_unlock_irqrestore(&tbl->it_lock, flags);
+	return 0;
+}
+
+dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
+	size_t size, int direction)
+{
+	dma_addr_t dma_handle = bad_dma_address;
+	unsigned long uaddr;
+	unsigned int npages;
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+
+	uaddr = (unsigned long)vaddr;
+	npages = num_dma_pages(uaddr, size);
+
+	if (translate_phb(to_pci_dev(dev)))
+		dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
+	else
+		dma_handle = virt_to_bus(vaddr);
+
+	return dma_handle;
+}
+
+void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
+	size_t size, int direction)
+{
+	struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata;
+	unsigned int npages;
+
+	if (!translate_phb(to_pci_dev(dev)))
+		return;
+
+	npages = num_dma_pages(dma_handle, size);
+	iommu_free(tbl, 
dma_handle, npages); +} + +void* calgary_alloc_coherent(struct device *dev, size_t size, +	dma_addr_t *dma_handle, gfp_t flag) +{ +	void *ret = NULL; +	dma_addr_t mapping; +	unsigned int npages, order; +	struct iommu_table *tbl; + +	tbl = to_pci_dev(dev)->bus->self->sysdata; + +	size = PAGE_ALIGN(size); /* size rounded up to full pages */ +	npages = size >> PAGE_SHIFT; +	order = get_order(size); + +	/* alloc enough pages (and possibly more) */ +	ret = (void *)__get_free_pages(flag, order); +	if (!ret) +		goto error; +	memset(ret, 0, size); + +	if (translate_phb(to_pci_dev(dev))) { +		/* set up tces to cover the allocated range */ +		mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); +		if (mapping == bad_dma_address) +			goto free; + +		*dma_handle = mapping; +	} else /* non translated slot */ +		*dma_handle = virt_to_bus(ret); + +	return ret; + +free: +	free_pages((unsigned long)ret, get_order(size)); +	ret = NULL; +error: +	return ret; +} + +static struct dma_mapping_ops calgary_dma_ops = { +	.alloc_coherent = calgary_alloc_coherent, +	.map_single = calgary_map_single, +	.unmap_single = calgary_unmap_single, +	.map_sg = calgary_map_sg, +	.unmap_sg = calgary_unmap_sg, +}; + +static inline int busno_to_phbid(unsigned char num) +{ +	return bus_to_phb(num) % PHBS_PER_CALGARY; +} + +static inline unsigned long split_queue_offset(unsigned char num) +{ +	size_t idx = busno_to_phbid(num); + +	return split_queue_offsets[idx]; +} + +static inline unsigned long tar_offset(unsigned char num) +{ +	size_t idx = busno_to_phbid(num); + +	return tar_offsets[idx]; +} + +static inline unsigned long phb_offset(unsigned char num) +{ +	size_t idx = busno_to_phbid(num); + +	return phb_offsets[idx]; +} + +static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) +{ +	unsigned long target = ((unsigned long)bar) | offset; +	return (void __iomem*)target; +} + +static void tce_cache_blast(struct iommu_table *tbl) +{ +	u64 val; +	u32 aer; +	int i = 0; +	void __iomem *bbar = tbl->bbar; +	void __iomem *target; + +	/* disable arbitration on the bus */ +	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); +	aer = readl(target); +	writel(0, target); + +	/* read plssr to ensure it got there */ +	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); +	val = readl(target); + +	/* poll split queues until all DMA activity is done */ +	target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); +	do { +		val = readq(target); +		i++; +	} while ((val & 0xff) != 0xff && i < 100); +	if (i == 100) +		printk(KERN_WARNING "Calgary: PCI bus not quiesced, " +		       "continuing anyway\n"); + +	/* invalidate TCE cache */ +	target = calgary_reg(bbar, tar_offset(tbl->it_busno)); +	writeq(tbl->tar_val, target); + +	/* enable arbitration */ +	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); +	writel(aer, target); +	(void)readl(target); /* flush */ +} + +static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, +	u64 limit) +{ +	unsigned int numpages; + +	limit = limit | 0xfffff; +	limit++; + +	numpages = ((limit - start) >> PAGE_SHIFT); +	iommu_range_reserve(dev->sysdata, start, numpages); +} + +static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) +{ +	void __iomem *target; +	u64 low, high, sizelow; +	u64 start, limit; +	struct iommu_table *tbl = dev->sysdata; +	unsigned char busnum = dev->bus->number; +	void __iomem *bbar = tbl->bbar; + +	/* peripheral MEM_1 region */ +	target = calgary_reg(bbar, 
phb_offset(busnum) | PHB_MEM_1_LOW); +	low = be32_to_cpu(readl(target)); +	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); +	high = be32_to_cpu(readl(target)); +	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); +	sizelow = be32_to_cpu(readl(target)); + +	start = (high << 32) | low; +	limit = sizelow; + +	calgary_reserve_mem_region(dev, start, limit); +} + +static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) +{ +	void __iomem *target; +	u32 val32; +	u64 low, high, sizelow, sizehigh; +	u64 start, limit; +	struct iommu_table *tbl = dev->sysdata; +	unsigned char busnum = dev->bus->number; +	void __iomem *bbar = tbl->bbar; + +	/* is it enabled? */ +	target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); +	val32 = be32_to_cpu(readl(target)); +	if (!(val32 & PHB_MEM2_ENABLE)) +		return; + +	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); +	low = be32_to_cpu(readl(target)); +	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); +	high = be32_to_cpu(readl(target)); +	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); +	sizelow = be32_to_cpu(readl(target)); +	target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); +	sizehigh = be32_to_cpu(readl(target)); + +	start = (high << 32) | low; +	limit = (sizehigh << 32) | sizelow; + +	calgary_reserve_mem_region(dev, start, limit); +} + +/* + * some regions of the IO address space do not get translated, so we + * must not give devices IO addresses in those regions. The regions + * are the 640KB-1MB region and the two PCI peripheral memory holes. + * Reserve all of them in the IOMMU bitmap to avoid giving them out + * later. + */ +static void __init calgary_reserve_regions(struct pci_dev *dev) +{ +	unsigned int npages; +	void __iomem *bbar; +	unsigned char busnum; +	u64 start; +	struct iommu_table *tbl = dev->sysdata; + +	bbar = tbl->bbar; +	busnum = dev->bus->number; + +	/* reserve bad_dma_address in case it's a legal address */ +	iommu_range_reserve(tbl, bad_dma_address, 1); + +	/* avoid the BIOS/VGA first 640KB-1MB region */ +	start = (640 * 1024); +	npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; +	iommu_range_reserve(tbl, start, npages); + +	/* reserve the two PCI peripheral memory regions in IO space */ +	calgary_reserve_peripheral_mem_1(dev); +	calgary_reserve_peripheral_mem_2(dev); +} + +static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) +{ +	u64 val64; +	u64 table_phys; +	void __iomem *target; +	int ret; +	struct iommu_table *tbl; + +	/* build TCE tables for each PHB */ +	ret = build_tce_table(dev, bbar); +	if (ret) +		return ret; + +	calgary_reserve_regions(dev); + +	/* set TARs for each PHB */ +	target = calgary_reg(bbar, tar_offset(dev->bus->number)); +	val64 = be64_to_cpu(readq(target)); + +	/* zero out all TAR bits under sw control */ +	val64 &= ~TAR_SW_BITS; + +	tbl = dev->sysdata; +	table_phys = (u64)__pa(tbl->it_base); +	val64 |= table_phys; + +	BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); +	val64 |= (u64) specified_table_size; + +	tbl->tar_val = cpu_to_be64(val64); +	writeq(tbl->tar_val, target); +	readq(target); /* flush */ + +	return 0; +} + +static void __init calgary_free_tar(struct pci_dev *dev) +{ +	u64 val64; +	struct iommu_table *tbl = dev->sysdata; +	void __iomem *target; + +	target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); +	val64 = be64_to_cpu(readq(target)); +	val64 &= ~TAR_SW_BITS; +	writeq(cpu_to_be64(val64), target); +	readq(target); /* flush */ + +	
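+	/* free the iommu_table that build_tce_table() allocated */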
kfree(tbl); +	dev->sysdata = NULL; +} + +static void calgary_watchdog(unsigned long data) +{ +	struct pci_dev *dev = (struct pci_dev *)data; +	struct iommu_table *tbl = dev->sysdata; +	void __iomem *bbar = tbl->bbar; +	u32 val32; +	void __iomem *target; + +	target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); +	val32 = be32_to_cpu(readl(target)); + +	/* If no error, the agent ID in the CSR is not valid */ +	if (val32 & CSR_AGENT_MASK) { +		printk(KERN_EMERG "calgary_watchdog: DMA error on bus %d, " +				  "CSR = %#x\n", dev->bus->number, val32); +		writel(0, target); + +		/* Disable bus that caused the error */ +		target = calgary_reg(bbar, phb_offset(tbl->it_busno) | +					   PHB_CONFIG_RW_OFFSET); +		val32 = be32_to_cpu(readl(target)); +		val32 |= PHB_SLOT_DISABLE; +		writel(cpu_to_be32(val32), target); +		readl(target); /* flush */ +	} else { +		/* Reset the timer */ +		mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); +	} +} + +static void __init calgary_enable_translation(struct pci_dev *dev) +{ +	u32 val32; +	unsigned char busnum; +	void __iomem *target; +	void __iomem *bbar; +	struct iommu_table *tbl; + +	busnum = dev->bus->number; +	tbl = dev->sysdata; +	bbar = tbl->bbar; + +	/* enable TCE in PHB Config Register */ +	target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); +	val32 = be32_to_cpu(readl(target)); +	val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; + +	printk(KERN_INFO "Calgary: enabling translation on PHB %d\n", busnum); +	printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " +	       "bus.\n"); + +	writel(cpu_to_be32(val32), target); +	readl(target); /* flush */ + +	init_timer(&tbl->watchdog_timer); +	tbl->watchdog_timer.function = &calgary_watchdog; +	tbl->watchdog_timer.data = (unsigned long)dev; +	mod_timer(&tbl->watchdog_timer, jiffies); +} + +static void __init calgary_disable_translation(struct pci_dev *dev) +{ +	u32 val32; +	unsigned char busnum; +	void __iomem *target; +	void __iomem *bbar; +	struct iommu_table *tbl; + +	busnum = dev->bus->number; +	tbl = dev->sysdata; +	bbar = tbl->bbar; + +	/* disable TCE in PHB Config Register */ +	target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); +	val32 = be32_to_cpu(readl(target)); +	val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); + +	printk(KERN_INFO "Calgary: disabling translation on PHB %d!\n", busnum); +	writel(cpu_to_be32(val32), target); +	readl(target); /* flush */ + +	del_timer_sync(&tbl->watchdog_timer); +} + +static inline unsigned int __init locate_register_space(struct pci_dev *dev) +{ +	int rionodeid; +	u32 address; + +	rionodeid = (dev->bus->number % 15 > 4) ? 
3 : 2;
+	/*
+	 * register space address calculation as follows:
+	 * START_ADDRESS (0xFE000000) - 8MB*OneBasedChassisNumber
+	 *                            + 1MB*(RioNodeId-ChassisBase)
+	 * ChassisBase is always zero for x366/x260/x460
+	 * RioNodeId is 2 for first Calgary, 3 for second Calgary
+	 * e.g. the first Calgary on the first chassis lives at
+	 * 0xFE000000 - 8MB + 2MB = 0xFDA00000
+	 */
+	address = START_ADDRESS -
+		(0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 15)) +
+		(0x100000) * (rionodeid - CHASSIS_BASE);
+	return address;
+}
+
+static int __init calgary_init_one_nontranslated(struct pci_dev *dev)
+{
+	dev->sysdata = NULL;
+	dev->bus->self = dev;
+
+	return 0;
+}
+
+static int __init calgary_init_one(struct pci_dev *dev)
+{
+	u32 address;
+	void __iomem *bbar;
+	int ret;
+
+	address = locate_register_space(dev);
+	/* map entire 1MB of Calgary config space */
+	bbar = ioremap_nocache(address, 1024 * 1024);
+	if (!bbar) {
+		ret = -ENODATA;
+		goto done;
+	}
+
+	ret = calgary_setup_tar(dev, bbar);
+	if (ret)
+		goto iounmap;
+
+	dev->bus->self = dev;
+	calgary_enable_translation(dev);
+
+	return 0;
+
+iounmap:
+	iounmap(bbar);
+done:
+	return ret;
+}
+
+static int __init calgary_init(void)
+{
+	int i, ret = -ENODEV;
+	struct pci_dev *dev = NULL;
+
+	for (i = 0; i <= num_online_nodes() * MAX_NUM_OF_PHBS; i++) {
+		dev = pci_get_device(PCI_VENDOR_ID_IBM,
+				     PCI_DEVICE_ID_IBM_CALGARY,
+				     dev);
+		if (!dev)
+			break;
+		if (!translate_phb(dev)) {
+			calgary_init_one_nontranslated(dev);
+			continue;
+		}
+		if (!tce_table_kva[i] && !translate_empty_slots) {
+			pci_dev_put(dev);
+			continue;
+		}
+		ret = calgary_init_one(dev);
+		if (ret)
+			goto error;
+	}
+
+	return ret;
+
+error:
+	for (i--; i >= 0; i--) {
+		dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM,
+					      PCI_DEVICE_ID_IBM_CALGARY,
+					      dev);
+		if (!translate_phb(dev)) {
+			pci_dev_put(dev);
+			continue;
+		}
+		if (!tce_table_kva[i] && !translate_empty_slots)
+			continue;
+		calgary_disable_translation(dev);
+		calgary_free_tar(dev);
+		pci_dev_put(dev);
+	}
+
+	return ret;
+}
+
+static inline int __init determine_tce_table_size(u64 ram)
+{
+	int ret;
+
+	if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED)
+		return specified_table_size;
+
+	/*
+	 * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to
+	 * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each
+	 * larger table size has twice as many entries, so shift the
+	 * max ram address by 13 to divide by 8K and then look at the
+	 * order of the result to choose between 0-7. E.g. 2GB of ram
+	 * gives get_order(2GB >> 13) = get_order(256KB) = 6, i.e.
+	 * TCE_TABLE_SIZE_4M.
+	 */
+	ret = get_order(ram >> 13);
+	if (ret > TCE_TABLE_SIZE_8M)
+		ret = TCE_TABLE_SIZE_8M;
+
+	return ret;
+}
+
+void __init detect_calgary(void)
+{
+	u32 val;
+	int bus, table_idx;
+	void *tbl;
+	int detected = 0;
+
+	/*
+	 * if the user specified iommu=off or iommu=soft or we found
+	 * another HW IOMMU already, bail out.
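+	 * (iommu=soft forces swiotlb on, iommu=off sets no_iommu, and
+	 * iommu_detected means some other HW IOMMU was already found)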
+	 */ +	if (swiotlb || no_iommu || iommu_detected) +		return; + +	specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); + +	for (bus = 0, table_idx = 0; +	     bus <= num_online_nodes() * MAX_PHB_BUS_NUM; +	     bus++) { +		BUG_ON(bus > MAX_NUMNODES * MAX_PHB_BUS_NUM); +		if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) +			continue; +		if (test_bit(bus, translation_disabled)) { +			printk(KERN_INFO "Calgary: translation is disabled for " +			       "PHB 0x%x\n", bus); +			/* skip this phb, don't allocate a tbl for it */ +			tce_table_kva[table_idx] = NULL; +			table_idx++; +			continue; +		} +		/* +		 * scan the first slot of the PCI bus to see if there +		 * are any devices present +		 */ +		val = read_pci_config(bus, 1, 0, 0); +		if (val != 0xffffffff || translate_empty_slots) { +			tbl = alloc_tce_table(); +			if (!tbl) +				goto cleanup; +			detected = 1; +		} else +			tbl = NULL; + +		tce_table_kva[table_idx] = tbl; +		table_idx++; +	} + +	if (detected) { +		iommu_detected = 1; +		calgary_detected = 1; +		printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected. " +		       "TCE table spec is %d.\n", specified_table_size); +	} +	return; + +cleanup: +	for (--table_idx; table_idx >= 0; --table_idx) +		if (tce_table_kva[table_idx]) +			free_tce_table(tce_table_kva[table_idx]); +} + +int __init calgary_iommu_init(void) +{ +	int ret; + +	if (no_iommu || swiotlb) +		return -ENODEV; + +	if (!calgary_detected) +		return -ENODEV; + +	/* ok, we're trying to use Calgary - let's roll */ +	printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); + +	ret = calgary_init(); +	if (ret) { +		printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " +		       "falling back to no_iommu\n", ret); +		if (end_pfn > MAX_DMA32_PFN) +			printk(KERN_ERR "WARNING more than 4GB of memory, " +					"32bit PCI may malfunction.\n"); +		return ret; +	} + +	force_iommu = 1; +	dma_ops = &calgary_dma_ops; + +	return 0; +} + +static int __init calgary_parse_options(char *p) +{ +	unsigned int bridge; +	size_t len; +	char* endp; + +	while (*p) { +		if (!strncmp(p, "64k", 3)) +			specified_table_size = TCE_TABLE_SIZE_64K; +		else if (!strncmp(p, "128k", 4)) +			specified_table_size = TCE_TABLE_SIZE_128K; +		else if (!strncmp(p, "256k", 4)) +			specified_table_size = TCE_TABLE_SIZE_256K; +		else if (!strncmp(p, "512k", 4)) +			specified_table_size = TCE_TABLE_SIZE_512K; +		else if (!strncmp(p, "1M", 2)) +			specified_table_size = TCE_TABLE_SIZE_1M; +		else if (!strncmp(p, "2M", 2)) +			specified_table_size = TCE_TABLE_SIZE_2M; +		else if (!strncmp(p, "4M", 2)) +			specified_table_size = TCE_TABLE_SIZE_4M; +		else if (!strncmp(p, "8M", 2)) +			specified_table_size = TCE_TABLE_SIZE_8M; + +		len = strlen("translate_empty_slots"); +		if (!strncmp(p, "translate_empty_slots", len)) +			translate_empty_slots = 1; + +		len = strlen("disable"); +		if (!strncmp(p, "disable", len)) { +			p += len; +			if (*p == '=') +				++p; +			if (*p == '\0') +				break; +			bridge = simple_strtol(p, &endp, 0); +			if (p == endp) +				break; + +			if (bridge <= (num_online_nodes() * MAX_PHB_BUS_NUM)) { +				printk(KERN_INFO "Calgary: disabling " +				       "translation for PHB 0x%x\n", bridge); +				set_bit(bridge, translation_disabled); +			} +		} + +		p = strpbrk(p, ","); +		if (!p) +			break; + +		p++; /* skip ',' */ +	} +	return 1; +} +__setup("calgary=", calgary_parse_options); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index a9275c9557cf..9c44f4f2433d 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ 
b/arch/x86_64/kernel/pci-dma.c @@ -9,6 +9,7 @@  #include <linux/module.h>  #include <asm/io.h>  #include <asm/proto.h> +#include <asm/calgary.h>  int iommu_merge __read_mostly = 0;  EXPORT_SYMBOL(iommu_merge); @@ -33,12 +34,15 @@ int panic_on_overflow __read_mostly = 0;  int force_iommu __read_mostly= 0;  #endif +/* Set this to 1 if there is a HW IOMMU in the system */ +int iommu_detected __read_mostly = 0; +  /* Dummy device used for NULL arguments (normally ISA). Better would     be probably a smaller DMA mask, but this is bug-to-bug compatible     to i386. */  struct device fallback_dev = {  	.bus_id = "fallback device", -	.coherent_dma_mask = 0xffffffff, +	.coherent_dma_mask = DMA_32BIT_MASK,  	.dma_mask = &fallback_dev.coherent_dma_mask,  }; @@ -77,7 +81,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,  		dev = &fallback_dev;  	dma_mask = dev->coherent_dma_mask;  	if (dma_mask == 0) -		dma_mask = 0xffffffff; +		dma_mask = DMA_32BIT_MASK;  	/* Don't invoke OOM killer */  	gfp |= __GFP_NORETRY; @@ -90,7 +94,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,  	   larger than 16MB and in this case we have a chance of  	   finding fitting memory in the next higher zone first. If  	   not retry with true GFP_DMA. -AK */ -	if (dma_mask <= 0xffffffff) +	if (dma_mask <= DMA_32BIT_MASK)  		gfp |= GFP_DMA32;   again: @@ -111,7 +115,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,  			/* Don't use the 16MB ZONE_DMA unless absolutely  			   needed. It's better to use remapping first. */ -			if (dma_mask < 0xffffffff && !(gfp & GFP_DMA)) { +			if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {  				gfp = (gfp & ~GFP_DMA32) | GFP_DMA;  				goto again;  			} @@ -174,7 +178,7 @@ int dma_supported(struct device *dev, u64 mask)  	/* Copied from i386. Doesn't make much sense, because it will  	   only work for pci_alloc_coherent.  	   The caller just has to use GFP_DMA in this case. */ -        if (mask < 0x00ffffff) +        if (mask < DMA_24BIT_MASK)                  return 0;  	/* Tell the device to use SAC when IOMMU force is on.  This @@ -189,7 +193,7 @@ int dma_supported(struct device *dev, u64 mask)  	   SAC for these.  Assume all masks <= 40 bits are of this  	   type. Normally this doesn't make any difference, but gives  	   more gentle handling of IOMMU overflow. 
*/ -	if (iommu_sac_force && (mask >= 0xffffffffffULL)) { +	if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {  		printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);  		return 0;  	} @@ -266,7 +270,7 @@ __init int iommu_setup(char *p)  		    swiotlb = 1;  #endif -#ifdef CONFIG_GART_IOMMU +#ifdef CONFIG_IOMMU  	    gart_parse_options(p);  #endif @@ -276,3 +280,40 @@ __init int iommu_setup(char *p)      }      return 1;  } +__setup("iommu=", iommu_setup); + +void __init pci_iommu_alloc(void) +{ +	/* +	 * The order of these functions is important for +	 * fall-back/fail-over reasons +	 */ +#ifdef CONFIG_IOMMU +	iommu_hole_init(); +#endif + +#ifdef CONFIG_CALGARY_IOMMU +	detect_calgary(); +#endif + +#ifdef CONFIG_SWIOTLB +	pci_swiotlb_init(); +#endif +} + +static int __init pci_iommu_init(void) +{ +#ifdef CONFIG_CALGARY_IOMMU +	calgary_iommu_init(); +#endif + +#ifdef CONFIG_IOMMU +	gart_iommu_init(); +#endif + +	no_iommu_init(); +	return 0; +} + +/* Must execute after PCI subsystem */ +fs_initcall(pci_iommu_init); diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 82a7c9bfdfa0..4ca674d16b09 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -32,6 +32,7 @@  #include <asm/kdebug.h>  #include <asm/swiotlb.h>  #include <asm/dma.h> +#include <asm/k8.h>  unsigned long iommu_bus_base;	/* GART remapping area (physical) */  static unsigned long iommu_size; 	/* size of remapping area bytes */ @@ -46,8 +47,6 @@ u32 *iommu_gatt_base; 		/* Remapping table */     also seen with Qlogic at least). */  int iommu_fullflush = 1; -#define MAX_NB 8 -  /* Allocation bitmap for the remapping area */   static DEFINE_SPINLOCK(iommu_bitmap_lock);  static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ @@ -63,13 +62,6 @@ static u32 gart_unmapped_entry;  #define to_pages(addr,size) \  	(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) -#define for_all_nb(dev) \ -	dev = NULL;	\ -	while ((dev = pci_get_device(PCI_VENDOR_ID_AMD, 0x1103, dev))!=NULL) - -static struct pci_dev *northbridges[MAX_NB]; -static u32 northbridge_flush_word[MAX_NB]; -  #define EMERGENCY_PAGES 32 /* = 128KB */   #ifdef CONFIG_AGP @@ -93,7 +85,7 @@ static unsigned long alloc_iommu(int size)  	offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size);  	if (offset == -1) {  		need_flush = 1; -	       	offset = find_next_zero_string(iommu_gart_bitmap,0,next_bit,size); +		offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size);  	}  	if (offset != -1) {   		set_bit_string(iommu_gart_bitmap, offset, size);  @@ -120,44 +112,17 @@ static void free_iommu(unsigned long offset, int size)  /*    * Use global flush state to avoid races with multiple flushers.   */ -static void flush_gart(struct device *dev) +static void flush_gart(void)  {   	unsigned long flags; -	int flushed = 0; -	int i, max; -  	spin_lock_irqsave(&iommu_bitmap_lock, flags); -	if (need_flush) {  -		max = 0; -		for (i = 0; i < MAX_NB; i++) { -			if (!northbridges[i])  -				continue; -			pci_write_config_dword(northbridges[i], 0x9c,  -					       northbridge_flush_word[i] | 1);  -			flushed++; -			max = i; -		} -		for (i = 0; i <= max; i++) { -			u32 w; -			if (!northbridges[i]) -				continue; -			/* Make sure the hardware actually executed the flush. 
*/ -			for (;;) {  -				pci_read_config_dword(northbridges[i], 0x9c, &w); -				if (!(w & 1)) -					break; -				cpu_relax(); -			} -		}  -		if (!flushed)  -			printk("nothing to flush?\n"); +	if (need_flush) { +		k8_flush_garts();  		need_flush = 0;  	}   	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);  }  - -  #ifdef CONFIG_IOMMU_LEAK  #define SET_LEAK(x) if (iommu_leak_tab) \ @@ -266,7 +231,7 @@ static dma_addr_t gart_map_simple(struct device *dev, char *buf,  				 size_t size, int dir)  {  	dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); -	flush_gart(dev); +	flush_gart();  	return map;  } @@ -289,6 +254,28 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir)  }  /* + * Free a DMA mapping. + */ +void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, +		      size_t size, int direction) +{ +	unsigned long iommu_page; +	int npages; +	int i; + +	if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || +	    dma_addr >= iommu_bus_base + iommu_size) +		return; +	iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; +	npages = to_pages(dma_addr, size); +	for (i = 0; i < npages; i++) { +		iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; +		CLEAR_LEAK(iommu_page + i); +	} +	free_iommu(iommu_page, npages); +} + +/*   * Wrapper for pci_unmap_single working with scatterlists.   */  void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) @@ -299,7 +286,7 @@ void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int di  		struct scatterlist *s = &sg[i];  		if (!s->dma_length || !s->length)  			break; -		dma_unmap_single(dev, s->dma_address, s->dma_length, dir); +		gart_unmap_single(dev, s->dma_address, s->dma_length, dir);  	}  } @@ -329,7 +316,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,  		s->dma_address = addr;  		s->dma_length = s->length;  	} -	flush_gart(dev); +	flush_gart();  	return nents;  } @@ -436,13 +423,13 @@ int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)  	if (dma_map_cont(sg, start, i, sg+out, pages, need) < 0)  		goto error;  	out++; -	flush_gart(dev); +	flush_gart();  	if (out < nents)   		sg[out].dma_length = 0;   	return out;  error: -	flush_gart(NULL); +	flush_gart();  	gart_unmap_sg(dev, sg, nents, dir);  	/* When it was forced or merged try again in a dumb way */  	if (force_iommu || iommu_merge) { @@ -458,28 +445,6 @@ error:  	return 0;  }  -/* - * Free a DMA mapping. 
- */  -void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, -		      size_t size, int direction) -{ -	unsigned long iommu_page;  -	int npages; -	int i; - -	if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE ||  -	    dma_addr >= iommu_bus_base + iommu_size) -		return; -	iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;	 -	npages = to_pages(dma_addr, size); -	for (i = 0; i < npages; i++) {  -		iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;  -		CLEAR_LEAK(iommu_page + i); -	} -	free_iommu(iommu_page, npages); -} -  static int no_agp;  static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -532,10 +497,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)  	void *gatt;  	unsigned aper_base, new_aper_base;  	unsigned aper_size, gatt_size, new_aper_size; -	 +	int i; +  	printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");  	aper_size = aper_base = info->aper_size = 0; -	for_all_nb(dev) {  +	dev = NULL; +	for (i = 0; i < num_k8_northbridges; i++) { +		dev = k8_northbridges[i];  		new_aper_base = read_aperture(dev, &new_aper_size);   		if (!new_aper_base)   			goto nommu;  @@ -558,11 +526,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info)  		panic("Cannot allocate GATT table");   	memset(gatt, 0, gatt_size);   	agp_gatt_table = gatt; -	 -	for_all_nb(dev) {  + +	for (i = 0; i < num_k8_northbridges; i++) {  		u32 ctl;   		u32 gatt_reg;  +		dev = k8_northbridges[i];  		gatt_reg = __pa(gatt) >> 12;   		gatt_reg <<= 4;   		pci_write_config_dword(dev, 0x98, gatt_reg); @@ -573,7 +542,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)  		pci_write_config_dword(dev, 0x90, ctl);   	} -	flush_gart(NULL);  +	flush_gart();  	printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10);   	return 0; @@ -602,15 +571,19 @@ static struct dma_mapping_ops gart_dma_ops = {  	.unmap_sg = gart_unmap_sg,  }; -static int __init pci_iommu_init(void) +void __init gart_iommu_init(void)  {   	struct agp_kern_info info;  	unsigned long aper_size;  	unsigned long iommu_start; -	struct pci_dev *dev;  	unsigned long scratch;  	long i; +	if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) { +		printk(KERN_INFO "PCI-GART: No AMD northbridge found.\n"); +		return; +	} +  #ifndef CONFIG_AGP_AMD64  	no_agp = 1;   #else @@ -622,7 +595,11 @@ static int __init pci_iommu_init(void)  #endif	  	if (swiotlb) -		return -1;  +		return; + +	/* Did we detect a different HW IOMMU? */ +	if (iommu_detected && !iommu_aperture) +		return;  	if (no_iommu ||  	    (!force_iommu && end_pfn <= MAX_DMA32_PFN) || @@ -634,15 +611,7 @@ static int __init pci_iommu_init(void)  					"but IOMMU not available.\n"  			       KERN_ERR "WARNING 32bit PCI may malfunction.\n");  		} -		return -1; -	} - -	i = 0; -	for_all_nb(dev) -		i++; -	if (i > MAX_NB) { -		printk(KERN_ERR "PCI-GART: Too many northbridges (%ld). 
Disabled\n", i); -		return -1; +		return;  	}  	printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); @@ -707,26 +676,10 @@ static int __init pci_iommu_init(void)  	for (i = EMERGENCY_PAGES; i < iommu_pages; i++)   		iommu_gatt_base[i] = gart_unmapped_entry; -	for_all_nb(dev) { -		u32 flag;  -		int cpu = PCI_SLOT(dev->devfn) - 24; -		if (cpu >= MAX_NB) -			continue; -		northbridges[cpu] = dev; -		pci_read_config_dword(dev, 0x9c, &flag); /* cache flush word */ -		northbridge_flush_word[cpu] = flag;  -	} -		      -	flush_gart(NULL); - +	flush_gart();  	dma_ops = &gart_dma_ops; - -	return 0;  }  -/* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); -  void gart_parse_options(char *p)  {  	int arg; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 1f6ecc62061d..c4c3cc36ac5b 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -4,6 +4,8 @@  #include <linux/init.h>  #include <linux/pci.h>  #include <linux/string.h> +#include <linux/dma-mapping.h> +  #include <asm/proto.h>  #include <asm/processor.h>  #include <asm/dma.h> @@ -12,10 +14,11 @@ static int  check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)  {          if (hwdev && bus + size > *hwdev->dma_mask) { -		if (*hwdev->dma_mask >= 0xffffffffULL) +		if (*hwdev->dma_mask >= DMA_32BIT_MASK)  			printk(KERN_ERR -			    "nommu_%s: overflow %Lx+%lu of device mask %Lx\n", -	       			name, (long long)bus, size, (long long)*hwdev->dma_mask); +			    "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", +				name, (long long)bus, size, +				(long long)*hwdev->dma_mask);  		return 0;  	}  	return 1; diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index 990ed67896f2..ebdb77fe2057 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -31,7 +31,7 @@ struct dma_mapping_ops swiotlb_dma_ops = {  void pci_swiotlb_init(void)  {  	/* don't initialize swiotlb if iommu=off (no_iommu=1) */ -	if (!iommu_aperture && !no_iommu && +	if (!iommu_detected && !no_iommu &&  	    (end_pfn > MAX_DMA32_PFN || force_iommu))  	       swiotlb = 1;  	if (swiotlb) { diff --git a/arch/x86_64/kernel/pmtimer.c b/arch/x86_64/kernel/pmtimer.c index bf421ed26808..7554458dc9cb 100644 --- a/arch/x86_64/kernel/pmtimer.c +++ b/arch/x86_64/kernel/pmtimer.c @@ -27,7 +27,7 @@  /* The I/O port the PMTMR resides at.   * The location is detected during setup_arch(),   * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport; +u32 pmtmr_ioport __read_mostly;  /* value of the Power timer at last timer interrupt */  static u32 offset_delay; diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index fb903e65e079..ca56e19b8b6e 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -10,7 +10,6 @@   *	Andi Kleen.   *   *	CPU hotplug support - ashok.raj@intel.com - *  $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $   */  /* @@ -64,6 +63,7 @@ EXPORT_SYMBOL(boot_option_idle_override);   * Powermanagement idle function, if any..   
*/  void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle);  static DEFINE_PER_CPU(unsigned int, cpu_idle_state);  static ATOMIC_NOTIFIER_HEAD(idle_notifier); @@ -111,7 +111,7 @@ static void default_idle(void)  {  	local_irq_enable(); -	clear_thread_flag(TIF_POLLING_NRFLAG); +	current_thread_info()->status &= ~TS_POLLING;  	smp_mb__after_clear_bit();  	while (!need_resched()) {  		local_irq_disable(); @@ -120,7 +120,7 @@ static void default_idle(void)  		else  			local_irq_enable();  	} -	set_thread_flag(TIF_POLLING_NRFLAG); +	current_thread_info()->status |= TS_POLLING;  }  /* @@ -203,8 +203,7 @@ static inline void play_dead(void)   */  void cpu_idle (void)  { -	set_thread_flag(TIF_POLLING_NRFLAG); - +	current_thread_info()->status |= TS_POLLING;  	/* endless idle loop with no priority at all */  	while (1) {  		while (!need_resched()) { @@ -335,7 +334,7 @@ void show_regs(struct pt_regs *regs)  {  	printk("CPU %d:", smp_processor_id());  	__show_regs(regs); -	show_trace(®s->rsp); +	show_trace(NULL, regs, (void *)(regs + 1));  }  /* @@ -365,8 +364,11 @@ void flush_thread(void)  	struct task_struct *tsk = current;  	struct thread_info *t = current_thread_info(); -	if (t->flags & _TIF_ABI_PENDING) +	if (t->flags & _TIF_ABI_PENDING) {  		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); +		if (t->flags & _TIF_IA32) +			current_thread_info()->status |= TS_COMPAT; +	}  	tsk->thread.debugreg0 = 0;  	tsk->thread.debugreg1 = 0; diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 57117b8beb2b..2d6769847456 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -20,6 +20,7 @@   * Power off function, if any   */  void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off);  static long no_idt[3];  static enum {  diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index fb850b52b4da..1129918ede82 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -5,8 +5,6 @@   *   *  Nov 2001 Dave Jones <davej@suse.de>   *  Forked from i386 setup code. - * - *  $Id$   */  /* @@ -65,9 +63,7 @@  #include <asm/setup.h>  #include <asm/mach_apic.h>  #include <asm/numa.h> -#include <asm/swiotlb.h>  #include <asm/sections.h> -#include <asm/gart-mapping.h>  #include <asm/dmi.h>  /* @@ -75,6 +71,7 @@   */  struct cpuinfo_x86 boot_cpu_data __read_mostly; +EXPORT_SYMBOL(boot_cpu_data);  unsigned long mmu_cr4_features; @@ -103,12 +100,14 @@ char dmi_alloc_data[DMI_MAX_DATA];   * Setup options   */  struct screen_info screen_info; +EXPORT_SYMBOL(screen_info);  struct sys_desc_table_struct {  	unsigned short length;  	unsigned char table[0];  };  struct edid_info edid_info; +EXPORT_SYMBOL_GPL(edid_info);  struct e820map e820;  extern int root_mountflags; @@ -473,80 +472,6 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)  }   #endif -/* Use inline assembly to define this because the nops are defined  -   as inline assembly strings in the include files and we cannot  -   get them easily into strings. */ -asm("\t.data\nk8nops: "  -    K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 -    K8_NOP7 K8_NOP8);  -     -extern unsigned char k8nops[]; -static unsigned char *k8_nops[ASM_NOP_MAX+1] = {  -     NULL, -     k8nops, -     k8nops + 1, -     k8nops + 1 + 2, -     k8nops + 1 + 2 + 3, -     k8nops + 1 + 2 + 3 + 4, -     k8nops + 1 + 2 + 3 + 4 + 5, -     k8nops + 1 + 2 + 3 + 4 + 5 + 6, -     k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -};  - -extern char __vsyscall_0; - -/* Replace instructions with better alternatives for this CPU type. 
- -   This runs before SMP is initialized to avoid SMP problems with -   self modifying code. This implies that assymetric systems where -   APs have less capabilities than the boot processor are not handled.  -   In this case boot with "noreplacement". */  -void apply_alternatives(void *start, void *end)  -{  -	struct alt_instr *a;  -	int diff, i, k; -	for (a = start; (void *)a < end; a++) {  -		u8 *instr; - -		if (!boot_cpu_has(a->cpuid)) -			continue; - -		BUG_ON(a->replacementlen > a->instrlen);  -		instr = a->instr; -		/* vsyscall code is not mapped yet. resolve it manually. */ -		if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) -			instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); -		__inline_memcpy(instr, a->replacement, a->replacementlen); -		diff = a->instrlen - a->replacementlen;  - -		/* Pad the rest with nops */ -		for (i = a->replacementlen; diff > 0; diff -= k, i += k) { -			k = diff; -			if (k > ASM_NOP_MAX) -				k = ASM_NOP_MAX; -			__inline_memcpy(instr + i, k8_nops[k], k); -		}  -	} -}  - -static int no_replacement __initdata = 0;  -  -void __init alternative_instructions(void) -{ -	extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -	if (no_replacement)  -		return; -	apply_alternatives(__alt_instructions, __alt_instructions_end); -} - -static int __init noreplacement_setup(char *s) -{  -     no_replacement = 1;  -     return 1; -}  - -__setup("noreplacement", noreplacement_setup);  -  #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)  struct edd edd;  #ifdef CONFIG_EDD_MODULE @@ -779,10 +704,6 @@ void __init setup_arch(char **cmdline_p)  	e820_setup_gap(); -#ifdef CONFIG_GART_IOMMU -	iommu_hole_init(); -#endif -  #ifdef CONFIG_VT  #if defined(CONFIG_VGA_CONSOLE)  	conswitchp = &vga_con; @@ -867,24 +788,32 @@ static int nearby_node(int apicid)  static void __init amd_detect_cmp(struct cpuinfo_x86 *c)  {  #ifdef CONFIG_SMP -	int cpu = smp_processor_id();  	unsigned bits;  #ifdef CONFIG_NUMA +	int cpu = smp_processor_id();  	int node = 0;  	unsigned apicid = hard_smp_processor_id();  #endif +	unsigned ecx = cpuid_ecx(0x80000008); + +	c->x86_max_cores = (ecx & 0xff) + 1; -	bits = 0; -	while ((1 << bits) < c->x86_max_cores) -		bits++; +	/* CPU telling us the core id bits shift? */ +	bits = (ecx >> 12) & 0xF; + +	/* Otherwise recompute */ +	if (bits == 0) { +		while ((1 << bits) < c->x86_max_cores) +			bits++; +	}  	/* Low order bits define the core id (index of core in socket) */ -	cpu_core_id[cpu] = phys_proc_id[cpu] & ((1 << bits)-1); +	c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);  	/* Convert the APIC ID into the socket ID */ -	phys_proc_id[cpu] = phys_pkg_id(bits); +	c->phys_proc_id = phys_pkg_id(bits);  #ifdef CONFIG_NUMA -  	node = phys_proc_id[cpu]; +  	node = c->phys_proc_id;   	if (apicid_to_node[apicid] != NUMA_NO_NODE)   		node = apicid_to_node[apicid];   	if (!node_online(node)) { @@ -897,7 +826,7 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)   		   but in the same order as the HT nodeids.   		   If that doesn't result in a usable node fall back to the   		   path for the previous case.  
*/ - 		int ht_nodeid = apicid - (phys_proc_id[0] << bits); + 		int ht_nodeid = apicid - (cpu_data[0].phys_proc_id << bits);   		if (ht_nodeid >= 0 &&   		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)   			node = apicid_to_node[ht_nodeid]; @@ -907,15 +836,13 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)   	}  	numa_set_node(cpu, node); -  	printk(KERN_INFO "CPU %d/%x(%d) -> Node %d -> Core %d\n", -  			cpu, apicid, c->x86_max_cores, node, cpu_core_id[cpu]); +	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);  #endif  #endif  } -static int __init init_amd(struct cpuinfo_x86 *c) +static void __init init_amd(struct cpuinfo_x86 *c)  { -	int r;  	unsigned level;  #ifdef CONFIG_SMP @@ -948,8 +875,8 @@ static int __init init_amd(struct cpuinfo_x86 *c)  	if (c->x86 >= 6)  		set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); -	r = get_model_name(c); -	if (!r) {  +	level = get_model_name(c); +	if (!level) {  		switch (c->x86) {   		case 15:  			/* Should distinguish Models here, but this is only @@ -964,13 +891,12 @@ static int __init init_amd(struct cpuinfo_x86 *c)  	if (c->x86_power & (1<<8))  		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); -	if (c->extended_cpuid_level >= 0x80000008) { -		c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - +	/* Multi core CPU? */ +	if (c->extended_cpuid_level >= 0x80000008)  		amd_detect_cmp(c); -	} -	return r; +	/* Fix cpuid4 emulation for more */ +	num_cache_leaves = 3;  }  static void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -978,13 +904,14 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)  #ifdef CONFIG_SMP  	u32 	eax, ebx, ecx, edx;  	int 	index_msb, core_bits; -	int 	cpu = smp_processor_id();  	cpuid(1, &eax, &ebx, &ecx, &edx); -	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) +	if (!cpu_has(c, X86_FEATURE_HT))  		return; + 	if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) +		goto out;  	smp_num_siblings = (ebx & 0xff0000) >> 16; @@ -999,10 +926,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)  		}  		index_msb = get_count_order(smp_num_siblings); -		phys_proc_id[cpu] = phys_pkg_id(index_msb); - -		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n", -		       phys_proc_id[cpu]); +		c->phys_proc_id = phys_pkg_id(index_msb);  		smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -1010,13 +934,15 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)  		core_bits = get_count_order(c->x86_max_cores); -		cpu_core_id[cpu] = phys_pkg_id(index_msb) & +		c->cpu_core_id = phys_pkg_id(index_msb) &  					       ((1 << core_bits) - 1); - -		if (c->x86_max_cores > 1) -			printk(KERN_INFO  "CPU: Processor Core ID: %d\n", -			       cpu_core_id[cpu]);  	} +out: +	if ((c->x86_max_cores * smp_num_siblings) > 1) { +		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n", c->phys_proc_id); +		printk(KERN_INFO  "CPU: Processor Core ID: %d\n", c->cpu_core_id); +	} +  #endif  } @@ -1025,15 +951,12 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)   */  static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)  { -	unsigned int eax; +	unsigned int eax, t;  	if (c->cpuid_level < 4)  		return 1; -	__asm__("cpuid" -		: "=a" (eax) -		: "0" (4), "c" (0) -		: "bx", "dx"); +	cpuid_count(4, 0, &eax, &t, &t, &t);  	if (eax & 0x1f)  		return ((eax >> 26) + 1); @@ -1046,16 +969,17 @@ static void srat_detect_node(void)  #ifdef CONFIG_NUMA  	unsigned node;  	int cpu = smp_processor_id(); +	int apicid = hard_smp_processor_id();  	/* Don't do the funky fallback heuristics the AMD version 
employs  	   for now. */ -	node = apicid_to_node[hard_smp_processor_id()]; +	node = apicid_to_node[apicid];  	if (node == NUMA_NO_NODE)  		node = first_node(node_online_map);  	numa_set_node(cpu, node);  	if (acpi_numa > 0) -		printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node); +		printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);  #endif  } @@ -1065,6 +989,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)  	unsigned n;  	init_intel_cacheinfo(c); +	if (c->cpuid_level > 9 ) { +		unsigned eax = cpuid_eax(10); +		/* Check for version and the number of counters */ +		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) +			set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); +	} +  	n = c->extended_cpuid_level;  	if (n >= 0x80000008) {  		unsigned eax = cpuid_eax(0x80000008); @@ -1156,7 +1087,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)  	}  #ifdef CONFIG_SMP -	phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; +	c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;  #endif  } @@ -1283,7 +1214,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)  		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,  		NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,  		NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, -		NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", +		NULL, "fxsr_opt", NULL, "rdtscp", NULL, "lm", "3dnowext", "3dnow",  		/* Transmeta-defined */  		"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, @@ -1294,7 +1225,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)  		/* Other (Linux-defined) */  		"cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL,  		"constant_tsc", NULL, NULL, -		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +		"up", NULL, NULL, NULL, NULL, NULL, NULL, NULL,  		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,  		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1364,9 +1295,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)  #ifdef CONFIG_SMP  	if (smp_num_siblings * c->x86_max_cores > 1) {  		int cpu = c - cpu_data; -		seq_printf(m, "physical id\t: %d\n", phys_proc_id[cpu]); +		seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);  		seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[cpu])); -		seq_printf(m, "core id\t\t: %d\n", cpu_core_id[cpu]); +		seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);  		seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);  	}  #endif	 @@ -1440,7 +1371,7 @@ struct seq_operations cpuinfo_op = {  	.show =	show_cpuinfo,  }; -#ifdef CONFIG_INPUT_PCSPKR +#if defined(CONFIG_INPUT_PCSPKR) || defined(CONFIG_INPUT_PCSPKR_MODULE)  #include <linux/platform_device.h>  static __init int add_pcspkr(void)  { diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 8a691fa6d393..f5934cb4a2b6 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -3,7 +3,6 @@   * Copyright (C) 1995  Linus Torvalds   * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen.   * See setup.c for older changelog. 
- * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $   */   #include <linux/config.h>  #include <linux/init.h> @@ -31,6 +30,7 @@ char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};  cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;  struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(_cpu_pda);  struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;  struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; @@ -38,6 +38,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };  char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));  unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL(__supported_pte_mask);  static int do_not_nx __cpuinitdata = 0;  /* noexec=on|off diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index e5f5ce7909a3..28161170fb0a 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -7,8 +7,6 @@   *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson   *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes   *  2000-2002   x86-64 support by Andi Kleen - *  - *  $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $   */  #include <linux/sched.h> @@ -239,7 +237,6 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)  	rsp = regs->rsp - 128;  	/* This is the X/Open sanctioned signal stack switching.  */ -	/* RED-PEN: redzone on that stack? */  	if (ka->sa.sa_flags & SA_ONSTACK) {  		if (sas_ss_flags(rsp) == 0)  			rsp = current->sas_ss_sp + current->sas_ss_size; diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index e369a3426df4..acee4bc3f6fa 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -224,6 +224,7 @@ void flush_tlb_current_task(void)  		flush_tlb_others(cpu_mask, mm, FLUSH_ALL);  	preempt_enable();  } +EXPORT_SYMBOL(flush_tlb_current_task);  void flush_tlb_mm (struct mm_struct * mm)  { @@ -244,6 +245,7 @@ void flush_tlb_mm (struct mm_struct * mm)  	preempt_enable();  } +EXPORT_SYMBOL(flush_tlb_mm);  void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)  { @@ -266,6 +268,7 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)  	preempt_enable();  } +EXPORT_SYMBOL(flush_tlb_page);  static void do_flush_tlb_all(void* info)  { @@ -443,6 +446,7 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,  	spin_unlock(&call_lock);  	return 0;  } +EXPORT_SYMBOL(smp_call_function);  void smp_stop_cpu(void)  { @@ -460,7 +464,7 @@ static void smp_really_stop_cpu(void *dummy)  {  	smp_stop_cpu();   	for (;;)  -		asm("hlt");  +		halt();  }   void smp_send_stop(void) @@ -520,13 +524,13 @@ asmlinkage void smp_call_function_interrupt(void)  int safe_smp_processor_id(void)  { -	int apicid, i; +	unsigned apicid, i;  	if (disable_apic)  		return 0;  	apicid = hard_smp_processor_id(); -	if (x86_cpu_to_apicid[apicid] == apicid) +	if (apicid < NR_CPUS && x86_cpu_to_apicid[apicid] == apicid)  		return apicid;  	for (i = 0; i < NR_CPUS; ++i) { diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 71a7222cf9ce..4e9755179ecf 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -63,13 +63,11 @@  /* Number of siblings per CPU package */  int smp_num_siblings = 1; -/* Package ID of each logical CPU */ -u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; -/* core ID of each logical CPU */ -u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = BAD_APICID }; +EXPORT_SYMBOL(smp_num_siblings);  /* Last level cache ID of each logical CPU */  u8 cpu_llc_id[NR_CPUS] __cpuinitdata  = {[0 ... NR_CPUS-1] = BAD_APICID}; +EXPORT_SYMBOL(cpu_llc_id);  /* Bitmask of currently online CPUs */  cpumask_t cpu_online_map __read_mostly; @@ -82,18 +80,21 @@ EXPORT_SYMBOL(cpu_online_map);   */  cpumask_t cpu_callin_map;  cpumask_t cpu_callout_map; +EXPORT_SYMBOL(cpu_callout_map);  cpumask_t cpu_possible_map;  EXPORT_SYMBOL(cpu_possible_map);  /* Per CPU bogomips and other parameters */  struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_data);  /* Set when the idlers are all forked */  int smp_threads_ready;  /* representing HT siblings of each logical CPU */  cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(cpu_sibling_map);  /* representing HT and core siblings of each logical CPU */  cpumask_t cpu_core_map[NR_CPUS] __read_mostly; @@ -472,8 +473,8 @@ static inline void set_cpu_sibling_map(int cpu)  	if (smp_num_siblings > 1) {  		for_each_cpu_mask(i, cpu_sibling_setup_map) { -			if (phys_proc_id[cpu] == phys_proc_id[i] && -			    cpu_core_id[cpu] == cpu_core_id[i]) { +			if (c[cpu].phys_proc_id == c[i].phys_proc_id && +			    c[cpu].cpu_core_id == c[i].cpu_core_id) {  				cpu_set(i, cpu_sibling_map[cpu]);  				cpu_set(cpu, cpu_sibling_map[i]);  				cpu_set(i, cpu_core_map[cpu]); @@ -500,7 +501,7 @@ static inline void set_cpu_sibling_map(int cpu)  			cpu_set(i, c[cpu].llc_shared_map);  			cpu_set(cpu, c[i].llc_shared_map);  		} -		if (phys_proc_id[cpu] == phys_proc_id[i]) { +		if (c[cpu].phys_proc_id == c[i].phys_proc_id) {  			cpu_set(i, cpu_core_map[cpu]);  			cpu_set(cpu, cpu_core_map[i]);  			/* @@ -797,6 +798,8 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid)  	} +	alternatives_smp_switch(1); +  	c_idle.idle = get_idle_for_cpu(cpu);  	if (c_idle.idle) { @@ -1199,8 +1202,8 @@ static void remove_siblinginfo(int cpu)  		cpu_clear(cpu, cpu_sibling_map[sibling]);  	cpus_clear(cpu_sibling_map[cpu]);  	cpus_clear(cpu_core_map[cpu]); -	phys_proc_id[cpu] = BAD_APICID; -	cpu_core_id[cpu] = BAD_APICID; +	c[cpu].phys_proc_id = 0; +	c[cpu].cpu_core_id = 0;  	cpu_clear(cpu, cpu_sibling_setup_map);  } @@ -1259,6 +1262,8 @@ void __cpu_die(unsigned int cpu)  		/* They ack this in play_dead by setting CPU_DEAD */  		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {  			printk ("CPU %d is now offline\n", cpu); +			if (1 == num_online_cpus()) +				alternatives_smp_switch(0);  			return;  		}  		msleep(100); diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c new file mode 100644 index 000000000000..8d4c67f61b8e --- /dev/null +++ b/arch/x86_64/kernel/tce.c @@ -0,0 +1,202 @@ +/* + * Derived from arch/powerpc/platforms/pseries/iommu.c + * + * Copyright (C) 2006 Jon Mason <jdmason@us.ibm.com>, IBM Corporation + * Copyright (C) 2006 Muli Ben-Yehuda <muli@il.ibm.com>, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/bootmem.h>
+#include <asm/tce.h>
+#include <asm/calgary.h>
+#include <asm/proto.h>
+
+/* flush a tce at 'tceaddr' to main memory */
+static inline void flush_tce(void* tceaddr)
+{
+	/* a single tce can't cross a cache line */
+	if (cpu_has_clflush)
+		asm volatile("clflush (%0)" :: "r" (tceaddr));
+	else
+		asm volatile("wbinvd":::"memory");
+}
+
+void tce_build(struct iommu_table *tbl, unsigned long index,
+	unsigned int npages, unsigned long uaddr, int direction)
+{
+	u64* tp;
+	u64 t;
+	u64 rpn;
+
+	t = (1 << TCE_READ_SHIFT);
+	if (direction != DMA_TO_DEVICE)
+		t |= (1 << TCE_WRITE_SHIFT);
+
+	tp = ((u64*)tbl->it_base) + index;
+
+	while (npages--) {
+		rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT;
+		t &= ~TCE_RPN_MASK;
+		t |= (rpn << TCE_RPN_SHIFT);
+
+		*tp = cpu_to_be64(t);
+		flush_tce(tp);
+
+		uaddr += PAGE_SIZE;
+		tp++;
+	}
+}
+
+void tce_free(struct iommu_table *tbl, long index, unsigned int npages)
+{
+	u64* tp;
+
+	tp = ((u64*)tbl->it_base) + index;
+
+	while (npages--) {
+		*tp = cpu_to_be64(0);
+		flush_tce(tp);
+		tp++;
+	}
+}
+
+static inline unsigned int table_size_to_number_of_entries(unsigned char size)
+{
+	/*
+	 * size is the order of the table, 0-7; the smallest table has
+	 * 8K entries, so shift left by 13 to multiply by 8K. E.g.
+	 * size 6 (TCE_TABLE_SIZE_4M) gives (1 << 6) << 13 = 512K
+	 * entries.
+	 */
+	return (1 << size) << 13;
+}
+
+static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl)
+{
+	unsigned int bitmapsz;
+	unsigned int tce_table_index;
+	unsigned long bmppages;
+	int ret;
+
+	tbl->it_busno = dev->bus->number;
+
+	/* set the tce table size - measured in entries */
+	tbl->it_size = table_size_to_number_of_entries(specified_table_size);
+
+	tce_table_index = bus_to_phb(tbl->it_busno);
+	tbl->it_base = (unsigned long)tce_table_kva[tce_table_index];
+	if (!tbl->it_base) {
+		printk(KERN_ERR "Calgary: tce_table_setparms: "
+		       "no table allocated?!\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	/*
+	 * size of the bitmap in bytes: the table has it_size entries
+	 * and we need one bit per entry
+	 */
+	bitmapsz = tbl->it_size / BITS_PER_BYTE;
+	bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz));
+	if (!bmppages) {
+		printk(KERN_ERR "Calgary: cannot allocate bitmap\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	tbl->it_map = (unsigned long*)bmppages;
+
+	memset(tbl->it_map, 0, bitmapsz);
+
+	tbl->it_hint = 0;
+
+	spin_lock_init(&tbl->it_lock);
+
+	return 0;
+
+done:
+	return ret;
+}
+
+int build_tce_table(struct pci_dev *dev, void __iomem *bbar)
+{
+	struct iommu_table *tbl;
+	int ret;
+
+	if (dev->sysdata) {
+		printk(KERN_ERR "Calgary: dev %p has sysdata %p\n",
+		       dev, dev->sysdata);
+		BUG();
+	}
+
+	tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL);
+	if (!tbl) {
+		printk(KERN_ERR "Calgary: error allocating iommu_table\n");
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	ret = tce_table_setparms(dev, tbl);
+	if (ret)
+		goto free_tbl;
+
+	tce_free(tbl, 0, tbl->it_size);
+
+	tbl->bbar = bbar;
+
+	/*
+	 * NUMA is already using the bus's sysdata pointer, so we use
+	 * the bus's pci_dev's sysdata instead.
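+	 * (calgary_free_tar() undoes this: it kfree()s the table and
+	 * resets dev->sysdata to NULL)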
+	 */ +	dev->sysdata = tbl; + +	return 0; + +free_tbl: +	kfree(tbl); +done: +	return ret; +} + +void* alloc_tce_table(void) +{ +	unsigned int size; + +	size = table_size_to_number_of_entries(specified_table_size); +	size *= TCE_ENTRY_SIZE; + +	return __alloc_bootmem_low(size, size, 0); +} + +void free_tce_table(void *tbl) +{ +	unsigned int size; + +	if (!tbl) +		return; + +	size = table_size_to_number_of_entries(specified_table_size); +	size *= TCE_ENTRY_SIZE; + +	free_bootmem(__pa(tbl), size); +} diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7392570f975d..ebbee6f59ff5 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -8,7 +8,7 @@   *  Copyright (c) 1995  Markus Kuhn   *  Copyright (c) 1996  Ingo Molnar   *  Copyright (c) 1998  Andrea Arcangeli - *  Copyright (c) 2002  Vojtech Pavlik + *  Copyright (c) 2002,2006  Vojtech Pavlik   *  Copyright (c) 2003  Andi Kleen   *  RTC support code taken from arch/i386/kernel/timers/time_hpet.c   */ @@ -51,14 +51,21 @@ extern int using_apic_timer;  static char *time_init_gtod(void);  DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock);  DEFINE_SPINLOCK(i8253_lock);  int nohpet __initdata = 0;  static int notsc __initdata = 0; -#undef HPET_HACK_ENABLE_DANGEROUS +#define USEC_PER_TICK (USEC_PER_SEC / HZ) +#define NSEC_PER_TICK (NSEC_PER_SEC / HZ) +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) + +#define NS_SCALE	10 /* 2^10, carefully chosen */ +#define US_SCALE	32 /* 2^32, arbitralrily chosen */  unsigned int cpu_khz;					/* TSC clocks / usec, not used here */ +EXPORT_SYMBOL(cpu_khz);  static unsigned long hpet_period;			/* fsecs / HPET clock */  unsigned long hpet_tick;				/* HPET clocks / interrupt */  int hpet_use_timer;				/* Use counter of hpet for time keeping, otherwise PIT */ @@ -90,7 +97,7 @@ static inline unsigned int do_gettimeoffset_tsc(void)  	t = get_cycles_sync();  	if (t < vxtime.last_tsc)   		t = vxtime.last_tsc; /* hack */ -	x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32; +	x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE;  	return x;  } @@ -98,7 +105,7 @@ static inline unsigned int do_gettimeoffset_hpet(void)  {  	/* cap counter read to one tick to avoid inconsistencies */  	unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; -	return (min(counter,hpet_tick) * vxtime.quot) >> 32; +	return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE;  }  unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; @@ -118,7 +125,7 @@ void do_gettimeofday(struct timeval *tv)  		seq = read_seqbegin(&xtime_lock);  		sec = xtime.tv_sec; -		usec = xtime.tv_nsec / 1000; +		usec = xtime.tv_nsec / NSEC_PER_USEC;  		/* i386 does some correction here to keep the clock   		   monotonous even when ntpd is fixing drift. @@ -129,14 +136,14 @@ void do_gettimeofday(struct timeval *tv)  		   in arch/x86_64/kernel/vsyscall.c and export all needed  		   variables in vmlinux.lds. 
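do_gettimeoffset_tsc() above relies on fixed-point arithmetic: tsc_quot is precomputed as (USEC_PER_MSEC << US_SCALE) / cpu_khz, so the hot path converts a cycle delta to microseconds with one multiply and one shift, no division. A self-contained check, assuming a 2 GHz clock (it prints 1999, a hair under the exact 2000 us, because tsc_quot is truncated when precomputed):

#include <stdio.h>

#define US_SCALE 32	/* 2^32, as defined above */

int main(void)
{
	unsigned long long cpu_khz = 2000000;	/* assumed 2 GHz CPU */
	unsigned long long tsc_quot = (1000ULL << US_SCALE) / cpu_khz;
	unsigned long long delta = 4000000;	/* 4M cycles ~= 2000 us */

	printf("offset ~= %llu usec\n", (delta * tsc_quot) >> US_SCALE);
	return 0;
}

The 64-bit product stays in range because the callers keep the cycle delta bounded to roughly a tick before converting it.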
-AK */  -		t = (jiffies - wall_jiffies) * (1000000L / HZ) + +		t = (jiffies - wall_jiffies) * USEC_PER_TICK +  			do_gettimeoffset();  		usec += t;  	} while (read_seqretry(&xtime_lock, seq)); -	tv->tv_sec = sec + usec / 1000000; -	tv->tv_usec = usec % 1000000; +	tv->tv_sec = sec + usec / USEC_PER_SEC; +	tv->tv_usec = usec % USEC_PER_SEC;  }  EXPORT_SYMBOL(do_gettimeofday); @@ -157,8 +164,8 @@ int do_settimeofday(struct timespec *tv)  	write_seqlock_irq(&xtime_lock); -	nsec -= do_gettimeoffset() * 1000 + -		(jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ); +	nsec -= do_gettimeoffset() * NSEC_PER_USEC + +		(jiffies - wall_jiffies) * NSEC_PER_TICK;  	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);  	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); @@ -288,7 +295,7 @@ unsigned long long monotonic_clock(void)  			this_offset = hpet_readl(HPET_COUNTER);  		} while (read_seqretry(&xtime_lock, seq));  		offset = (this_offset - last_offset); -		offset *= (NSEC_PER_SEC/HZ) / hpet_tick; +		offset *= NSEC_PER_TICK / hpet_tick;  	} else {  		do {  			seq = read_seqbegin(&xtime_lock); @@ -297,7 +304,8 @@ unsigned long long monotonic_clock(void)  			base = monotonic_base;  		} while (read_seqretry(&xtime_lock, seq));  		this_offset = get_cycles_sync(); -		offset = (this_offset - last_offset)*1000 / cpu_khz;  +		/* FIXME: 1000 or 1000000? */ +		offset = (this_offset - last_offset)*1000 / cpu_khz;  	}  	return base + offset;  } @@ -382,7 +390,7 @@ void main_timer_handler(struct pt_regs *regs)  		}  		monotonic_base +=  -			(offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick; +			(offset - vxtime.last) * NSEC_PER_TICK / hpet_tick;  		vxtime.last = offset;  #ifdef CONFIG_X86_PM_TIMER @@ -391,24 +399,25 @@ void main_timer_handler(struct pt_regs *regs)  #endif  	} else {  		offset = (((tsc - vxtime.last_tsc) * -			   vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ); +			   vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK;  		if (offset < 0)  			offset = 0; -		if (offset > (USEC_PER_SEC / HZ)) { -			lost = offset / (USEC_PER_SEC / HZ); -			offset %= (USEC_PER_SEC / HZ); +		if (offset > USEC_PER_TICK) { +			lost = offset / USEC_PER_TICK; +			offset %= USEC_PER_TICK;  		} -		monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ; +		/* FIXME: 1000 or 1000000? */ +		monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz;  		vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;  		if ((((tsc - vxtime.last_tsc) * -		      vxtime.tsc_quot) >> 32) < offset) +		      vxtime.tsc_quot) >> US_SCALE) < offset)  			vxtime.last_tsc = tsc - -				(((long) offset << 32) / vxtime.tsc_quot) - 1; +				(((long) offset << US_SCALE) / vxtime.tsc_quot) - 1;  	}  	if (lost > 0) { @@ -468,16 +477,15 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)  }  static unsigned int cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */  static inline void set_cyc2ns_scale(unsigned long cpu_khz)  { -	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; +	cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz;  }  static inline unsigned long long cycles_2_ns(unsigned long long cyc)  { -	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +	return (cyc * cyc2ns_scale) >> NS_SCALE;  }  unsigned long long sched_clock(void) @@ -490,7 +498,7 @@ unsigned long long sched_clock(void)             Disadvantage is a small drift between CPUs in some configurations,  	   but that should be tolerable. 
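set_cyc2ns_scale() above uses a 10-bit shift where the gettimeofday path uses 32, and the comment calls 2^10 "carefully chosen": sched_clock() multiplies a raw, potentially huge cycle count by the scale inside 64 bits, so a small shift buys overflow headroom at the price of precision. A runnable illustration with an assumed 2.5 GHz clock:

#include <stdio.h>

#define NS_SCALE 10	/* 2^10, matching the constant above */

int main(void)
{
	unsigned long long cpu_khz = 2500000;	/* assumed 2.5 GHz CPU */
	unsigned long long cyc2ns_scale = (1000000ULL << NS_SCALE) / cpu_khz;
	unsigned long long cyc = 25000000;	/* 25M cycles ~= 10 ms */

	printf("%llu ns\n", (cyc * cyc2ns_scale) >> NS_SCALE);
	return 0;
}

This prints 9985351 rather than the exact 10000000 ns, since truncating the scale (409.6 down to 409) costs about 0.15% here; in exchange, the multiply has months of cycle-count headroom, whereas a 32-bit scale would overflow 64 bits after only a few seconds of cycles.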
*/  	if (__vxtime.mode == VXTIME_HPET) -		return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> 32; +		return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE;  #endif  	/* Could do CPU core sync here. Opteron can execute rdtsc speculatively, @@ -633,7 +641,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,  		cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);  		if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -			vxtime.tsc_quot = (1000L << 32) / cpu_khz; +			vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;  	}  	set_cyc2ns_scale(cpu_khz_ref); @@ -789,8 +797,8 @@ static int hpet_timer_stop_set_go(unsigned long tick)  	if (hpet_use_timer) {  		hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |  		    HPET_TN_32BIT, HPET_T0_CFG); -		hpet_writel(hpet_tick, HPET_T0_CMP); -		hpet_writel(hpet_tick, HPET_T0_CMP); /* AK: why twice? */ +		hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ +		hpet_writel(hpet_tick, HPET_T0_CMP); /* period */  		cfg |= HPET_CFG_LEGACY;  	}  /* @@ -825,8 +833,7 @@ static int hpet_init(void)  	if (hpet_period < 100000 || hpet_period > 100000000)  		return -1; -	hpet_tick = (1000000000L * (USEC_PER_SEC / HZ) + hpet_period / 2) / -		hpet_period; +	hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period;  	hpet_use_timer = (id & HPET_ID_LEGSUP); @@ -890,18 +897,6 @@ void __init time_init(void)  	char *timename;  	char *gtod; -#ifdef HPET_HACK_ENABLE_DANGEROUS -        if (!vxtime.hpet_address) { -		printk(KERN_WARNING "time.c: WARNING: Enabling HPET base " -		       "manually!\n"); -                outl(0x800038a0, 0xcf8); -                outl(0xff000001, 0xcfc); -                outl(0x800038a0, 0xcf8); -                vxtime.hpet_address = inl(0xcfc) & 0xfffffffe; -		printk(KERN_WARNING "time.c: WARNING: Enabled HPET " -		       "at %#lx.\n", vxtime.hpet_address); -        } -#endif  	if (nohpet)  		vxtime.hpet_address = 0; @@ -912,7 +907,7 @@ void __init time_init(void)  	                        -xtime.tv_sec, -xtime.tv_nsec);  	if (!hpet_init()) -                vxtime_hz = (1000000000000000L + hpet_period / 2) / hpet_period; +                vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period;  	else  		vxtime.hpet_address = 0; @@ -941,8 +936,8 @@ void __init time_init(void)  	       vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod);  	printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n",  		cpu_khz / 1000, cpu_khz % 1000); -	vxtime.quot = (1000000L << 32) / vxtime_hz; -	vxtime.tsc_quot = (1000L << 32) / cpu_khz; +	vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; +	vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;  	vxtime.last_tsc = get_cycles_sync();  	setup_irq(0, &irq0); @@ -956,10 +951,10 @@ void __init time_init(void)  __cpuinit int unsynchronized_tsc(void)  {  #ifdef CONFIG_SMP -	if (oem_force_hpet_timer()) +	if (apic_is_clustered_box())  		return 1;   	/* Intel systems are normally all synchronized. Exceptions - 	   are handled in the OEM check above. */ + 	   are handled in the check above. 
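The new hpet_tick formula above, (FSEC_PER_TICK + hpet_period / 2) / hpet_period, is ordinary round-to-nearest integer division: add half the divisor before truncating. A self-contained check with an assumed 14.318 MHz HPET (69841279 fs period) and HZ=250:

#include <stdio.h>

int main(void)
{
	unsigned long long fsec_per_tick = 1000000000000000ULL / 250;
	unsigned long long hpet_period = 69841279;	/* fs per HPET clock */

	printf("truncated: %llu\n", fsec_per_tick / hpet_period);
	printf("rounded:   %llu\n",
	       (fsec_per_tick + hpet_period / 2) / hpet_period);
	return 0;
}

Truncation gives 57272 HPET clocks per tick; rounding gives 57273, the closer value, which keeps the accumulated per-tick error smaller.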
*/   	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)   		return 0;  #endif diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index cea335e8746c..3d11a2fe45b7 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -6,8 +6,6 @@   *   *  Pentium III FXSR, SSE support   *	Gareth Hughes <gareth@valinux.com>, May 2000 - * - *  $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $   */  /* @@ -31,6 +29,7 @@  #include <linux/nmi.h>  #include <linux/kprobes.h>  #include <linux/kexec.h> +#include <linux/unwind.h>  #include <asm/system.h>  #include <asm/uaccess.h> @@ -41,7 +40,7 @@  #include <asm/i387.h>  #include <asm/kdebug.h>  #include <asm/processor.h> - +#include <asm/unwind.h>  #include <asm/smp.h>  #include <asm/pgalloc.h>  #include <asm/pda.h> @@ -71,6 +70,7 @@ asmlinkage void machine_check(void);  asmlinkage void spurious_interrupt_bug(void);  ATOMIC_NOTIFIER_HEAD(die_chain); +EXPORT_SYMBOL(die_chain);  int register_die_notifier(struct notifier_block *nb)  { @@ -107,7 +107,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)  	preempt_enable_no_resched();  } -static int kstack_depth_to_print = 10; +static int kstack_depth_to_print = 12; +static int call_trace = 1;  #ifdef CONFIG_KALLSYMS  #include <linux/kallsyms.h>  @@ -191,6 +192,25 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,  	return NULL;  } +static int show_trace_unwind(struct unwind_frame_info *info, void *context) +{ +	int i = 11, n = 0; + +	while (unwind(info) == 0 && UNW_PC(info)) { +		++n; +		if (i > 50) { +			printk("\n       "); +			i = 7; +		} else +			i += printk(" "); +		i += printk_address(UNW_PC(info)); +		if (arch_unw_user_mode(info)) +			break; +	} +	printk("\n"); +	return n; +} +  /*   * x86-64 can have upto three kernel stacks:    * process stack @@ -198,15 +218,39 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,   * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack   */ -void show_trace(unsigned long *stack) +void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack)  {  	const unsigned cpu = safe_smp_processor_id();  	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; -	int i; +	int i = 11;  	unsigned used = 0;  	printk("\nCall Trace:"); +	if (!tsk) +		tsk = current; + +	if (call_trace >= 0) { +		int unw_ret = 0; +		struct unwind_frame_info info; + +		if (regs) { +			if (unwind_init_frame_info(&info, tsk, regs) == 0) +				unw_ret = show_trace_unwind(&info, NULL); +		} else if (tsk == current) +			unw_ret = unwind_init_running(&info, show_trace_unwind, NULL); +		else { +			if (unwind_init_blocked(&info, tsk) == 0) +				unw_ret = show_trace_unwind(&info, NULL); +		} +		if (unw_ret > 0) { +			if (call_trace > 0) +				return; +			printk("Legacy call trace:"); +			i = 18; +		} +	} +  #define HANDLE_STACK(cond) \  	do while (cond) { \  		unsigned long addr = *stack++; \ @@ -229,7 +273,7 @@ void show_trace(unsigned long *stack)  		} \  	} while (0) -	for(i = 11; ; ) { +	for(; ; ) {  		const char *id;  		unsigned long *estack_end;  		estack_end = in_exception_stack(cpu, (unsigned long)stack, @@ -264,7 +308,7 @@ void show_trace(unsigned long *stack)  	printk("\n");  } -void show_stack(struct task_struct *tsk, unsigned long * rsp) +static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp)  {  	unsigned long *stack;  	int i; @@ -298,7 +342,12 @@ void show_stack(struct task_struct *tsk, unsigned long * 
rsp)  		printk("%016lx ", *stack++);  		touch_nmi_watchdog();  	} -	show_trace((unsigned long *)rsp); +	show_trace(tsk, regs, rsp); +} + +void show_stack(struct task_struct *tsk, unsigned long * rsp) +{ +	_show_stack(tsk, NULL, rsp);  }  /* @@ -307,7 +356,7 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp)  void dump_stack(void)  {  	unsigned long dummy; -	show_trace(&dummy); +	show_trace(NULL, NULL, &dummy);  }  EXPORT_SYMBOL(dump_stack); @@ -334,7 +383,7 @@ void show_registers(struct pt_regs *regs)  	if (in_kernel) {  		printk("Stack: "); -		show_stack(NULL, (unsigned long*)rsp); +		_show_stack(NULL, regs, (unsigned long*)rsp);  		printk("\nCode: ");  		if (regs->rip < PAGE_OFFSET) @@ -383,6 +432,7 @@ void out_of_line_bug(void)  {   	BUG();   }  +EXPORT_SYMBOL(out_of_line_bug);  #endif  static DEFINE_SPINLOCK(die_lock); @@ -1012,3 +1062,14 @@ static int __init kstack_setup(char *s)  }  __setup("kstack=", kstack_setup); +static int __init call_trace_setup(char *s) +{ +	if (strcmp(s, "old") == 0) +		call_trace = -1; +	else if (strcmp(s, "both") == 0) +		call_trace = 0; +	else if (strcmp(s, "new") == 0) +		call_trace = 1; +	return 1; +} +__setup("call_trace=", call_trace_setup); diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index b81f473c4a19..1c6a5f322919 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -45,6 +45,15 @@ SECTIONS    RODATA +#ifdef CONFIG_STACK_UNWIND +  . = ALIGN(8); +  .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { +	__start_unwind = .; +  	*(.eh_frame) +	__end_unwind = .; +  } +#endif +  				/* Data */    .data : AT(ADDR(.data) - LOAD_OFFSET) {  	*(.data) @@ -131,6 +140,26 @@ SECTIONS  	*(.data.page_aligned)    } +  /* might get freed after init */ +  . = ALIGN(4096); +  __smp_alt_begin = .; +  __smp_alt_instructions = .; +  .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { +	*(.smp_altinstructions) +  } +  __smp_alt_instructions_end = .; +  . = ALIGN(8); +  __smp_locks = .; +  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { +	*(.smp_locks) +  } +  __smp_locks_end = .; +  .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { +	*(.smp_altinstr_replacement) +  } +  . = ALIGN(4096); +  __smp_alt_end = .; +    . = ALIGN(4096);		/* Init code and data */    __init_begin = .;    .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 9468fb20b0bc..f603037df162 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -107,7 +107,7 @@ static __always_inline long time_syscall(long *t)  int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)  { -	if (unlikely(!__sysctl_vsyscall)) +	if (!__sysctl_vsyscall)  		return gettimeofday(tv,tz);  	if (tv)  		do_vgettimeofday(tv); @@ -120,7 +120,7 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)   * unlikely */  time_t __vsyscall(1) vtime(time_t *t)  { -	if (unlikely(!__sysctl_vsyscall)) +	if (!__sysctl_vsyscall)  		return time_syscall(t);  	else if (t)  		*t = __xtime.tv_sec;		 diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 1def21c9f7cd..370952c4ff22 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -1,66 +1,21 @@ +/* Exports for assembly files. +   All C exports should go in the respective C files. 
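The call_trace= boot option wired up above is a tri-state: -1 ("old") skips the dwarf unwinder entirely, 0 ("both") prints the unwound trace and then the legacy stack scan, and 1 ("new", the default) stops after a successful unwind. A userspace sketch of that control flow; show_trace_demo and unwind_entries are made up, and the kernel's version drives the real unwinder:

#include <stdio.h>
#include <string.h>

static int call_trace = 1;	/* default: new unwinder only */

static void parse_call_trace(const char *s)
{
	if (strcmp(s, "old") == 0)
		call_trace = -1;
	else if (strcmp(s, "both") == 0)
		call_trace = 0;
	else if (strcmp(s, "new") == 0)
		call_trace = 1;
}

static void show_trace_demo(int unwind_entries)
{
	if (call_trace >= 0 && unwind_entries > 0) {
		printf("unwound %d frames\n", unwind_entries);
		if (call_trace > 0)
			return;			/* "new": trust the unwinder */
		printf("Legacy call trace:\n");	/* "both": fall through */
	}
	printf("scanning stack for kernel text addresses...\n");
}

int main(void)
{
	parse_call_trace("both");
	show_trace_demo(3);
	return 0;
}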
*/ +  #include <linux/config.h>  #include <linux/module.h>  #include <linux/smp.h> -#include <linux/user.h> -#include <linux/sched.h> -#include <linux/in6.h> -#include <linux/interrupt.h> -#include <linux/smp_lock.h> -#include <linux/pm.h> -#include <linux/pci.h> -#include <linux/apm_bios.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/syscalls.h> -#include <linux/tty.h>  #include <asm/semaphore.h>  #include <asm/processor.h> -#include <asm/i387.h>  #include <asm/uaccess.h> -#include <asm/checksum.h> -#include <asm/io.h> -#include <asm/delay.h> -#include <asm/irq.h> -#include <asm/mmx.h> -#include <asm/desc.h>  #include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/nmi.h> -#include <asm/kdebug.h> -#include <asm/unistd.h> -#include <asm/tlbflush.h> -#include <asm/kdebug.h> - -extern spinlock_t rtc_lock; -#ifdef CONFIG_SMP -extern void __write_lock_failed(rwlock_t *rw); -extern void __read_lock_failed(rwlock_t *rw); -#endif - -/* platform dependent support */ -EXPORT_SYMBOL(boot_cpu_data); -//EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(ioremap_nocache); -EXPORT_SYMBOL(iounmap);  EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(pm_idle); -EXPORT_SYMBOL(pm_power_off);  EXPORT_SYMBOL(__down_failed);  EXPORT_SYMBOL(__down_failed_interruptible);  EXPORT_SYMBOL(__down_failed_trylock);  EXPORT_SYMBOL(__up_wakeup); -/* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_nocheck); -EXPORT_SYMBOL(ip_compute_csum); -/* Delay loops */ -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay);  EXPORT_SYMBOL(__get_user_1);  EXPORT_SYMBOL(__get_user_2); @@ -71,42 +26,20 @@ EXPORT_SYMBOL(__put_user_2);  EXPORT_SYMBOL(__put_user_4);  EXPORT_SYMBOL(__put_user_8); -EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(clear_user); -EXPORT_SYMBOL(__clear_user);  EXPORT_SYMBOL(copy_user_generic);  EXPORT_SYMBOL(copy_from_user);  EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(copy_in_user); -EXPORT_SYMBOL(strnlen_user); - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif  EXPORT_SYMBOL(copy_page);  EXPORT_SYMBOL(clear_page); -EXPORT_SYMBOL(_cpu_pda);  #ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_data); +extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); +extern void FASTCALL( __read_lock_failed(rwlock_t *rw));  EXPORT_SYMBOL(__write_lock_failed);  EXPORT_SYMBOL(__read_lock_failed); - -EXPORT_SYMBOL(smp_call_function); -EXPORT_SYMBOL(cpu_callout_map); -#endif - -#ifdef CONFIG_VT -EXPORT_SYMBOL(screen_info);  #endif -EXPORT_SYMBOL(rtc_lock); - -EXPORT_SYMBOL_GPL(set_nmi_callback); -EXPORT_SYMBOL_GPL(unset_nmi_callback); -  /* Export string functions. We normally rely on gcc builtin for most of these,     but gcc sometimes decides not to inline them. 
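The bulk deletion above is the flip side of the EXPORT_SYMBOL additions scattered through this diff: exports move out of the central x8664_ksyms.c and sit next to their definitions, so only assembly-implemented symbols remain here. The pattern, with a hypothetical helper (a kernel-build fragment, not compilable on its own):

#include <linux/module.h>

int my_helper(int x)
{
	return x * 2;
}
EXPORT_SYMBOL(my_helper);	/* the export lives in the file that defines it */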
*/      #undef memcpy @@ -114,51 +47,14 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback);  #undef memmove  extern void * memset(void *,int,__kernel_size_t); -extern size_t strlen(const char *); -extern void * memmove(void * dest,const void *src,size_t count);  extern void * memcpy(void *,const void *,__kernel_size_t);  extern void * __memcpy(void *,const void *,__kernel_size_t);  EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(memmove);  EXPORT_SYMBOL(memcpy);  EXPORT_SYMBOL(__memcpy); -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM -/* prototypes are wrong, these are assembly with custom calling functions */ -extern void rwsem_down_read_failed_thunk(void); -extern void rwsem_wake_thunk(void); -extern void rwsem_downgrade_thunk(void); -extern void rwsem_down_write_failed_thunk(void); -EXPORT_SYMBOL(rwsem_down_read_failed_thunk); -EXPORT_SYMBOL(rwsem_wake_thunk); -EXPORT_SYMBOL(rwsem_downgrade_thunk); -EXPORT_SYMBOL(rwsem_down_write_failed_thunk); -#endif -  EXPORT_SYMBOL(empty_zero_page); - -EXPORT_SYMBOL(die_chain); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_sibling_map); -EXPORT_SYMBOL(smp_num_siblings); -#endif - -#ifdef CONFIG_BUG -EXPORT_SYMBOL(out_of_line_bug); -#endif -  EXPORT_SYMBOL(init_level4_pgt); - -extern unsigned long __supported_pte_mask; -EXPORT_SYMBOL(__supported_pte_mask); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(flush_tlb_page); -#endif - -EXPORT_SYMBOL(cpu_khz); -  EXPORT_SYMBOL(load_gs_index); diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c index 5384e227cdf6..c493735218da 100644 --- a/arch/x86_64/lib/csum-partial.c +++ b/arch/x86_64/lib/csum-partial.c @@ -147,4 +147,5 @@ unsigned short ip_compute_csum(unsigned char * buff, int len)  {  	return csum_fold(csum_partial(buff,len,0));  } +EXPORT_SYMBOL(ip_compute_csum); diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c index 94323f20816e..b1320ec58428 100644 --- a/arch/x86_64/lib/csum-wrappers.c +++ b/arch/x86_64/lib/csum-wrappers.c @@ -109,6 +109,7 @@ csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len,  {   	return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL);  }  +EXPORT_SYMBOL(csum_partial_copy_nocheck);  unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr,  			       __u32 len, unsigned short proto, unsigned int sum)  diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index 03c460cbdd1c..b6cd3cca2f45 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -9,6 +9,7 @@   */  #include <linux/config.h> +#include <linux/module.h>  #include <linux/sched.h>  #include <linux/delay.h>  #include <asm/delay.h> @@ -36,18 +37,22 @@ void __delay(unsigned long loops)  	}  	while((now-bclock) < loops);  } +EXPORT_SYMBOL(__delay);  inline void __const_udelay(unsigned long xloops)  {  	__delay((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32);  } +EXPORT_SYMBOL(__const_udelay);  void __udelay(unsigned long usecs)  {  	__const_udelay(usecs * 0x000010c6);  /* 2**32 / 1000000 */  } +EXPORT_SYMBOL(__udelay);  void __ndelay(unsigned long nsecs)  {  	__const_udelay(nsecs * 0x00005);  /* 2**32 / 1000000000 (rounded up) */  } +EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c index e93d5255fdc9..751ebae8ec42 100644 --- a/arch/x86_64/lib/memmove.c +++ b/arch/x86_64/lib/memmove.c @@ -3,12 +3,13 @@   */  #define _STRING_C  #include <linux/string.h> +#include <linux/module.h>  #undef memmove  void *memmove(void * dest,const void *src,size_t count)  {  	if (dest < src) {  -		
__inline_memcpy(dest,src,count); +		return memcpy(dest,src,count);  	} else {  		char *p = (char *) dest + count;  		char *s = (char *) src + count; @@ -17,3 +18,4 @@ void *memmove(void * dest,const void *src,size_t count)  	}  	return dest;  }  +EXPORT_SYMBOL(memmove); diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c index 9bc2c295818e..893d43f838cc 100644 --- a/arch/x86_64/lib/usercopy.c +++ b/arch/x86_64/lib/usercopy.c @@ -5,6 +5,7 @@   * Copyright 1997 Linus Torvalds   * Copyright 2002 Andi Kleen <ak@suse.de>   */ +#include <linux/module.h>  #include <asm/uaccess.h>  /* @@ -47,15 +48,17 @@ __strncpy_from_user(char *dst, const char __user *src, long count)  	__do_strncpy_from_user(dst, src, count, res);  	return res;  } +EXPORT_SYMBOL(__strncpy_from_user);  long  strncpy_from_user(char *dst, const char __user *src, long count)  {  	long res = -EFAULT;  	if (access_ok(VERIFY_READ, src, 1)) -		__do_strncpy_from_user(dst, src, count, res); +		return __strncpy_from_user(dst, src, count);  	return res;  } +EXPORT_SYMBOL(strncpy_from_user);  /*   * Zero Userspace @@ -94,7 +97,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)  		  [zero] "r" (0UL), [eight] "r" (8UL));  	return size;  } - +EXPORT_SYMBOL(__clear_user);  unsigned long clear_user(void __user *to, unsigned long n)  { @@ -102,6 +105,7 @@ unsigned long clear_user(void __user *to, unsigned long n)  		return __clear_user(to, n);  	return n;  } +EXPORT_SYMBOL(clear_user);  /*   * Return the size of a string (including the ending 0) @@ -125,6 +129,7 @@ long __strnlen_user(const char __user *s, long n)  		s++;  	}  } +EXPORT_SYMBOL(__strnlen_user);  long strnlen_user(const char __user *s, long n)  { @@ -132,6 +137,7 @@ long strnlen_user(const char __user *s, long n)  		return 0;  	return __strnlen_user(s, n);  } +EXPORT_SYMBOL(strnlen_user);  long strlen_user(const char __user *s)  { @@ -147,6 +153,7 @@ long strlen_user(const char __user *s)  		s++;  	}  } +EXPORT_SYMBOL(strlen_user);  unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)  { @@ -155,3 +162,5 @@ unsigned long copy_in_user(void __user *to, const void __user *from, unsigned le  	}   	return len;		  } +EXPORT_SYMBOL(copy_in_user); + diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 55250593d8c9..08dc696f54ee 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -41,6 +41,41 @@  #define PF_RSVD	(1<<3)  #define PF_INSTR	(1<<4) +#ifdef CONFIG_KPROBES +ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); + +/* Hook to register for page fault notifications */ +int register_page_fault_notifier(struct notifier_block *nb) +{ +	vmalloc_sync_all(); +	return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); +} + +int unregister_page_fault_notifier(struct notifier_block *nb) +{ +	return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); +} + +static inline int notify_page_fault(enum die_val val, const char *str, +			struct pt_regs *regs, long err, int trap, int sig) +{ +	struct die_args args = { +		.regs = regs, +		.str = str, +		.err = err, +		.trapnr = trap, +		.signr = sig +	}; +	return atomic_notifier_call_chain(¬ify_page_fault_chain, val, &args); +} +#else +static inline int notify_page_fault(enum die_val val, const char *str, +			struct pt_regs *regs, long err, int trap, int sig) +{ +	return NOTIFY_DONE; +} +#endif +  void bust_spinlocks(int yes)  {  	int loglevel_save = console_loglevel; @@ -160,7 +195,7 @@ void dump_pagetable(unsigned long address)  	printk("PGD %lx 
", pgd_val(*pgd));  	if (!pgd_present(*pgd)) goto ret;  -	pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address); +	pud = pud_offset(pgd, address);  	if (bad_address(pud)) goto bad;  	printk("PUD %lx ", pud_val(*pud));  	if (!pud_present(*pud))	goto ret; @@ -348,7 +383,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,  			if (vmalloc_fault(address) >= 0)  				return;  		} -		if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, +		if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,  						SIGSEGV) == NOTIFY_STOP)  			return;  		/* @@ -358,7 +393,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,  		goto bad_area_nosemaphore;  	} -	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, +	if (notify_page_fault(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,  					SIGSEGV) == NOTIFY_STOP)  		return; @@ -410,8 +445,10 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,  	if (!(vma->vm_flags & VM_GROWSDOWN))  		goto bad_area;  	if (error_code & 4) { -		// XXX: align red zone size with ABI  -		if (address + 128 < regs->rsp) +		/* Allow userspace just enough access below the stack pointer +		 * to let the 'enter' instruction work. +		 */ +		if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp)  			goto bad_area;  	}  	if (expand_stack(vma, address)) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 4ba34e95d835..02add1d1dfa8 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -41,8 +41,6 @@  #include <asm/proto.h>  #include <asm/smp.h>  #include <asm/sections.h> -#include <asm/dma-mapping.h> -#include <asm/swiotlb.h>  #ifndef Dprintk  #define Dprintk(x...) @@ -90,8 +88,6 @@ void show_mem(void)  	printk(KERN_INFO "%lu pages swap cached\n",cached);  } -/* References to section boundaries */ -  int after_bootmem;  static __init void *spp_getpage(void) @@ -261,9 +257,10 @@ phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)  	for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {  		unsigned long entry; -		if (address > end) { -			for (; i < PTRS_PER_PMD; i++, pmd++) -				set_pmd(pmd, __pmd(0)); +		if (address >= end) { +			if (!after_bootmem) +				for (; i < PTRS_PER_PMD; i++, pmd++) +					set_pmd(pmd, __pmd(0));  			break;  		}  		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; @@ -341,7 +338,8 @@ static void __init find_early_table_space(unsigned long end)  	table_end = table_start;  	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", -		end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT); +		end, table_start << PAGE_SHIFT, +		(table_start << PAGE_SHIFT) + tables);  }  /* Setup the direct mapping of the physical memory at PAGE_OFFSET. @@ -372,7 +370,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)  		pud_t *pud;  		if (after_bootmem) -			pud = pud_offset_k(pgd, start & PGDIR_MASK); +			pud = pud_offset(pgd, start & PGDIR_MASK);  		else  			pud = alloc_low_page(&map, &pud_phys); @@ -587,10 +585,7 @@ void __init mem_init(void)  {  	long codesize, reservedpages, datasize, initsize; -#ifdef CONFIG_SWIOTLB -	pci_swiotlb_init(); -#endif -	no_iommu_init(); +	pci_iommu_alloc();  	/* How many end-of-memory variables you have, grandma! 
*/  	max_low_pfn = end_pfn; @@ -644,20 +639,29 @@ void __init mem_init(void)  #endif  } -void free_initmem(void) +void free_init_pages(char *what, unsigned long begin, unsigned long end)  {  	unsigned long addr; -	addr = (unsigned long)(&__init_begin); -	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { +	if (begin >= end) +		return; + +	printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); +	for (addr = begin; addr < end; addr += PAGE_SIZE) {  		ClearPageReserved(virt_to_page(addr));  		init_page_count(virt_to_page(addr));  		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);   		free_page(addr);  		totalram_pages++;  	} +} + +void free_initmem(void) +{  	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); -	printk ("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10); +	free_init_pages("unused kernel memory", +			(unsigned long)(&__init_begin), +			(unsigned long)(&__init_end));  }  #ifdef CONFIG_DEBUG_RODATA @@ -686,15 +690,7 @@ void mark_rodata_ro(void)  #ifdef CONFIG_BLK_DEV_INITRD  void free_initrd_mem(unsigned long start, unsigned long end)  { -	if (start >= end) -		return; -	printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); -	for (; start < end; start += PAGE_SIZE) { -		ClearPageReserved(virt_to_page(start)); -		init_page_count(virt_to_page(start)); -		free_page(start); -		totalram_pages++; -	} +	free_init_pages("initrd memory", start, end);  }  #endif diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index ae207064201e..45d7d823c3b8 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -11,6 +11,7 @@  #include <linux/vmalloc.h>  #include <linux/init.h>  #include <linux/slab.h> +#include <linux/module.h>  #include <asm/io.h>  #include <asm/pgalloc.h>  #include <asm/fixmap.h> @@ -219,6 +220,7 @@ void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned l  	}  	return (__force void __iomem *) (offset + (char *)addr);  } +EXPORT_SYMBOL(__ioremap);  /**   * ioremap_nocache     -   map bus memory into CPU space @@ -246,6 +248,7 @@ void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)  {  	return __ioremap(phys_addr, size, _PAGE_PCD);  } +EXPORT_SYMBOL(ioremap_nocache);  /**   * iounmap - Free a IO remapping @@ -291,3 +294,5 @@ void iounmap(volatile void __iomem *addr)  	BUG_ON(p != o || o == NULL);  	kfree(p);   } +EXPORT_SYMBOL(iounmap); + diff --git a/arch/x86_64/pci/k8-bus.c b/arch/x86_64/pci/k8-bus.c index 3acf60ded2a0..b50a7c7c47f8 100644 --- a/arch/x86_64/pci/k8-bus.c +++ b/arch/x86_64/pci/k8-bus.c @@ -2,6 +2,7 @@  #include <linux/pci.h>  #include <asm/mpspec.h>  #include <linux/cpumask.h> +#include <asm/k8.h>  /*   * This discovers the pcibus <-> node mapping on AMD K8. 
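With __ioremap, ioremap_nocache and iounmap now exported next to their definitions, a module maps device registers the usual way. A hypothetical fragment; phys_base and the register offset are invented:

#include <linux/errno.h>
#include <asm/io.h>

#define DEMO_CTRL_REG	0x00	/* hypothetical register offset */

static int demo_map_device(unsigned long phys_base)
{
	void __iomem *regs = ioremap_nocache(phys_base, 0x1000);

	if (!regs)
		return -ENOMEM;
	writel(0x1, regs + DEMO_CTRL_REG);	/* uncached MMIO write */
	iounmap(regs);
	return 0;
}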
@@ -18,7 +19,6 @@  #define NR_LDT_BUS_NUMBER_REGISTERS 3  #define SECONDARY_LDT_BUS_NUMBER(dword) ((dword >> 8) & 0xFF)  #define SUBORDINATE_LDT_BUS_NUMBER(dword) ((dword >> 16) & 0xFF) -#define PCI_DEVICE_ID_K8HTCONFIG 0x1100  /**   * fill_mp_bus_to_cpumask() @@ -28,8 +28,7 @@  __init static int  fill_mp_bus_to_cpumask(void)  { -	struct pci_dev *nb_dev = NULL; -	int i, j; +	int i, j, k;  	u32 ldtbus, nid;  	static int lbnr[3] = {  		LDT_BUS_NUMBER_REGISTER_0, @@ -37,8 +36,9 @@ fill_mp_bus_to_cpumask(void)  		LDT_BUS_NUMBER_REGISTER_2  	}; -	while ((nb_dev = pci_get_device(PCI_VENDOR_ID_AMD, -			PCI_DEVICE_ID_K8HTCONFIG, nb_dev))) { +	cache_k8_northbridges(); +	for (k = 0; k < num_k8_northbridges; k++) { +		struct pci_dev *nb_dev = k8_northbridges[k];  		pci_read_config_dword(nb_dev, NODE_ID_REGISTER, &nid);  		for (i = 0; i < NR_LDT_BUS_NUMBER_REGISTERS; i++) { diff --git a/arch/xtensa/Makefile b/arch/xtensa/Makefile index 98fac8489aed..3a3a4c66ef87 100644 --- a/arch/xtensa/Makefile +++ b/arch/xtensa/Makefile @@ -71,7 +71,7 @@ archprepare: $(archinc)/.platform  # Update machine cpu and platform symlinks if something which affects  # them changed. -$(archinc)/.platform: $(wildcard include/config/arch/*.h) include/config/MARKER +$(archinc)/.platform: $(wildcard include/config/arch/*.h) include/config/auto.conf  	@echo '  SYMLINK $(archinc)/xtensa/config -> $(archinc)/xtensa/config-$(CPU)'  	$(Q)mkdir -p $(archinc)  	$(Q)mkdir -p $(archinc)/xtensa  | 
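The k8-bus.c change above is the consumer side of the new asm/k8.h northbridge cache: instead of each caller re-probing PCI for the K8 host bridge device, cache_k8_northbridges() fills a shared array once and everyone iterates it. A sketch of the idiom; the calls match what this diff shows, while the 0x60 node-id offset is an assumption standing in for the NODE_ID_REGISTER constant local to k8-bus.c:

#include <linux/pci.h>
#include <asm/k8.h>

static void demo_for_each_northbridge(void)
{
	int k;

	cache_k8_northbridges();	/* populate k8_northbridges[] once */
	for (k = 0; k < num_k8_northbridges; k++) {
		struct pci_dev *nb = k8_northbridges[k];
		u32 nid;

		/* one config read per node's host bridge */
		pci_read_config_dword(nb, 0x60, &nid);
	}
}

Sharing the cached array also removes the per-caller pci_get_device() reference counting that the old loop had to get right.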