author | Jeff Garzik <jgarzik@pobox.com> | 2005-09-14 08:19:08 -0400 |
---|---|---|
committer | Jeff Garzik <jgarzik@pobox.com> | 2005-09-14 08:19:08 -0400 |
commit | 905ec87e93bc9e01b15c60035cd6a50c636cbaef | |
tree | 46fd7618d6511611ffc19eb0dd4d7bc6b90a41c2 /arch/x86_64/kernel | |
parent | 1d6ae775d7a948c9575658eb41184fd2e506c0df | |
parent | 2f4ba45a75d6383b4a1201169a808ffea416ffa0 | |
Merge /spare/repo/linux-2.6/
Diffstat (limited to 'arch/x86_64/kernel')
34 files changed, 577 insertions, 691 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index c32e198d7b2b..bcdd0a805fe7 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -12,7 +12,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \
 obj-$(CONFIG_X86_MCE) += mce.o
 obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
 obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/
-obj-$(CONFIG_ACPI_BOOT) += acpi/
+obj-$(CONFIG_ACPI) += acpi/
 obj-$(CONFIG_X86_MSR) += msr.o
 obj-$(CONFIG_MICROCODE) += microcode.o
 obj-$(CONFIG_X86_CPUID) += cpuid.o
@@ -46,3 +46,4 @@ microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
 intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o
 quirks-y += ../../i386/kernel/quirks.o
 i8237-y += ../../i386/kernel/i8237.o
+msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o

diff --git a/arch/x86_64/kernel/acpi/Makefile b/arch/x86_64/kernel/acpi/Makefile
index d2c2ee5f9a88..7da9ace890bd 100644
--- a/arch/x86_64/kernel/acpi/Makefile
+++ b/arch/x86_64/kernel/acpi/Makefile
@@ -1,3 +1,3 @@
-obj-$(CONFIG_ACPI_BOOT) := boot.o
-boot-$(CONFIG_ACPI_BOOT) := ../../../i386/kernel/acpi/boot.o
+obj-y := boot.o
+boot-y := ../../../i386/kernel/acpi/boot.o
 obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o

diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c
index 7a275de6df22..867a0ebee177 100644
--- a/arch/x86_64/kernel/acpi/sleep.c
+++ b/arch/x86_64/kernel/acpi/sleep.c
@@ -34,7 +34,6 @@
 #include <linux/slab.h>
 #include <linux/pci.h>
 #include <linux/bootmem.h>
-#include <linux/irq.h>
 #include <linux/acpi.h>
 #include <asm/mpspec.h>
 #include <asm/io.h>
@@ -47,7 +46,6 @@
 #include <asm/proto.h>
 #include <asm/tlbflush.h>
 
-
 /* --------------------------------------------------------------------------
                               Low-Level Sleep Support
    -------------------------------------------------------------------------- */
@@ -77,11 +75,12 @@ static void init_low_mapping(void)
  * Create an identity mapped page table and copy the wakeup routine to
  * low memory.
  */
-int acpi_save_state_mem (void)
+int acpi_save_state_mem(void)
 {
 	init_low_mapping();
-	memcpy((void *) acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start);
+	memcpy((void *)acpi_wakeup_address, &wakeup_start,
+	       &wakeup_end - &wakeup_start);
 	acpi_copy_wakeup_routine(acpi_wakeup_address);
 
 	return 0;
@@ -90,7 +89,7 @@ int acpi_save_state_mem (void)
 /*
  * acpi_restore_state
  */
-void acpi_restore_state_mem (void)
+void acpi_restore_state_mem(void)
 {
 	set_pgd(pgd_offset(current->mm, 0UL), low_ptr);
 	flush_tlb_all();
@@ -108,7 +107,8 @@ void __init acpi_reserve_bootmem(void)
 {
 	acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE);
 	if ((&wakeup_end - &wakeup_start) > PAGE_SIZE)
-		printk(KERN_CRIT "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
+		printk(KERN_CRIT
+		       "ACPI: Wakeup code way too big, will crash on attempt to suspend\n");
 }
 
 static int __init acpi_sleep_setup(char *str)
@@ -127,6 +127,8 @@ static int __init acpi_sleep_setup(char *str)
 
 __setup("acpi_sleep=", acpi_sleep_setup);
 
-#endif /*CONFIG_ACPI_SLEEP*/
+#endif /*CONFIG_ACPI_SLEEP */
 
-void acpi_pci_link_exit(void) {}
+void acpi_pci_link_exit(void)
+{
+}

diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c
index c9a6b812e926..962ad4823b6a 100644
--- a/arch/x86_64/kernel/aperture.c
+++ b/arch/x86_64/kernel/aperture.c
@@ -245,6 +245,8 @@ void __init iommu_hole_init(void)
 
 	if (aper_alloc) {
 		/* Got the aperture from the AGP bridge */
+	} else if (swiotlb && !valid_agp) {
+		/* Do nothing */
 	} else if ((!no_iommu && end_pfn >= 0xffffffff>>PAGE_SHIFT) ||
 		   force_iommu ||
 		   valid_agp ||

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 375d369570ca..b6e7715d877f 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -18,7 +18,6 @@
 
 #include <linux/init.h>
 #include <linux/mm.h>
-#include <linux/irq.h>
 #include <linux/delay.h>
 #include <linux/bootmem.h>
 #include <linux/smp_lock.h>
@@ -109,11 +108,8 @@ void clear_local_APIC(void)
 	if (maxlvt >= 4)
 		apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
 	v = GET_APIC_VERSION(apic_read(APIC_LVR));
-	if (APIC_INTEGRATED(v)) {	/* !82489DX */
-		if (maxlvt > 3)	/* Due to Pentium errata 3AP and 11AP. */
-			apic_write(APIC_ESR, 0);
-		apic_read(APIC_ESR);
-	}
+	apic_write(APIC_ESR, 0);
+	apic_read(APIC_ESR);
 }
 
 void __init connect_bsp_APIC(void)
@@ -316,8 +312,6 @@ void __init init_bsp_APIC(void)
 	 */
 	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
 	value = APIC_DM_NMI;
-	if (!APIC_INTEGRATED(ver))	/* 82489DX */
-		value |= APIC_LVT_LEVEL_TRIGGER;
 	apic_write_around(APIC_LVT1, value);
 }
 
@@ -325,14 +319,6 @@ void __cpuinit setup_local_APIC (void)
 {
 	unsigned int value, ver, maxlvt;
 
-	/* Pound the ESR really hard over the head with a big hammer - mbligh */
-	if (esr_disable) {
-		apic_write(APIC_ESR, 0);
-		apic_write(APIC_ESR, 0);
-		apic_write(APIC_ESR, 0);
-		apic_write(APIC_ESR, 0);
-	}
-
 	value = apic_read(APIC_LVR);
 	ver = GET_APIC_VERSION(value);
@@ -430,15 +416,11 @@ void __cpuinit setup_local_APIC (void)
 		value = APIC_DM_NMI;
 	else
 		value = APIC_DM_NMI | APIC_LVT_MASKED;
-	if (!APIC_INTEGRATED(ver))	/* 82489DX */
-		value |= APIC_LVT_LEVEL_TRIGGER;
 	apic_write_around(APIC_LVT1, value);
 
-	if (APIC_INTEGRATED(ver) && !esr_disable) {	/* !82489DX */
+	{
 		unsigned oldvalue;
 		maxlvt = get_maxlvt();
-		if (maxlvt > 3)	/* Due to the Pentium erratum 3AP. */
-			apic_write(APIC_ESR, 0);
 		oldvalue = apic_read(APIC_ESR);
 		value = ERROR_APIC_VECTOR;	// enables sending errors
 		apic_write_around(APIC_LVTERR, value);
@@ -452,17 +434,6 @@ void __cpuinit setup_local_APIC (void)
 		apic_printk(APIC_VERBOSE,
 			"ESR value after enabling vector: %08x, after %08x\n",
 			oldvalue, value);
-	} else {
-		if (esr_disable)
-			/*
-			 * Something untraceble is creating bad interrupts on
-			 * secondary quads ... for the moment, just leave the
-			 * ESR disabled - we can't do anything useful with the
-			 * errors anyway - mbligh
-			 */
-			apic_printk(APIC_DEBUG, "Leaving ESR disabled.\n");
-		else
-			apic_printk(APIC_DEBUG, "No ESR for 82489DX.\n");
 	}
 
 	nmi_watchdog_default();
@@ -650,8 +621,7 @@ void __init init_apic_mappings(void)
 	 * Fetch the APIC ID of the BSP in case we have a
 	 * default configuration (or the MP table is broken).
 	 */
-	if (boot_cpu_id == -1U)
-		boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
+	boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
 
 #ifdef CONFIG_X86_IO_APIC
 	{
@@ -693,8 +663,6 @@ static void __setup_APIC_LVTT(unsigned int clocks)
 	ver = GET_APIC_VERSION(apic_read(APIC_LVR));
 	lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
-	if (!APIC_INTEGRATED(ver))
-		lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
 	apic_write_around(APIC_LVTT, lvtt_value);
 
 	/*
@@ -1081,7 +1049,7 @@ int __init APIC_init_uniprocessor (void)
 
 	connect_bsp_APIC();
 
-	phys_cpu_present_map = physid_mask_of_physid(0);
+	phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
 	apic_write_around(APIC_ID, boot_cpu_id);
 
 	setup_local_APIC();

diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
index 35b4c3fcbb37..aaa6d3833517 100644
--- a/arch/x86_64/kernel/asm-offsets.c
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -39,7 +39,6 @@ int main(void)
 	ENTRY(kernelstack);
 	ENTRY(oldrsp);
 	ENTRY(pcurrent);
-	ENTRY(irqrsp);
 	ENTRY(irqcount);
 	ENTRY(cpunumber);
 	ENTRY(irqstackptr);

diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index d7fa4248501c..535e04466079 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -11,7 +11,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/smp.h>
-#include <linux/irq.h>
 #include <linux/reboot.h>
 #include <linux/kexec.h>

diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 116ac5f53dce..eb7929eea7b3 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -131,7 +131,7 @@ void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned lon
 		if (ei->type != E820_RAM ||
 		    ei->addr+ei->size <= start ||
-		    ei->addr > end)
+		    ei->addr >= end)
 			continue;
 
 		addr = round_up(ei->addr, PAGE_SIZE);
@@ -567,7 +567,7 @@ unsigned long pci_mem_start = 0xaeedbabe;
  */
 __init void e820_setup_gap(void)
 {
-	unsigned long gapstart, gapsize;
+	unsigned long gapstart, gapsize, round;
 	unsigned long last;
 	int i;
 	int found = 0;
@@ -604,14 +604,14 @@ __init void e820_setup_gap(void)
 	}
 
 	/*
-	 * Start allocating dynamic PCI memory a bit into the gap,
-	 * aligned up to the nearest megabyte.
-	 *
-	 * Question: should we try to pad it up a bit (do something
-	 * like " + (gapsize >> 3)" in there too?). We now have the
-	 * technology.
+	 * See how much we want to round up: start off with
+	 * rounding to the next 1MB area.
 	 */
-	pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;
+	round = 0x100000;
+	while ((gapsize >> 4) > round)
+		round += round;
+	/* Fun with two's complement */
+	pci_mem_start = (gapstart + round) & -round;
 
 	printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
 		pci_mem_start, gapstart, gapsize);
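The new rounding logic in the e820.c hunk above leans on the two's-complement identity the patch itself flags ("Fun with two's complement"): for a power-of-two `round`, `-round` is a mask of the high bits, so `(gapstart + round) & -round` lands on the next `round` boundary strictly above `gapstart`. A minimal standalone sketch of just that idiom (illustrative values, not kernel code):

```c
#include <stdio.h>

/* Round x up past the next multiple of r, where r is a power of two.
 * In two's complement, -r == ~(r - 1), a mask that clears the low
 * log2(r) bits, so adding r and masking gives the next r boundary. */
static unsigned long round_up_pow2(unsigned long x, unsigned long r)
{
	return (x + r) & -r;
}

int main(void)
{
	/* a gap starting at 0xe4321000, rounded into the next 1MB area */
	printf("%#lx\n", round_up_pow2(0xe4321000UL, 0x100000UL)); /* 0xe4400000 */
	return 0;
}
```

Note that an already-aligned `gapstart` still moves up by `round`, which preserves the old comment's intent of starting allocation a bit into the gap.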
diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c
index 9631c747c5e3..9cd968dd0f5a 100644
--- a/arch/x86_64/kernel/early_printk.c
+++ b/arch/x86_64/kernel/early_printk.c
@@ -5,6 +5,7 @@
 #include <linux/tty.h>
 #include <asm/io.h>
 #include <asm/processor.h>
+#include <asm/fcntl.h>
 
 /* Simple VGA output */
 
@@ -158,6 +159,47 @@ static struct console early_serial_console = {
 	.index =	-1,
 };
 
+/* Console interface to a host file on AMD's SimNow! */
+
+static int simnow_fd;
+
+enum {
+	MAGIC1 = 0xBACCD00A,
+	MAGIC2 = 0xCA110000,
+	XOPEN = 5,
+	XWRITE = 4,
+};
+
+static noinline long simnow(long cmd, long a, long b, long c)
+{
+	long ret;
+	asm volatile("cpuid" :
+		     "=a" (ret) :
+		     "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
+	return ret;
+}
+
+void __init simnow_init(char *str)
+{
+	char *fn = "klog";
+	if (*str == '=')
+		fn = ++str;
+	/* error ignored */
+	simnow_fd = simnow(XOPEN, (unsigned long)fn, O_WRONLY|O_APPEND|O_CREAT, 0644);
+}
+
+static void simnow_write(struct console *con, const char *s, unsigned n)
+{
+	simnow(XWRITE, simnow_fd, (unsigned long)s, n);
+}
+
+static struct console simnow_console = {
+	.name =		"simnow",
+	.write =	simnow_write,
+	.flags =	CON_PRINTBUFFER,
+	.index =	-1,
+};
+
 /* Direct interface for emergencies */
 struct console *early_console = &early_vga_console;
 static int early_console_initialized = 0;
@@ -205,6 +247,10 @@ int __init setup_early_printk(char *opt)
 		max_xpos = SCREEN_INFO.orig_video_cols;
 		max_ypos = SCREEN_INFO.orig_video_lines;
 		early_console = &early_vga_console;
+	} else if (!strncmp(buf, "simnow", 6)) {
+		simnow_init(buf + 6);
+		early_console = &simnow_console;
+		keep_early = 1;
 	}
 	early_console_initialized = 1;
 	register_console(early_console);
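A usage note, inferred from the option parsing in `setup_early_printk()` above rather than stated by the diff: the new console should be selectable through the usual x86-64 `earlyprintk=` boot parameter, with an optional host file name defaulting to "klog":

```
earlyprintk=simnow          # append early console output to "klog" on the host
earlyprintk=simnow=boot.log # or to an explicitly named file
```

Since the handler forces `keep_early`, the SimNow console stays registered even after the real console comes up.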
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index be51dbe1f75e..7937971d1853 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -36,7 +36,7 @@
 #include <asm/errno.h>
 #include <asm/dwarf2.h>
 #include <asm/calling.h>
-#include <asm/offset.h>
+#include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
 #include <asm/thread_info.h>
@@ -79,16 +79,19 @@
 	xorl %eax, %eax
 	pushq %rax /* ss */
 	CFI_ADJUST_CFA_OFFSET	8
+	/*CFI_REL_OFFSET	ss,0*/
 	pushq %rax /* rsp */
 	CFI_ADJUST_CFA_OFFSET	8
-	CFI_OFFSET	rip,0
+	CFI_REL_OFFSET	rsp,0
 	pushq $(1<<9) /* eflags - interrupts on */
 	CFI_ADJUST_CFA_OFFSET	8
+	/*CFI_REL_OFFSET	rflags,0*/
 	pushq $__KERNEL_CS /* cs */
 	CFI_ADJUST_CFA_OFFSET	8
+	/*CFI_REL_OFFSET	cs,0*/
 	pushq \child_rip /* rip */
 	CFI_ADJUST_CFA_OFFSET	8
-	CFI_OFFSET	rip,0
+	CFI_REL_OFFSET	rip,0
 	pushq %rax /* orig rax */
 	CFI_ADJUST_CFA_OFFSET	8
 .endm
@@ -98,32 +101,39 @@
 	CFI_ADJUST_CFA_OFFSET	-(6*8)
 .endm
 
-	.macro CFI_DEFAULT_STACK
-	CFI_ADJUST_CFA_OFFSET  (SS)
-	CFI_OFFSET	r15,R15-SS
-	CFI_OFFSET	r14,R14-SS
-	CFI_OFFSET	r13,R13-SS
-	CFI_OFFSET	r12,R12-SS
-	CFI_OFFSET	rbp,RBP-SS
-	CFI_OFFSET	rbx,RBX-SS
-	CFI_OFFSET	r11,R11-SS
-	CFI_OFFSET	r10,R10-SS
-	CFI_OFFSET	r9,R9-SS
-	CFI_OFFSET	r8,R8-SS
-	CFI_OFFSET	rax,RAX-SS
-	CFI_OFFSET	rcx,RCX-SS
-	CFI_OFFSET	rdx,RDX-SS
-	CFI_OFFSET	rsi,RSI-SS
-	CFI_OFFSET	rdi,RDI-SS
-	CFI_OFFSET	rsp,RSP-SS
-	CFI_OFFSET	rip,RIP-SS
+	.macro CFI_DEFAULT_STACK start=1
+	.if \start
+	CFI_STARTPROC	simple
+	CFI_DEF_CFA	rsp,SS+8
+	.else
+	CFI_DEF_CFA_OFFSET SS+8
+	.endif
+	CFI_REL_OFFSET	r15,R15
+	CFI_REL_OFFSET	r14,R14
+	CFI_REL_OFFSET	r13,R13
+	CFI_REL_OFFSET	r12,R12
+	CFI_REL_OFFSET	rbp,RBP
+	CFI_REL_OFFSET	rbx,RBX
+	CFI_REL_OFFSET	r11,R11
+	CFI_REL_OFFSET	r10,R10
+	CFI_REL_OFFSET	r9,R9
+	CFI_REL_OFFSET	r8,R8
+	CFI_REL_OFFSET	rax,RAX
+	CFI_REL_OFFSET	rcx,RCX
+	CFI_REL_OFFSET	rdx,RDX
+	CFI_REL_OFFSET	rsi,RSI
+	CFI_REL_OFFSET	rdi,RDI
+	CFI_REL_OFFSET	rip,RIP
+	/*CFI_REL_OFFSET	cs,CS*/
+	/*CFI_REL_OFFSET	rflags,EFLAGS*/
+	CFI_REL_OFFSET	rsp,RSP
+	/*CFI_REL_OFFSET	ss,SS*/
 	.endm
 
 /*
  * A newly forked process directly context switches into this.
  */
 /* rdi:	prev */
 ENTRY(ret_from_fork)
-	CFI_STARTPROC
 	CFI_DEFAULT_STACK
 	call schedule_tail
 	GET_THREAD_INFO(%rcx)
@@ -172,16 +182,21 @@ rff_trace:
  */
 
 ENTRY(system_call)
-	CFI_STARTPROC
+	CFI_STARTPROC	simple
+	CFI_DEF_CFA	rsp,0
+	CFI_REGISTER	rip,rcx
+	/*CFI_REGISTER	rflags,r11*/
 	swapgs
 	movq	%rsp,%gs:pda_oldrsp
 	movq	%gs:pda_kernelstack,%rsp
 	sti
 	SAVE_ARGS 8,1
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
-	movq  %rcx,RIP-ARGOFFSET(%rsp)
+	movq  %rcx,RIP-ARGOFFSET(%rsp)
+	CFI_REL_OFFSET	rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
 	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx)
+	CFI_REMEMBER_STATE
 	jnz tracesys
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
@@ -201,9 +216,12 @@ sysret_check:
 	cli
 	movl threadinfo_flags(%rcx),%edx
 	andl %edi,%edx
+	CFI_REMEMBER_STATE
 	jnz  sysret_careful
 	movq RIP-ARGOFFSET(%rsp),%rcx
+	CFI_REGISTER	rip,rcx
 	RESTORE_ARGS 0,-ARG_SKIP,1
+	/*CFI_REGISTER	rflags,r11*/
 	movq	%gs:pda_oldrsp,%rsp
 	swapgs
 	sysretq
@@ -211,12 +229,15 @@ sysret_check:
 	/* Handle reschedules */
 	/* edx:	work, edi: workmask */
 sysret_careful:
+	CFI_RESTORE_STATE
 	bt $TIF_NEED_RESCHED,%edx
 	jnc sysret_signal
 	sti
 	pushq %rdi
+	CFI_ADJUST_CFA_OFFSET 8
 	call schedule
 	popq  %rdi
+	CFI_ADJUST_CFA_OFFSET -8
 	jmp sysret_check
 
 	/* Handle a signal */
@@ -234,8 +255,13 @@ sysret_signal:
 1:	movl $_TIF_NEED_RESCHED,%edi
 	jmp sysret_check
 
+badsys:
+	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
+	jmp ret_from_sys_call
+
 	/* Do syscall tracing */
 tracesys:
+	CFI_RESTORE_STATE
 	SAVE_REST
 	movq $-ENOSYS,RAX(%rsp)
 	FIXUP_TOP_OF_STACK %rdi
@@ -254,16 +280,29 @@ tracesys:
 	RESTORE_TOP_OF_STACK %rbx
 	RESTORE_REST
 	jmp ret_from_sys_call
+	CFI_ENDPROC
 
-badsys:
-	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
-	jmp ret_from_sys_call
-
 /*
  * Syscall return path ending with IRET.
  * Has correct top of stack, but partial stack frame.
  */
-ENTRY(int_ret_from_sys_call)
+ENTRY(int_ret_from_sys_call)
+	CFI_STARTPROC	simple
+	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
+	/*CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
+	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
+	/*CFI_REL_OFFSET	rflags,EFLAGS-ARGOFFSET*/
+	/*CFI_REL_OFFSET	cs,CS-ARGOFFSET*/
+	CFI_REL_OFFSET	rip,RIP-ARGOFFSET
+	CFI_REL_OFFSET	rdx,RDX-ARGOFFSET
+	CFI_REL_OFFSET	rcx,RCX-ARGOFFSET
+	CFI_REL_OFFSET	rax,RAX-ARGOFFSET
+	CFI_REL_OFFSET	rdi,RDI-ARGOFFSET
+	CFI_REL_OFFSET	rsi,RSI-ARGOFFSET
+	CFI_REL_OFFSET	r8,R8-ARGOFFSET
+	CFI_REL_OFFSET	r9,R9-ARGOFFSET
+	CFI_REL_OFFSET	r10,R10-ARGOFFSET
+	CFI_REL_OFFSET	r11,R11-ARGOFFSET
 	cli
 	testl $3,CS-ARGOFFSET(%rsp)
 	je retint_restore_args
@@ -284,8 +323,10 @@ int_careful:
 	jnc  int_very_careful
 	sti
 	pushq %rdi
+	CFI_ADJUST_CFA_OFFSET 8
 	call schedule
 	popq %rdi
+	CFI_ADJUST_CFA_OFFSET -8
 	cli
 	jmp int_with_check
@@ -297,9 +338,11 @@ int_very_careful:
 	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
 	jz int_signal
 	pushq %rdi
+	CFI_ADJUST_CFA_OFFSET 8
 	leaq 8(%rsp),%rdi	# &ptregs -> arg1
 	call syscall_trace_leave
 	popq %rdi
+	CFI_ADJUST_CFA_OFFSET -8
 	andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
 	cli
 	jmp int_restore_rest
@@ -329,6 +372,8 @@ int_restore_rest:
 	jmp ptregscall_common
 	.endm
 
+	CFI_STARTPROC
+
 	PTREGSCALL stub_clone, sys_clone, %r8
 	PTREGSCALL stub_fork, sys_fork, %rdi
 	PTREGSCALL stub_vfork, sys_vfork, %rdi
@@ -337,40 +382,49 @@ int_restore_rest:
 	PTREGSCALL stub_iopl, sys_iopl, %rsi
 
 ENTRY(ptregscall_common)
-	CFI_STARTPROC
 	popq %r11
-	CFI_ADJUST_CFA_OFFSET	-8
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_REGISTER rip, r11
 	SAVE_REST
 	movq %r11, %r15
+	CFI_REGISTER rip, r15
 	FIXUP_TOP_OF_STACK %r11
 	call *%rax
 	RESTORE_TOP_OF_STACK %r11
 	movq %r15, %r11
+	CFI_REGISTER rip, r11
 	RESTORE_REST
 	pushq %r11
-	CFI_ADJUST_CFA_OFFSET	8
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rip, 0
 	ret
 	CFI_ENDPROC
 
ENTRY(stub_execve)
 	CFI_STARTPROC
 	popq %r11
-	CFI_ADJUST_CFA_OFFSET	-8
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_REGISTER rip, r11
 	SAVE_REST
 	movq %r11, %r15
+	CFI_REGISTER rip, r15
 	FIXUP_TOP_OF_STACK %r11
 	call sys_execve
 	GET_THREAD_INFO(%rcx)
 	bt $TIF_IA32,threadinfo_flags(%rcx)
+	CFI_REMEMBER_STATE
 	jc exec_32bit
 	RESTORE_TOP_OF_STACK %r11
 	movq %r15, %r11
+	CFI_REGISTER rip, r11
 	RESTORE_REST
-	push %r11
+	pushq %r11
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET rip, 0
 	ret
 
exec_32bit:
-	CFI_ADJUST_CFA_OFFSET REST_SKIP
+	CFI_RESTORE_STATE
 	movq %rax,RAX(%rsp)
 	RESTORE_REST
 	jmp int_ret_from_sys_call
@@ -382,7 +436,8 @@ exec_32bit:
  */
ENTRY(stub_rt_sigreturn)
 	CFI_STARTPROC
-	addq $8, %rsp
+	addq $8, %rsp
+	CFI_ADJUST_CFA_OFFSET	-8
 	SAVE_REST
 	movq %rsp,%rdi
 	FIXUP_TOP_OF_STACK %r11
@@ -392,6 +447,25 @@ ENTRY(stub_rt_sigreturn)
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 
+/*
+ * initial frame state for interrupts and exceptions
+ */
+	.macro _frame ref
+	CFI_STARTPROC simple
+	CFI_DEF_CFA rsp,SS+8-\ref
+	/*CFI_REL_OFFSET ss,SS-\ref*/
+	CFI_REL_OFFSET rsp,RSP-\ref
+	/*CFI_REL_OFFSET rflags,EFLAGS-\ref*/
+	/*CFI_REL_OFFSET cs,CS-\ref*/
+	CFI_REL_OFFSET rip,RIP-\ref
+	.endm
+
+/* initial frame state for interrupts (and exceptions without error code) */
+#define INTR_FRAME _frame RIP
+/* initial frame state for exceptions with error code (and interrupts with
+   vector already pushed) */
+#define XCPT_FRAME _frame ORIG_RAX
+
 /*
  * Interrupt entry/exit.
  *
@@ -402,10 +476,6 @@ ENTRY(stub_rt_sigreturn)
 	/* 0(%rsp): interrupt number */
 	.macro interrupt func
-	CFI_STARTPROC	simple
-	CFI_DEF_CFA	rsp,(SS-RDI)
-	CFI_REL_OFFSET	rsp,(RSP-ORIG_RAX)
-	CFI_REL_OFFSET	rip,(RIP-ORIG_RAX)
 	cld
 #ifdef CONFIG_DEBUG_INFO
 	SAVE_ALL
@@ -425,23 +495,27 @@ ENTRY(stub_rt_sigreturn)
 	swapgs
 1:	incl	%gs:pda_irqcount	# RED-PEN should check preempt count
 	movq %gs:pda_irqstackptr,%rax
-	cmoveq %rax,%rsp
+	cmoveq %rax,%rsp /*todo This needs CFI annotation! */
 	pushq %rdi			# save old stack
+	CFI_ADJUST_CFA_OFFSET	8
 	call \func
 	.endm
 
ENTRY(common_interrupt)
+	XCPT_FRAME
 	interrupt do_IRQ
 	/* 0(%rsp): oldrsp-ARGOFFSET */
-ret_from_intr:
+ret_from_intr:
 	popq  %rdi
+	CFI_ADJUST_CFA_OFFSET	-8
 	cli
 	decl %gs:pda_irqcount
 #ifdef CONFIG_DEBUG_INFO
 	movq RBP(%rdi),%rbp
+	CFI_DEF_CFA_REGISTER	rsp
 #endif
-	leaq ARGOFFSET(%rdi),%rsp
-exit_intr:
+	leaq ARGOFFSET(%rdi),%rsp /*todo This needs CFI annotation! */
+exit_intr:
 	GET_THREAD_INFO(%rcx)
 	testl $3,CS-ARGOFFSET(%rsp)
 	je retint_kernel
@@ -453,9 +527,10 @@ exit_intr:
  */
retint_with_reschedule:
 	movl $_TIF_WORK_MASK,%edi
-retint_check:
+retint_check:
 	movl threadinfo_flags(%rcx),%edx
 	andl %edi,%edx
+	CFI_REMEMBER_STATE
 	jnz  retint_careful
retint_swapgs:
 	swapgs
@@ -476,14 +551,17 @@ bad_iret:
 	jmp do_exit
 	.previous
 
-	/* edi: workmask, edx: work */
+	/* edi: workmask, edx: work */
retint_careful:
+	CFI_RESTORE_STATE
 	bt    $TIF_NEED_RESCHED,%edx
 	jnc   retint_signal
 	sti
 	pushq %rdi
+	CFI_ADJUST_CFA_OFFSET	8
 	call  schedule
 	popq %rdi
+	CFI_ADJUST_CFA_OFFSET	-8
 	GET_THREAD_INFO(%rcx)
 	cli
 	jmp retint_check
@@ -523,7 +601,9 @@ retint_kernel:
  * APIC interrupts.
  */
 	.macro apicinterrupt num,func
+	INTR_FRAME
 	pushq $\num-256
+	CFI_ADJUST_CFA_OFFSET 8
 	interrupt \func
 	jmp ret_from_intr
 	CFI_ENDPROC
@@ -536,8 +616,19 @@ ENTRY(thermal_interrupt)
ENTRY(reschedule_interrupt)
 	apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
 
-ENTRY(invalidate_interrupt)
-	apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt
+	.macro INVALIDATE_ENTRY num
+ENTRY(invalidate_interrupt\num)
+	apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
+	.endm
+
+	INVALIDATE_ENTRY 0
+	INVALIDATE_ENTRY 1
+	INVALIDATE_ENTRY 2
+	INVALIDATE_ENTRY 3
+	INVALIDATE_ENTRY 4
+	INVALIDATE_ENTRY 5
+	INVALIDATE_ENTRY 6
+	INVALIDATE_ENTRY 7
 
ENTRY(call_function_interrupt)
 	apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
@@ -558,16 +649,23 @@ ENTRY(spurious_interrupt)
  * Exception entry points.
  */
 	.macro zeroentry sym
+	INTR_FRAME
 	pushq $0	/* push error code/oldrax */
+	CFI_ADJUST_CFA_OFFSET 8
 	pushq %rax	/* push real oldrax to the rdi slot */
+	CFI_ADJUST_CFA_OFFSET 8
 	leaq  \sym(%rip),%rax
 	jmp error_entry
+	CFI_ENDPROC
 	.endm
 
 	.macro errorentry sym
+	XCPT_FRAME
 	pushq %rax
+	CFI_ADJUST_CFA_OFFSET 8
 	leaq  \sym(%rip),%rax
 	jmp error_entry
+	CFI_ENDPROC
 	.endm
 
 	/* error code is on the stack already */
@@ -594,10 +692,7 @@ ENTRY(spurious_interrupt)
  * and the exception handler in %rax.
  */
ENTRY(error_entry)
-	CFI_STARTPROC	simple
-	CFI_DEF_CFA	rsp,(SS-RDI)
-	CFI_REL_OFFSET	rsp,(RSP-RDI)
-	CFI_REL_OFFSET	rip,(RIP-RDI)
+	_frame RDI
 	/* rdi slot contains rax, oldrax contains error code */
 	cld
 	subq  $14*8,%rsp
@@ -679,7 +774,9 @@ error_kernelspace:
 	/* Reload gs selector with exception handling */
 	/* edi:  new selector */
ENTRY(load_gs_index)
+	CFI_STARTPROC
 	pushf
+	CFI_ADJUST_CFA_OFFSET 8
 	cli
 	swapgs
gs_change:
@@ -687,7 +784,9 @@ gs_change:
2:	mfence		/* workaround */
 	swapgs
 	popf
+	CFI_ADJUST_CFA_OFFSET -8
 	ret
+	CFI_ENDPROC
 
 	.section __ex_table,"a"
 	.align 8
@@ -799,7 +898,7 @@ ENTRY(device_not_available)
 
 	/* runs on exception stack */
KPROBE_ENTRY(debug)
-	CFI_STARTPROC
+	INTR_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_debug
@@ -809,9 +908,9 @@ KPROBE_ENTRY(debug)
 
 	/* runs on exception stack */
ENTRY(nmi)
-	CFI_STARTPROC
+	INTR_FRAME
 	pushq $-1
-	CFI_ADJUST_CFA_OFFSET	8
+	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_nmi
 	/*
 	 * "Paranoid" exit path from exception stack.
@@ -877,7 +976,7 @@ ENTRY(reserved)
 
 	/* runs on exception stack */
ENTRY(double_fault)
-	CFI_STARTPROC
+	XCPT_FRAME
 	paranoidentry do_double_fault
 	jmp paranoid_exit
 	CFI_ENDPROC
@@ -890,7 +989,7 @@ ENTRY(segment_not_present)
 
 	/* runs on exception stack */
ENTRY(stack_segment)
-	CFI_STARTPROC
+	XCPT_FRAME
 	paranoidentry do_stack_segment
 	jmp paranoid_exit
 	CFI_ENDPROC
@@ -911,7 +1010,7 @@ ENTRY(spurious_interrupt_bug)
#ifdef CONFIG_X86_MCE
 	/* runs on exception stack */
ENTRY(machine_check)
-	CFI_STARTPROC
+	INTR_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_machine_check
@@ -923,14 +1022,19 @@ ENTRY(call_debug)
 	zeroentry do_call_debug
 
ENTRY(call_softirq)
+	CFI_STARTPROC
 	movq %gs:pda_irqstackptr,%rax
 	pushq %r15
+	CFI_ADJUST_CFA_OFFSET 8
 	movq %rsp,%r15
+	CFI_DEF_CFA_REGISTER	r15
 	incl %gs:pda_irqcount
 	cmove %rax,%rsp
 	call __do_softirq
 	movq %r15,%rsp
+	CFI_DEF_CFA_REGISTER	rsp
 	decl %gs:pda_irqcount
 	popq %r15
+	CFI_ADJUST_CFA_OFFSET -8
 	ret
-
+	CFI_ENDPROC

diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index f062aa03bab7..7a64ea181788 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -20,7 +20,7 @@
 #include <asm/smp.h>
 #include <asm/ipi.h>
 
-#if defined(CONFIG_ACPI_BUS)
+#if defined(CONFIG_ACPI)
 #include <acpi/acpi_bus.h>
 #endif
 
@@ -45,9 +45,9 @@ void __init clustered_apic_check(void)
 	u8 clusters, max_cluster;
 	u8 id;
 	u8 cluster_cnt[NUM_APIC_CLUSTERS];
-	int num_cpus = 0;
+	int max_apic = 0;
 
-#if defined(CONFIG_ACPI_BUS)
+#if defined(CONFIG_ACPI)
 	/*
 	 * Some x86_64 machines use physical APIC mode regardless of how many
 	 * procs/clusters are present (x86_64 ES7000 is an example).
@@ -64,14 +64,15 @@ void __init clustered_apic_check(void)
 		id = bios_cpu_apicid[i];
 		if (id == BAD_APICID)
 			continue;
-		num_cpus++;
+		if (id > max_apic)
+			max_apic = id;
 		cluster_cnt[APIC_CLUSTERID(id)]++;
 	}
 
 	/* Don't use clustered mode on AMD platforms. */
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		genapic = &apic_physflat;
-#ifndef CONFIG_CPU_HOTPLUG
+#ifndef CONFIG_HOTPLUG_CPU
 		/* In the CPU hotplug case we cannot use broadcast mode
 		   because that opens a race when a CPU is removed.
 		   Stay at physflat mode in this case.
 		   Once we have ACPI platform support for CPU hotplug
 		   we should detect hotplug capablity from ACPI tables and
 		   only do this when really needed. -AK */
-		if (num_cpus <= 8)
+		if (max_apic <= 8)
 			genapic = &apic_flat;
 #endif
 		goto print;
@@ -103,9 +104,14 @@ void __init clustered_apic_check(void)
	 * (We don't use lowest priority delivery + HW APIC IRQ steering, so
	 * can ignore the clustered logical case and go straight to physical.)
	 */
-	if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster)
+	if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) {
+#ifdef CONFIG_HOTPLUG_CPU
+		/* Don't use APIC shortcuts in CPU hotplug to avoid races */
+		genapic = &apic_physflat;
+#else
 		genapic = &apic_flat;
-	else
+#endif
+	} else
 		genapic = &apic_cluster;
 
print:

diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c
index f6523dd1bc09..a472d62f899a 100644
--- a/arch/x86_64/kernel/genapic_cluster.c
+++ b/arch/x86_64/kernel/genapic_cluster.c
@@ -51,10 +51,10 @@ static void cluster_init_apic_ldr(void)
 		count = 3;
 	id = my_cluster | (1UL << count);
 	x86_cpu_to_log_apicid[smp_processor_id()] = id;
-	apic_write_around(APIC_DFR, APIC_DFR_CLUSTER);
+	apic_write(APIC_DFR, APIC_DFR_CLUSTER);
 	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
 	val |= SET_APIC_LOGICAL_ID(id);
-	apic_write_around(APIC_LDR, val);
+	apic_write(APIC_LDR, val);
 }
 
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index adc96282a9e2..9da3edb799ea 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -38,10 +38,10 @@ static void flat_init_apic_ldr(void)
 	num = smp_processor_id();
 	id = 1UL << num;
 	x86_cpu_to_log_apicid[num] = id;
-	apic_write_around(APIC_DFR, APIC_DFR_FLAT);
+	apic_write(APIC_DFR, APIC_DFR_FLAT);
 	val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
 	val |= SET_APIC_LOGICAL_ID(id);
-	apic_write_around(APIC_LDR, val);
+	apic_write(APIC_LDR, val);
 }
 
 static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
@@ -62,7 +62,7 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
 	 * prepare target chip field
 	 */
 	cfg = __prepare_ICR2(mask);
-	apic_write_around(APIC_ICR2, cfg);
+	apic_write(APIC_ICR2, cfg);
 
 	/*
 	 * program the ICR
@@ -72,14 +72,24 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
-	apic_write_around(APIC_ICR, cfg);
+	apic_write(APIC_ICR, cfg);
 	local_irq_restore(flags);
 }
 
 static void flat_send_IPI_allbutself(int vector)
 {
+#ifndef CONFIG_HOTPLUG_CPU
 	if (((num_online_cpus()) - 1) >= 1)
 		__send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
+#else
+	cpumask_t allbutme = cpu_online_map;
+	int me = get_cpu(); /* Ensure we are not preempted when we clear */
+	cpu_clear(me, allbutme);
+
+	if (!cpus_empty(allbutme))
+		flat_send_IPI_mask(allbutme, vector);
+	put_cpu();
+#endif
 }
 
 static void flat_send_IPI_all(int vector)
@@ -167,9 +177,9 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
 
 struct genapic apic_physflat =  {
 	.name = "physical flat",
-	.int_delivery_mode = dest_LowestPrio,
+	.int_delivery_mode = dest_Fixed,
 	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
-	.int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_LOWEST,
+	.int_delivery_dest = APIC_DEST_PHYSICAL | APIC_DM_FIXED,
 	.target_cpus = physflat_target_cpus,
 	.apic_id_registered = flat_apic_id_registered,
 	.init_apic_ldr = flat_init_apic_ldr,/*not needed, but shouldn't hurt*/
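The two genapic hunks are connected: flat mode gives each CPU one bit of the 8-bit logical APIC ID (`id = 1UL << num` in `flat_init_apic_ldr()` above), which is why `clustered_apic_check()` can only pick `apic_flat` for small APIC id ranges. A toy illustration of that encoding (plain userspace C, not kernel code):

```c
#include <stdio.h>

/* Flat mode: CPU n owns bit n of the logical APIC ID, so a set of CPUs
 * maps directly onto an 8-bit IPI destination mask - and only 8 CPUs fit. */
static unsigned flat_logical_mask(const int *cpus, int n)
{
	unsigned mask = 0;
	for (int i = 0; i < n; i++)
		mask |= 1u << cpus[i];	/* mirrors id = 1UL << num */
	return mask;
}

int main(void)
{
	int cpus[] = { 0, 2, 3 };
	printf("%#04x\n", flat_logical_mask(cpus, 3));	/* prints 0x0d */
	return 0;
}
```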
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 98ff5eb32b9a..4592bf21fcaf 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -270,7 +270,7 @@ ENTRY(level3_kernel_pgt)
 	.org 0x4000
ENTRY(level2_ident_pgt)
 	/* 40MB for bootup. 	*/
-	.quad	0x0000000000000283
+	.quad	0x0000000000000183
 	.quad	0x0000000000200183
 	.quad	0x0000000000400183
 	.quad	0x0000000000600183

diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index a89169095129..b2a238b5a17e 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -18,15 +18,12 @@
 #include <asm/atomic.h>
 #include <asm/system.h>
 #include <asm/io.h>
-#include <asm/irq.h>
 #include <asm/hw_irq.h>
 #include <asm/pgtable.h>
 #include <asm/delay.h>
 #include <asm/desc.h>
 #include <asm/apic.h>
 
-#include <linux/irq.h>
-
 /*
  * Common place to define all x86 IRQ vectors
  *
@@ -486,7 +483,14 @@ void spurious_interrupt(void);
 void error_interrupt(void);
 void reschedule_interrupt(void);
 void call_function_interrupt(void);
-void invalidate_interrupt(void);
+void invalidate_interrupt0(void);
+void invalidate_interrupt1(void);
+void invalidate_interrupt2(void);
+void invalidate_interrupt3(void);
+void invalidate_interrupt4(void);
+void invalidate_interrupt5(void);
+void invalidate_interrupt6(void);
+void invalidate_interrupt7(void);
 void thermal_interrupt(void);
 void i8254_timer_resume(void);
 
@@ -562,8 +566,15 @@ void __init init_IRQ(void)
 	 */
 	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
-	/* IPI for invalidation */
-	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+	/* IPIs for invalidation */
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
 
 	/* IPI for generic function call */
 	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);

diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c
index c4dc91491157..e0ba5c1043fd 100644
--- a/arch/x86_64/kernel/init_task.c
+++ b/arch/x86_64/kernel/init_task.c
@@ -44,6 +44,6 @@ EXPORT_SYMBOL(init_task);
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
-DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp;
+DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS;
 
 #define ALIGN_TO_4K __attribute__((section(".data.init_task")))

diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index ba1a744e9bf0..522944a000ad 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -21,7 +21,6 @@
 */
 
 #include <linux/mm.h>
-#include <linux/irq.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/delay.h>
@@ -45,6 +44,8 @@ int sis_apic_bug; /* not actually supported, dummy for compile */
 
 static int no_timer_check;
 
+int disable_timer_pin_1 __initdata;
+
 static DEFINE_SPINLOCK(ioapic_lock);
 
 /*
@@ -298,6 +299,15 @@ void __init check_ioapic(void)
#endif
 				/* RED-PEN skip them on mptables too? */
 				return;
+			case PCI_VENDOR_ID_ATI:
+				/* All timer interrupts on atiixp
+				   are doubled. Disable one. */
+				if (disable_timer_pin_1 == 0) {
+					disable_timer_pin_1 = 1;
+					printk(KERN_INFO
+						"ATI board detected. Disabling timer pin 1.\n");
+				}
+				return;
 			}
 
 			/* No multi-function device? */
@@ -1022,13 +1032,11 @@ void __apicdebuginit print_local_APIC(void * dummy)
 	v = apic_read(APIC_TASKPRI);
 	printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
 
-	if (APIC_INTEGRATED(ver)) {			/* !82489DX */
-		v = apic_read(APIC_ARBPRI);
-		printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
-			v & APIC_ARBPRI_MASK);
-		v = apic_read(APIC_PROCPRI);
-		printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
-	}
+	v = apic_read(APIC_ARBPRI);
+	printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+		v & APIC_ARBPRI_MASK);
+	v = apic_read(APIC_PROCPRI);
+	printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
 
 	v = apic_read(APIC_EOI);
 	printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
@@ -1048,12 +1056,8 @@ void __apicdebuginit print_local_APIC(void * dummy)
 	printk(KERN_DEBUG "... APIC IRR field:\n");
 	print_APIC_bitfield(APIC_IRR);
 
-	if (APIC_INTEGRATED(ver)) {		/* !82489DX */
-		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
-			apic_write(APIC_ESR, 0);
-		v = apic_read(APIC_ESR);
-		printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
-	}
+	v = apic_read(APIC_ESR);
+	printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
 
 	v = apic_read(APIC_ICR);
 	printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
@@ -1167,9 +1171,9 @@ void disable_IO_APIC(void)
 	clear_IO_APIC();
 
 	/*
-	 * If the i82559 is routed through an IOAPIC
+	 * If the i8259 is routed through an IOAPIC
 	 * Put that IOAPIC in virtual wire mode
-	 * so legacy interrups can be delivered.
+	 * so legacy interrupts can be delivered.
 	 */
 	pin = find_isa_irq_pin(0, mp_ExtINT);
 	if (pin != -1) {
@@ -1665,6 +1669,8 @@ static inline void check_timer(void)
 				setup_nmi();
 				enable_8259A_irq(0);
 			}
+			if (disable_timer_pin_1 > 0)
+				clear_IO_APIC_pin(0, pin1);
 			return;
 		}
 		clear_IO_APIC_pin(0, pin1);
@@ -1864,7 +1870,7 @@ device_initcall(ioapic_init_sysfs);
                           ACPI-based IOAPIC Configuration
   -------------------------------------------------------------------------- */
 
-#ifdef CONFIG_ACPI_BOOT
+#ifdef CONFIG_ACPI
 
 #define IO_APIC_MAX_ID		0xFE
@@ -1947,7 +1953,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	return 0;
 }
 
-#endif /*CONFIG_ACPI_BOOT*/
+#endif /* CONFIG_ACPI */
 
 /*
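The `disable_timer_pin_1` flag introduced above is set automatically when an ATI (atiixp) bridge is detected; the setup.c hunk further down also wires it to two boot parameters, so the behaviour can be forced from the command line in either direction:

```
disable_timer_pin_1    # apply the ATI workaround unconditionally
enable_timer_pin_1     # veto the automatic detection (sets the flag to -1)
```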
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 849a20aec7ca..d6a04a8320a3 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -99,7 +99,6 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 	unsigned irq = regs->orig_rax & 0xff;
 
 	irq_enter();
-	BUG_ON(irq > 256);
 
 	__do_IRQ(irq, regs);
 	irq_exit();

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 8aa56736cde3..969365c0771b 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -17,6 +17,7 @@
 #include <linux/fs.h>
 #include <linux/cpu.h>
 #include <linux/percpu.h>
+#include <linux/ctype.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -56,15 +57,19 @@ void mce_log(struct mce *mce)
 	smp_wmb();
 	for (;;) {
 		entry = rcu_dereference(mcelog.next);
-		/* When the buffer fills up discard new entries. Assume
-		   that the earlier errors are the more interesting. */
-		if (entry >= MCE_LOG_LEN) {
-			set_bit(MCE_OVERFLOW, &mcelog.flags);
-			return;
+		for (;;) {
+			/* When the buffer fills up discard new entries. Assume
+			   that the earlier errors are the more interesting. */
+			if (entry >= MCE_LOG_LEN) {
+				set_bit(MCE_OVERFLOW, &mcelog.flags);
+				return;
+			}
+			/* Old left over entry. Skip. */
+			if (mcelog.entry[entry].finished) {
+				entry++;
+				continue;
+			}
 		}
-		/* Old left over entry. Skip. */
-		if (mcelog.entry[entry].finished)
-			continue;
 		smp_rmb();
 		next = entry + 1;
 		if (cmpxchg(&mcelog.next, entry, next) == entry)
@@ -404,9 +409,15 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff
 	}
 
 	err = 0;
-	for (i = 0; i < next; i++) {
-		if (!mcelog.entry[i].finished)
-			continue;
+	for (i = 0; i < next; i++) {
+		unsigned long start = jiffies;
+		while (!mcelog.entry[i].finished) {
+			if (!time_before(jiffies, start + 2)) {
+				memset(mcelog.entry + i,0, sizeof(struct mce));
+				continue;
+			}
+			cpu_relax();
+		}
 		smp_rmb();
 		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
 		buf += sizeof(struct mce);
@@ -479,6 +490,7 @@ static int __init mcheck_disable(char *str)
 
 /* mce=off disables machine check. Note you can reenable it later
    using sysfs.
+   mce=TOLERANCELEVEL (number, see above)
    mce=bootlog Log MCEs from before booting. Disabled by default to work
    around buggy BIOS that leave bogus MCEs.  */
 static int __init mcheck_enable(char *str)
@@ -489,6 +501,8 @@ static int __init mcheck_enable(char *str)
 		mce_dont_init = 1;
 	else if (!strcmp(str, "bootlog"))
 		mce_bootlog = 1;
+	else if (isdigit(str[0]))
+		get_option(&str, &tolerant);
 	else
 		printk("mce= argument %s ignored. Please use /sys", str);
 	return 0;
@@ -501,10 +515,12 @@ __setup("mce", mcheck_enable);
 * Sysfs support
 */
 
-/* On resume clear all MCE state. Don't want to see leftovers from the BIOS. */
+/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
+   Only one CPU is active at this time, the others get readded later using
+   CPU hotplug. */
 static int mce_resume(struct sys_device *dev)
 {
-	on_each_cpu(mce_init, NULL, 1, 1);
+	mce_init(NULL);
 	return 0;
 }
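The reworked `mce_log()` above claims a ring-buffer slot without a lock: read `mcelog.next`, try to advance it with `cmpxchg`, retry if another CPU raced in, and publish the record by setting `finished` last. A compressed userspace model of that claim step, using C11 atomics in place of the kernel's `cmpxchg` (the kernel version additionally skips stale finished entries; types and sizes here are illustrative):

```c
#include <stdatomic.h>
#include <stdio.h>

#define LOG_LEN 32
struct rec { int finished; long data; };

static struct rec ring[LOG_LEN];
static atomic_uint next_free;

/* Returns a claimed slot index, or -1 when the buffer is full. */
static int log_claim(void)
{
	unsigned entry = atomic_load(&next_free);
	for (;;) {
		if (entry >= LOG_LEN)
			return -1;	/* full: discard new entries */
		/* try to advance next_free from entry to entry + 1;
		   on failure `entry` is reloaded and we simply retry */
		if (atomic_compare_exchange_weak(&next_free, &entry, entry + 1))
			return (int)entry;
	}
}

int main(void)
{
	int slot = log_claim();
	if (slot >= 0) {
		ring[slot].data = 42;
		ring[slot].finished = 1;	/* publish last */
		printf("claimed slot %d\n", slot);
	}
	return 0;
}
```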
diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c
index 79c362d03e2e..f16d38d09daf 100644
--- a/arch/x86_64/kernel/mpparse.c
+++ b/arch/x86_64/kernel/mpparse.c
@@ -14,7 +14,6 @@
 */
 
 #include <linux/mm.h>
-#include <linux/irq.h>
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/config.h>
@@ -46,8 +45,6 @@ int acpi_found_madt;
 int apic_version [MAX_APICS];
 unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
 int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
-unsigned char pci_bus_to_node [256];
-EXPORT_SYMBOL(pci_bus_to_node);
 
 static int mp_current_pci_id = 0;
 /* I/O APIC entries */
@@ -74,7 +71,7 @@ static unsigned int num_processors = 0;
 physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
 
 /* ACPI MADT entry parsing functions */
-#ifdef CONFIG_ACPI_BOOT
+#ifdef CONFIG_ACPI
 extern struct acpi_boot_flags acpi_boot;
 #ifdef CONFIG_X86_LOCAL_APIC
 extern int acpi_parse_lapic (acpi_table_entry_header *header);
@@ -84,7 +81,7 @@ extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header);
 #ifdef CONFIG_X86_IO_APIC
 extern int acpi_parse_ioapic (acpi_table_entry_header *header);
 #endif /*CONFIG_X86_IO_APIC*/
-#endif /*CONFIG_ACPI_BOOT*/
+#endif /*CONFIG_ACPI*/
 
 u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
 
@@ -519,8 +516,6 @@ void __init get_smp_config (void)
 	struct intel_mp_floating *mpf = mpf_found;
 
 	/*
-	 * ACPI may be used to obtain the entire SMP configuration or just to
-	 * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that
 	 * ACPI supports both logical (e.g. Hyper-Threading) and physical
 	 * processors, where MPS only supports physical.
 	 */
@@ -673,7 +668,7 @@ void __init find_smp_config (void)
                             ACPI-based MP Configuration
    -------------------------------------------------------------------------- */
 
-#ifdef CONFIG_ACPI_BOOT
+#ifdef CONFIG_ACPI
 
 void __init mp_register_lapic_address (
 	u64			address)
@@ -707,7 +702,7 @@ void __init mp_register_lapic (
 	processor.mpc_type = MP_PROCESSOR;
 	processor.mpc_apicid = id;
-	processor.mpc_apicver = 0x10; /* TBD: lapic version */
+	processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
 	processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
 	processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
 	processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
@@ -929,11 +924,9 @@ int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
 	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
 		return gsi;
 
-#ifdef CONFIG_ACPI_BUS
 	/* Don't set up the ACPI SCI because it's already set up */
 	if (acpi_fadt.sci_int == gsi)
 		return gsi;
-#endif
 
 	ioapic = mp_find_ioapic(gsi);
 	if (ioapic < 0) {
@@ -973,13 +966,11 @@ int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
 	if (gsi < MAX_GSI_NUM) {
 		if (gsi > 15)
 			gsi = pci_irq++;
-#ifdef CONFIG_ACPI_BUS
 		/*
 		 * Don't assign IRQ used by ACPI SCI
 		 */
 		if (gsi == acpi_fadt.sci_int)
 			gsi = pci_irq++;
-#endif
 		gsi_to_irq[irq] = gsi;
 	} else {
 		printk(KERN_ERR "GSI %u is too high\n", gsi);
@@ -994,4 +985,4 @@ int mp_register_gsi(u32 gsi, int edge_level, int active_high_low)
 }
 
 #endif /*CONFIG_X86_IO_APIC*/
-#endif /*CONFIG_ACPI_BOOT*/
+#endif /*CONFIG_ACPI*/

diff --git a/arch/x86_64/kernel/msr.c b/arch/x86_64/kernel/msr.c
deleted file mode 100644
index 598953ab0154..000000000000
--- a/arch/x86_64/kernel/msr.c
+++ /dev/null
@@ -1,279 +0,0 @@
-/* ----------------------------------------------------------------------- *
- *
- *   Copyright 2000 H. Peter Anvin - All Rights Reserved
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139,
- *   USA; either version 2 of the License, or (at your option) any later
- *   version; incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * msr.c
- *
- * x86 MSR access device
- *
- * This device is accessed by lseek() to the appropriate register number
- * and then read/write in chunks of 8 bytes.  A larger size means multiple
- * reads or writes of the same register.
- *
- * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on
- * an SMP box will direct the access to CPU %d.
- */
-
-#include <linux/module.h>
-#include <linux/config.h>
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/fcntl.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/smp.h>
-#include <linux/smp_lock.h>
-#include <linux/major.h>
-#include <linux/fs.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
-/* Note: "err" is handled in a funny way below. Otherwise one version
-   of gcc or another breaks. */
-
-static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx)
-{
-	int err;
-
-	asm volatile ("1:	wrmsr\n"
-		      "2:\n"
-		      ".section .fixup,\"ax\"\n"
-		      "3:	movl %4,%0\n"
-		      "	jmp 2b\n"
-		      ".previous\n"
-		      ".section __ex_table,\"a\"\n"
-		      "	.align 8\n" "	.quad 1b,3b\n" ".previous":"=&bDS" (err)
-		      :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0));
-
-	return err;
-}
-
-static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
-{
-	int err;
-
-	asm volatile ("1:	rdmsr\n"
-		      "2:\n"
-		      ".section .fixup,\"ax\"\n"
-		      "3:	movl %4,%0\n"
-		      "	jmp 2b\n"
-		      ".previous\n"
-		      ".section __ex_table,\"a\"\n"
-		      "	.align 8\n"
-		      "	.quad 1b,3b\n"
-		      ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx)
-		      :"c"(reg), "i"(-EIO), "0"(0));
-
-	return err;
-}
-
-#ifdef CONFIG_SMP
-
-struct msr_command {
-	int cpu;
-	int err;
-	u32 reg;
-	u32 data[2];
-};
-
-static void msr_smp_wrmsr(void *cmd_block)
-{
-	struct msr_command *cmd = (struct msr_command *)cmd_block;
-
-	if (cmd->cpu == smp_processor_id())
-		cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
-}
-
-static void msr_smp_rdmsr(void *cmd_block)
-{
-	struct msr_command *cmd = (struct msr_command *)cmd_block;
-
-	if (cmd->cpu == smp_processor_id())
-		cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
-}
-
-static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
-{
-	struct msr_command cmd;
-	int ret;
-
-	preempt_disable();
-	if (cpu == smp_processor_id()) {
-		ret = wrmsr_eio(reg, eax, edx);
-	} else {
-		cmd.cpu = cpu;
-		cmd.reg = reg;
-		cmd.data[0] = eax;
-		cmd.data[1] = edx;
-
-		smp_call_function(msr_smp_wrmsr, &cmd, 1, 1);
-		ret = cmd.err;
-	}
-	preempt_enable();
-	return ret;
-}
-
-static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
-{
-	struct msr_command cmd;
-	int ret;
-
-	preempt_disable();
-	if (cpu == smp_processor_id()) {
-		ret = rdmsr_eio(reg, eax, edx);
-	} else {
-		cmd.cpu = cpu;
-		cmd.reg = reg;
-
-		smp_call_function(msr_smp_rdmsr, &cmd, 1, 1);
-
-		*eax = cmd.data[0];
-		*edx = cmd.data[1];
-
-		ret = cmd.err;
-	}
-	preempt_enable();
-	return ret;
-}
-
-#else				/* ! CONFIG_SMP */
-
-static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
-{
-	return wrmsr_eio(reg, eax, edx);
-}
-
-static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx)
-{
-	return rdmsr_eio(reg, eax, edx);
-}
-
-#endif				/* ! CONFIG_SMP */
-
-static loff_t msr_seek(struct file *file, loff_t offset, int orig)
-{
-	loff_t ret = -EINVAL;
-
-	lock_kernel();
-	switch (orig) {
-	case 0:
-		file->f_pos = offset;
-		ret = file->f_pos;
-		break;
-	case 1:
-		file->f_pos += offset;
-		ret = file->f_pos;
-	}
-	unlock_kernel();
-	return ret;
-}
-
-static ssize_t msr_read(struct file *file, char __user * buf,
-			size_t count, loff_t * ppos)
-{
-	u32 __user *tmp = (u32 __user *) buf;
-	u32 data[2];
-	size_t rv;
-	u32 reg = *ppos;
-	int cpu = iminor(file->f_dentry->d_inode);
-	int err;
-
-	if (count % 8)
-		return -EINVAL;	/* Invalid chunk size */
-
-	for (rv = 0; count; count -= 8) {
-		err = do_rdmsr(cpu, reg, &data[0], &data[1]);
-		if (err)
-			return err;
-		if (copy_to_user(tmp, &data, 8))
-			return -EFAULT;
-		tmp += 2;
-	}
-
-	return ((char __user *)tmp) - buf;
-}
-
-static ssize_t msr_write(struct file *file, const char __user *buf,
-			 size_t count, loff_t *ppos)
-{
-	const u32 __user *tmp = (const u32 __user *)buf;
-	u32 data[2];
-	size_t rv;
-	u32 reg = *ppos;
-	int cpu = iminor(file->f_dentry->d_inode);
-	int err;
-
-	if (count % 8)
-		return -EINVAL;	/* Invalid chunk size */
-
-	for (rv = 0; count; count -= 8) {
-		if (copy_from_user(&data, tmp, 8))
-			return -EFAULT;
-		err = do_wrmsr(cpu, reg, data[0], data[1]);
-		if (err)
-			return err;
-		tmp += 2;
-	}
-
-	return ((char __user *)tmp) - buf;
-}
-
-static int msr_open(struct inode *inode, struct file *file)
-{
-	unsigned int cpu = iminor(file->f_dentry->d_inode);
-	struct cpuinfo_x86 *c = &(cpu_data)[cpu];
-
-	if (cpu >= NR_CPUS || !cpu_online(cpu))
-		return -ENXIO;	/* No such CPU */
-	if (!cpu_has(c, X86_FEATURE_MSR))
-		return -EIO;	/* MSR not supported */
-
-	return 0;
-}
-
-/*
- * File operations we support
- */
-static struct file_operations msr_fops = {
-	.owner = THIS_MODULE,
-	.llseek = msr_seek,
-	.read = msr_read,
-	.write = msr_write,
-	.open = msr_open,
-};
-
-static int __init msr_init(void)
-{
-	if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) {
-		printk(KERN_ERR "msr: unable to get major %d for msr\n",
-		       MSR_MAJOR);
-		return -EBUSY;
-	}
-
-	return 0;
-}
-
-static void __exit msr_exit(void)
-{
-	unregister_chrdev(MSR_MAJOR, "cpu/msr");
-}
-
-module_init(msr_init);
-module_exit(msr_exit)
-
-MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
-MODULE_DESCRIPTION("x86 generic MSR driver");
-MODULE_LICENSE("GPL");
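The deleted driver is not gone: the Makefile hunk at the top of this diff now builds the shared i386 msr.c for x86-64 instead, so the userspace interface the header comment describes is unchanged. A minimal reader following that description (lseek to the register number, then read one 8-byte chunk; MSR 0x10 is the TSC, and root plus the msr device nodes are assumed):

```c
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	/* per-CPU device node; the minor number selects the CPU */
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	/* seek to the MSR number, then read an 8-byte chunk */
	if (lseek(fd, 0x10, SEEK_SET) < 0 || read(fd, &val, 8) != 8) {
		perror("read MSR 0x10 (TSC)");
		close(fd);
		return 1;
	}
	printf("TSC: %#llx\n", (unsigned long long)val);
	close(fd);
	return 0;
}
```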
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index caf164959e19..4388b8a5bae7 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -14,7 +14,6 @@
 
 #include <linux/config.h>
 #include <linux/mm.h>
-#include <linux/irq.h>
 #include <linux/delay.h>
 #include <linux/bootmem.h>
 #include <linux/smp_lock.h>
@@ -488,8 +487,8 @@ void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
 							== NOTIFY_STOP) {
 			local_set(&__get_cpu_var(alert_counter), 0);
 			return;
-		} 
-		die_nmi("NMI Watchdog detected LOCKUP on CPU%d", regs);
+		}
+		die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs);
 	}
 } else {
 	__get_cpu_var(last_irq_sum) = sum;

diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c
index 57f35c68aa34..cf0a0315d586 100644
--- a/arch/x86_64/kernel/pci-gart.c
+++ b/arch/x86_64/kernel/pci-gart.c
@@ -191,11 +191,9 @@ static void *dma_alloc_pages(struct device *dev, unsigned gfp, unsigned order)
 {
 	struct page *page;
 	int node;
-	if (dev->bus == &pci_bus_type) {
-		cpumask_t mask;
-		mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
-		node = cpu_to_node(first_cpu(mask));
-	} else
+	if (dev->bus == &pci_bus_type)
+		node = pcibus_to_node(to_pci_dev(dev)->bus);
+	else
 		node = numa_node_id();
 	page = alloc_pages_node(node, gfp, order);
 	return page ? page_address(page) : NULL;

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 8661f82ac70b..b5a89c0bdf59 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -32,7 +32,6 @@
 #include <linux/a.out.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
-#include <linux/irq.h>
 #include <linux/ptrace.h>
 #include <linux/utsname.h>
 #include <linux/random.h>
@@ -123,6 +122,7 @@ static void poll_idle (void)
 			: :
 			"i" (_TIF_NEED_RESCHED),
 			"m" (current_thread_info()->flags));
+		clear_thread_flag(TIF_POLLING_NRFLAG);
 	} else {
 		set_need_resched();
 	}
@@ -271,8 +271,11 @@ void __show_regs(struct pt_regs * regs)
 
 	printk("\n");
 	print_modules();
-	printk("Pid: %d, comm: %.20s %s %s\n",
-	       current->pid, current->comm, print_tainted(), system_utsname.release);
+	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+	       current->pid, current->comm, print_tainted(),
+	       system_utsname.release,
+	       (int)strcspn(system_utsname.version, " "),
+	       system_utsname.version);
 	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
 	printk_address(regs->rip);
 	printk("\nRSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags);
@@ -483,33 +486,6 @@ out:
 }
 
 /*
- * This function selects if the context switch from prev to next
- * has to tweak the TSC disable bit in the cr4.
- */
-static inline void disable_tsc(struct task_struct *prev_p,
-			       struct task_struct *next_p)
-{
-	struct thread_info *prev, *next;
-
-	/*
-	 * gcc should eliminate the ->thread_info dereference if
-	 * has_secure_computing returns 0 at compile time (SECCOMP=n).
-	 */
-	prev = prev_p->thread_info;
-	next = next_p->thread_info;
-
-	if (has_secure_computing(prev) || has_secure_computing(next)) {
-		/* slow path here */
-		if (has_secure_computing(prev) &&
-		    !has_secure_computing(next)) {
-			write_cr4(read_cr4() & ~X86_CR4_TSD);
-		} else if (!has_secure_computing(prev) &&
-			   has_secure_computing(next))
-			write_cr4(read_cr4() | X86_CR4_TSD);
-	}
-}
-
-/*
  * This special macro can be used to load a debugging register
  */
 #define loaddebug(thread,r) set_debug(thread->debugreg ## r, r)
@@ -627,8 +603,6 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *
 		}
 	}
 
-	disable_tsc(prev_p, next_p);
-
 	return prev_p;
 }

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index b356f8e6adfe..351d8d64c2fb 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -71,7 +71,7 @@ unsigned long mmu_cr4_features;
 int acpi_disabled;
 EXPORT_SYMBOL(acpi_disabled);
-#ifdef CONFIG_ACPI_BOOT
+#ifdef CONFIG_ACPI
 extern int __initdata acpi_ht;
 extern acpi_interrupt_flags acpi_sci_flags;
 int __initdata acpi_force = 0;
@@ -294,7 +294,7 @@ static __init void parse_cmdline_early (char ** cmdline_p)
 			maxcpus = simple_strtoul(from + 8, NULL, 0);
 		}
 #endif
-#ifdef CONFIG_ACPI_BOOT
+#ifdef CONFIG_ACPI
 		/* "acpi=off" disables both ACPI table parsing and interpreter init */
 		if (!memcmp(from, "acpi=off", 8))
 			disable_acpi();
@@ -336,6 +336,11 @@ static __init void parse_cmdline_early (char ** cmdline_p)
 #endif
 #endif
 
+		if (!memcmp(from, "disable_timer_pin_1", 19))
+			disable_timer_pin_1 = 1;
+		if (!memcmp(from, "enable_timer_pin_1", 18))
+			disable_timer_pin_1 = -1;
+
 		if (!memcmp(from, "nolapic", 7) ||
 		    !memcmp(from, "disableapic", 11))
 			disable_apic = 1;
@@ -566,7 +571,7 @@ void __init setup_arch(char **cmdline_p)
 
 	init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
 
-#ifdef CONFIG_ACPI_BOOT
+#ifdef CONFIG_ACPI
 	/*
 	 * Initialize the ACPI boot-time table parser (gets the RSDP and SDT).
 	 * Call this early for SRAT node setup.
@@ -658,7 +663,7 @@ void __init setup_arch(char **cmdline_p)
 
 	check_ioapic();
 
-#ifdef CONFIG_ACPI_BOOT
+#ifdef CONFIG_ACPI
 	/*
 	 * Read APIC and some other early information from ACPI tables.
 	 */
@@ -755,6 +760,24 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 	}
 }
 
+#ifdef CONFIG_NUMA
+static int nearby_node(int apicid)
+{
+	int i;
+	for (i = apicid - 1; i >= 0; i--) {
+		int node = apicid_to_node[i];
+		if (node != NUMA_NO_NODE && node_online(node))
+			return node;
+	}
+	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+		int node = apicid_to_node[i];
+		if (node != NUMA_NO_NODE && node_online(node))
+			return node;
+	}
+	return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
 /*
  * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
  * Assumes number of cores is a power of two.
@@ -763,8 +786,11 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
 	int cpu = smp_processor_id();
-	int node = 0;
 	unsigned bits;
+#ifdef CONFIG_NUMA
+	int node = 0;
+	unsigned apicid = phys_proc_id[cpu];
+#endif
 
 	bits = 0;
 	while ((1 << bits) < c->x86_num_cores)
@@ -776,20 +802,32 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c)
 	phys_proc_id[cpu] >>= bits;
 
 #ifdef CONFIG_NUMA
-	/* When an ACPI SRAT table is available use the mappings from SRAT
-	   instead. */
-	if (acpi_numa <= 0) {
-		node = phys_proc_id[cpu];
-		if (!node_online(node))
-			node = first_node(node_online_map);
-		cpu_to_node[cpu] = node;
-	} else {
-		node = cpu_to_node[cpu];
-	}
+	node = phys_proc_id[cpu];
+	if (apicid_to_node[apicid] != NUMA_NO_NODE)
+		node = apicid_to_node[apicid];
+	if (!node_online(node)) {
+		/* Two possibilities here:
+		   - The CPU is missing memory and no node was created.
+		     In that case try picking one from a nearby CPU
+		   - The APIC IDs differ from the HyperTransport node IDs
+		     which the K8 northbridge parsing fills in.
+		     Assume they are all increased by a constant offset,
+		     but in the same order as the HT nodeids.
+		     If that doesn't result in a usable node fall back to the
+		     path for the previous case. */
+		int ht_nodeid = apicid - (phys_proc_id[0] << bits);
+		if (ht_nodeid >= 0 &&
+		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+			node = apicid_to_node[ht_nodeid];
+		/* Pick a nearby node */
+		if (!node_online(node))
+			node = nearby_node(apicid);
+	}
+	cpu_to_node[cpu] = node;
+
+	printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
+	       cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
 #endif
-
-	printk(KERN_INFO "CPU %d(%d) -> Node %d -> Core %d\n",
-	       cpu, c->x86_num_cores, node, cpu_core_id[cpu]);
 #endif
 }
 
@@ -909,6 +947,25 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
 		return 1;
 }
 
+static void srat_detect_node(void)
+{
+#ifdef CONFIG_NUMA
+	unsigned apicid, node;
+	int cpu = smp_processor_id();
+
+	/* Don't do the funky fallback heuristics the AMD version employs
+	   for now. */
+	apicid = phys_proc_id[cpu];
+	node = apicid_to_node[apicid];
+	if (node == NUMA_NO_NODE)
+		node = 0;
+	cpu_to_node[cpu] = node;
+
+	if (acpi_numa > 0)
+		printk(KERN_INFO "CPU %d -> Node %d\n", cpu, node);
+#endif
+}
+
 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 {
 	/* Cache sizes */
@@ -927,9 +984,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	if (c->x86 >= 15)
 		set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
 	c->x86_num_cores = intel_num_cpu_cores(c);
+
+	srat_detect_node();
 }
 
-void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
 	char *v = c->x86_vendor_id;
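The reworked `amd_detect_cmp()` above splits the APIC id: the low `log2(cores)` bits select the core, and the remaining high bits identify the physical package. A worked sketch of just that bit-splitting, with the NUMA fallback heuristics left out (illustrative values, plain userspace C):

```c
#include <stdio.h>

/* On AMD dual core, the low log2(num_cores) bits of the APIC id select
 * the core; the remaining high bits identify the physical package. */
static void split_apicid(unsigned apicid, unsigned num_cores)
{
	unsigned bits = 0;
	while ((1u << bits) < num_cores)
		bits++;
	printf("apicid %u -> package %u, core %u\n",
	       apicid, apicid >> bits, apicid & ((1u << bits) - 1));
}

int main(void)
{
	split_apicid(5, 2);	/* package 2, core 1 */
	split_apicid(4, 2);	/* package 2, core 0 */
	return 0;
}
```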
* * Optimizations Manfred Spraul <manfred@colorfullife.com> + * + * More scalable flush, from Andi Kleen + * + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into + * the right per cpu variable for the flush data. + * + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. + * In future when interrupts are split into per CPU domains this could be + * fixed, at the cost of triggering multiple IPIs in some cases. */ -static cpumask_t flush_cpumask; -static struct mm_struct * flush_mm; -static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +union smp_flush_state { + struct { + cpumask_t flush_cpumask; + struct mm_struct *flush_mm; + unsigned long flush_va; #define FLUSH_ALL -1ULL + spinlock_t tlbstate_lock; + }; + char pad[SMP_CACHE_BYTES]; +} ____cacheline_aligned; + +/* State is put into the per CPU data section, but padded + to a full cache line because other CPUs can access it and we don't + want false sharing in the per cpu data segment. */ +static DEFINE_PER_CPU(union smp_flush_state, flush_state); /* * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. */ -static inline void leave_mm (unsigned long cpu) +static inline void leave_mm(int cpu) { if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); @@ -101,15 +124,25 @@ static inline void leave_mm (unsigned long cpu) * * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. * 2) Leave the mm if we are in the lazy tlb mode. + * + * Interrupts are disabled. */ -asmlinkage void smp_invalidate_interrupt (void) +asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) { - unsigned long cpu; + int cpu; + int sender; + union smp_flush_state *f; - cpu = get_cpu(); + cpu = smp_processor_id(); + /* + * orig_rax contains the interrupt vector - 256. + * Use that to determine where the sender put the data. + */ + sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START; + f = &per_cpu(flush_state, sender); - if (!cpu_isset(cpu, flush_cpumask)) + if (!cpu_isset(cpu, f->flush_cpumask)) goto out; /* * This was a BUG() but until someone can quote me the @@ -120,64 +153,63 @@ asmlinkage void smp_invalidate_interrupt (void) * BUG(); */ - if (flush_mm == read_pda(active_mm)) { + if (f->flush_mm == read_pda(active_mm)) { if (read_pda(mmu_state) == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) + if (f->flush_va == FLUSH_ALL) local_flush_tlb(); else - __flush_tlb_one(flush_va); + __flush_tlb_one(f->flush_va); } else leave_mm(cpu); } out: ack_APIC_irq(); - cpu_clear(cpu, flush_cpumask); - put_cpu_no_resched(); + cpu_clear(cpu, f->flush_cpumask); } static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, unsigned long va) { - cpumask_t tmp; - /* - * A couple of (to be removed) sanity checks: - * - * - we do not send IPIs to not-yet booted CPUs. - * - current CPU must not be in mask - * - mask must exist :) - */ - BUG_ON(cpus_empty(cpumask)); - cpus_and(tmp, cpumask, cpu_online_map); - BUG_ON(!cpus_equal(tmp, cpumask)); - BUG_ON(cpu_isset(smp_processor_id(), cpumask)); - if (!mm) - BUG(); + int sender; + union smp_flush_state *f; - /* - * I'm not happy about this global shared spinlock in the - * MM hot path, but we'll see how contended it is. - * Temporarily this turns IRQs off, so that lockups are - * detected by the NMI watchdog. 
- */ - spin_lock(&tlbstate_lock); - - flush_mm = mm; - flush_va = va; - cpus_or(flush_cpumask, cpumask, flush_cpumask); + /* Caller has disabled preemption */ + sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; + f = &per_cpu(flush_state, sender); + + /* Could avoid this lock when + num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + probably not worth checking this for a cache-hot lock. */ + spin_lock(&f->tlbstate_lock); + + f->flush_mm = mm; + f->flush_va = va; + cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask); /* * We have to send the IPI only to * CPUs affected. */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); + send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender); + + while (!cpus_empty(f->flush_cpumask)) + cpu_relax(); - while (!cpus_empty(flush_cpumask)) - mb(); /* nothing. lockup detection does not belong here */; + f->flush_mm = NULL; + f->flush_va = 0; + spin_unlock(&f->tlbstate_lock); +} - flush_mm = NULL; - flush_va = 0; - spin_unlock(&tlbstate_lock); +int __cpuinit init_smp_flush(void) +{ + int i; + for_each_cpu_mask(i, cpu_possible_map) { + spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i)); + } + return 0; } + +core_initcall(init_smp_flush); void flush_tlb_current_task(void) { @@ -295,8 +327,11 @@ void unlock_ipi_call_lock(void) /* * this function sends a 'generic call function' IPI to one other CPU * in the system. + * + * cpu is a standard Linux logical CPU number. */ -static void __smp_call_function_single (int cpu, void (*func) (void *info), void *info, +static void +__smp_call_function_single(int cpu, void (*func) (void *info), void *info, int nonatomic, int wait) { struct call_data_struct data; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 90aeccd15190..e12d7baeb33e 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -44,7 +44,6 @@ #include <linux/mm.h> #include <linux/kernel_stat.h> #include <linux/smp_lock.h> -#include <linux/irq.h> #include <linux/bootmem.h> #include <linux/thread_info.h> #include <linux/module.h> @@ -58,6 +57,8 @@ #include <asm/tlbflush.h> #include <asm/proto.h> #include <asm/nmi.h> +#include <asm/irq.h> +#include <asm/hw_irq.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -413,8 +414,13 @@ void __cpuinit smp_callin(void) /* * Get our bogomips. + * + * Need to enable IRQs because it can take longer and then + * the NMI watchdog might kill us. 
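/*
 * Sketch (editor's illustration, not part of the patch): the completion
 * handshake in flush_tlb_others(), modelled with C11 atomics in
 * userspace.  The sender ORs the target mask in, each receiver clears
 * its own bit once its flush is done, and the sender spins until the
 * mask drains -- no ack IPI and no global lock.
 */
#include <stdatomic.h>

static atomic_ulong flush_cpumask;       /* stands in for f->flush_cpumask */

static void sender_flush(unsigned long targets)
{
	atomic_fetch_or(&flush_cpumask, targets);
	/* ... send_IPI_mask(targets, vector) would go here ... */
	while (atomic_load(&flush_cpumask) != 0)
		;                        /* cpu_relax() in the kernel */
}

static void receiver_done(int cpu)       /* tail of the interrupt handler */
{
	atomic_fetch_and(&flush_cpumask, ~(1UL << cpu));
}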
*/ + local_irq_enable(); calibrate_delay(); + local_irq_disable(); Dprintk("Stack at about %p\n",&cpuid); disable_APIC_timer(); @@ -540,8 +546,8 @@ static void inquire_remote_apic(int apicid) */ apic_wait_icr_idle(); - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); timeout = 0; do { @@ -574,12 +580,12 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta /* * Turn INIT on target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* * Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); @@ -595,10 +601,10 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta Dprintk("Deasserting INIT.\n"); /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); timeout = 0; @@ -610,16 +616,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta atomic_set(&init_deasserted, 1); - /* - * Should we send STARTUP IPIs ? - * - * Determine this based on the APIC version. - * If we don't have an integrated APIC, don't send the STARTUP IPIs. - */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) - num_starts = 2; - else - num_starts = 0; + num_starts = 2; /* * Run STARTUP IPI loop. @@ -640,12 +637,11 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta */ /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_rip >> 12)); + apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12)); /* * Give the other CPU some time to accept the IPI. @@ -864,17 +860,6 @@ static __cpuinit void smp_cleanup_boot(void) * Reset trampoline flag */ *((volatile int *) phys_to_virt(0x467)) = 0; - -#ifndef CONFIG_HOTPLUG_CPU - /* - * Free pages reserved for SMP bootup. - * When you add hotplug CPU support later remove this - * Note there is more work to be done for later CPU bootup. - */ - - free_page((unsigned long) __va(PAGE_SIZE)); - free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE)); -#endif } /* @@ -894,23 +879,6 @@ static __init void disable_smp(void) cpu_set(0, cpu_core_map[0]); } -/* - * Handle user cpus=... parameter. - */ -static __init void enforce_max_cpus(unsigned max_cpus) -{ - int i, k; - k = 0; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_possible(i)) - continue; - if (++k > max_cpus) { - cpu_clear(i, cpu_possible_map); - cpu_clear(i, cpu_present_map); - } - } -} - #ifdef CONFIG_HOTPLUG_CPU /* * cpu_possible_map should be static, it cannot change as cpu's @@ -999,8 +967,6 @@ void __init smp_prepare_cpus(unsigned int max_cpus) current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? 
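/*
 * Sketch (editor's illustration, not part of the patch): the
 * INIT/STARTUP sequence that wakeup_secondary_via_INIT() now sends
 * unconditionally, since every x86-64 local APIC is integrated.  The
 * APIC_* values follow the kernel's apicdef.h; apic_write() is stubbed
 * so this compiles on its own, and the delays/ICR-idle polling between
 * writes are omitted.
 */
#include <stdio.h>

#define APIC_ICR            0x300
#define APIC_ICR2           0x310
#define APIC_INT_LEVELTRIG  0x08000
#define APIC_INT_ASSERT     0x04000
#define APIC_DM_INIT        0x00500
#define APIC_DM_STARTUP     0x00600
#define SET_APIC_DEST_FIELD(x) ((x) << 24)

static void apic_write(unsigned reg, unsigned val)
{
	printf("APIC[%03x] <- %08x\n", reg, val);      /* stub */
}

static void kick_secondary(int phys_apicid, unsigned long start_rip)
{
	/* Assert INIT on the target, then deassert it. */
	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
	apic_write(APIC_ICR,
		   APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
	apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);

	/* Two STARTUP IPIs; the vector field carries start_rip's page. */
	for (int i = 0; i < 2; i++) {
		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
		apic_write(APIC_ICR,
			   APIC_DM_STARTUP | (unsigned)(start_rip >> 12));
	}
}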
*/ - enforce_max_cpus(max_cpus); - #ifdef CONFIG_HOTPLUG_CPU prefill_possible_map(); #endif diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 0612640d91b1..ebb9abf3ce6d 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -8,23 +8,8 @@ */ #include <linux/config.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/spinlock.h> -#include <linux/poll.h> -#include <linux/delay.h> -#include <linux/sysrq.h> -#include <linux/proc_fs.h> -#include <linux/irq.h> -#include <linux/pm.h> -#include <linux/device.h> +#include <linux/smp.h> #include <linux/suspend.h> -#include <asm/uaccess.h> -#include <asm/acpi.h> -#include <asm/tlbflush.h> -#include <asm/io.h> #include <asm/proto.h> struct saved_context saved_context; diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S index 53f8e1659511..4d659e97df10 100644 --- a/arch/x86_64/kernel/suspend_asm.S +++ b/arch/x86_64/kernel/suspend_asm.S @@ -14,7 +14,7 @@ #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page.h> -#include <asm/offset.h> +#include <asm/asm-offsets.h> ENTRY(swsusp_arch_suspend) diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 7b6abe058253..2373cb8b8625 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -18,7 +18,6 @@ #include <linux/interrupt.h> #include <linux/init.h> #include <linux/mc146818rtc.h> -#include <linux/irq.h> #include <linux/time.h> #include <linux/ioport.h> #include <linux/module.h> @@ -937,7 +936,6 @@ void __init time_init(void) vxtime.mode = VXTIME_TSC; vxtime.quot = (1000000L << 32) / vxtime_hz; vxtime.tsc_quot = (1000L << 32) / cpu_khz; - vxtime.hz = vxtime_hz; rdtscll_sync(&vxtime.last_tsc); setup_irq(0, &irq0); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index f238d6078a5a..b5e09e6b5536 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -47,9 +47,6 @@ #include <asm/proto.h> #include <asm/nmi.h> -#include <linux/irq.h> - - extern struct gate_struct idt_table[256]; asmlinkage void divide_error(void); @@ -324,13 +321,13 @@ void handle_BUG(struct pt_regs *regs) if (__copy_from_user(&f, (struct bug_frame *) regs->rip, sizeof(struct bug_frame))) return; - if ((unsigned long)f.filename < __PAGE_OFFSET || + if (f.filename >= 0 || f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) return; - if (__get_user(tmp, f.filename)) - f.filename = "unmapped filename"; + if (__get_user(tmp, (char *)(long)f.filename)) + f.filename = (int)(long)"unmapped filename"; printk("----------- [cut here ] --------- [please bite here ] ---------\n"); - printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line); + printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", (char *)(long)f.filename, f.line); } #ifdef CONFIG_BUG @@ -343,30 +340,33 @@ void out_of_line_bug(void) static DEFINE_SPINLOCK(die_lock); static int die_owner = -1; -void oops_begin(void) +unsigned long oops_begin(void) { - int cpu = safe_smp_processor_id(); - /* racy, but better than risking deadlock. */ - local_irq_disable(); + int cpu = safe_smp_processor_id(); + unsigned long flags; + + /* racy, but better than risking deadlock. */ + local_irq_save(flags); if (!spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. 
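/*
 * Sketch (editor's illustration, not part of the patch): why
 * handle_BUG() can keep the filename in 32 bits and reject bad frames
 * with a sign test.  Kernel text sits at the top of the x86-64 address
 * space, so the truncated pointer is negative when read as an int, and
 * sign extension through (long) recovers the full 64-bit address.
 * decode_filename() is an illustrative helper, not a kernel function.
 */
static const char *decode_filename(int stored)
{
	if (stored >= 0)        /* cannot be a kernel string pointer */
		return 0;
	return (const char *)(long)stored;  /* sign-extend back to 64 bit */
}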
should stop eventually */; else - spin_lock(&die_lock); + spin_lock(&die_lock); } - die_owner = cpu; + die_owner = cpu; console_verbose(); - bust_spinlocks(1); + bust_spinlocks(1); + return flags; } -void oops_end(void) +void oops_end(unsigned long flags) { die_owner = -1; - bust_spinlocks(0); - spin_unlock(&die_lock); + bust_spinlocks(0); + spin_unlock_irqrestore(&die_lock, flags); if (panic_on_oops) - panic("Oops"); -} + panic("Oops"); +} void __die(const char * str, struct pt_regs * regs, long err) { @@ -392,10 +392,11 @@ void __die(const char * str, struct pt_regs * regs, long err) void die(const char * str, struct pt_regs * regs, long err) { - oops_begin(); + unsigned long flags = oops_begin(); + handle_BUG(regs); __die(str, regs, err); - oops_end(); + oops_end(flags); do_exit(SIGSEGV); } static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) @@ -406,7 +407,8 @@ static inline void die_if_kernel(const char * str, struct pt_regs * regs, long e void die_nmi(char *str, struct pt_regs *regs) { - oops_begin(); + unsigned long flags = oops_begin(); + /* * We are in trouble anyway, lets at least try * to get a message out. @@ -416,7 +418,7 @@ void die_nmi(char *str, struct pt_regs *regs) if (panic_on_timeout || panic_on_oops) panic("nmi watchdog"); printk("console shuts up ...\n"); - oops_end(); + oops_end(flags); do_exit(SIGSEGV); } @@ -790,13 +792,16 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) */ cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { + switch (swd & ~cwd & 0x3f) { case 0x000: default: break; case 0x001: /* Invalid Op */ - case 0x041: /* Stack Fault */ - case 0x241: /* Stack Fault | Direction */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ info.si_code = FPE_FLTINV; break; case 0x002: /* Denormalize */ diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index d4abb07af52d..6dd642cad2ef 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -194,20 +194,7 @@ SECTIONS #endif } - /* DWARF 2 */ - .debug_info 0 : { *(.debug_info) } - .debug_abbrev 0 : { *(.debug_abbrev) } - .debug_line 0 : { *(.debug_line) } - .debug_frame 0 : { *(.debug_frame) } - .debug_str 0 : { *(.debug_str) } - .debug_loc 0 : { *(.debug_loc) } - .debug_macinfo 0 : { *(.debug_macinfo) } - /* SGI/MIPS DWARF 2 extensions */ - .debug_weaknames 0 : { *(.debug_weaknames) } - .debug_funcnames 0 : { *(.debug_funcnames) } - .debug_typenames 0 : { *(.debug_typenames) } - .debug_varnames 0 : { *(.debug_varnames) } - - - .comment 0 : { *(.comment) } + STABS_DEBUG + + DWARF_DEBUG } diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 2e5734425949..70a0bd16085f 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -107,7 +107,7 @@ static force_inline long time_syscall(long *t) return secs; } -static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) +int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { if (unlikely(!__sysctl_vsyscall)) return gettimeofday(tv,tz); @@ -120,7 +120,7 @@ static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz /* This will break when the xtime seconds get inaccurate, but that is * unlikely */ -static time_t __vsyscall(1) vtime(time_t *t) +time_t __vsyscall(1) vtime(time_t *t) { if (unlikely(!__sysctl_vsyscall)) return 
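/*
 * Sketch (editor's illustration, not part of the patch): the reworked
 * x87 decode in do_coprocessor_error().  An exception is pending when
 * its status-word bit is set and the matching control-word mask bit is
 * clear, so "swd & ~cwd & 0x3f" isolates the unmasked ones; the
 * stack-fault bits (0x240) only distinguish under/overflow of an
 * invalid op, the signal code is the same.  x87_si_code() is an
 * illustrative wrapper around the same switch.
 */
#include <signal.h>                      /* FPE_FLT* si_code values */

static int x87_si_code(unsigned short cwd, unsigned short swd)
{
	switch (swd & ~cwd & 0x3f) {
	case 0x001: return FPE_FLTINV;   /* invalid op (incl. stack fault) */
	case 0x002:                      /* denormal operand */
	case 0x010: return FPE_FLTUND;   /* underflow */
	case 0x004: return FPE_FLTDIV;   /* divide by zero */
	case 0x008: return FPE_FLTOVF;   /* overflow */
	case 0x020: return FPE_FLTRES;   /* precision (inexact) */
	default:    return 0;            /* none, or several at once */
	}
}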
time_syscall(t); @@ -129,12 +129,12 @@ static time_t __vsyscall(1) vtime(time_t *t) return __xtime.tv_sec; } -static long __vsyscall(2) venosys_0(void) +long __vsyscall(2) venosys_0(void) { return -ENOSYS; } -static long __vsyscall(3) venosys_1(void) +long __vsyscall(3) venosys_1(void) { return -ENOSYS; }
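/*
 * Sketch (editor's illustration, not part of the patch): the
 * __vsyscall() marker as defined in asm-x86_64/vsyscall.h of this era.
 * Each entry point is pinned into its own section, which the linker
 * script maps to a fixed user-visible address; dropping "static" above
 * gives the functions external linkage, so the compiler cannot inline
 * or discard code that is only ever reached through that mapping.
 */
#define __vsyscall(nr) __attribute__((unused, __section__(".vsyscall_" #nr)))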