diff options
Diffstat (limited to 'arch/i386/kernel')
92 files changed, 4843 insertions, 4353 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 96fb8a020af2..1a884b6e6e5c 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -4,13 +4,13 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ +obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o bootflag.o \ - quirks.o i8237.o topology.o alternative.o + quirks.o i8237.o topology.o alternative.o i8253.o tsc.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ -obj-y += timers/ obj-y += acpi/ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o @@ -37,6 +37,8 @@ obj-$(CONFIG_EFI) += efi.o efi_stub.o obj-$(CONFIG_DOUBLEFAULT) += doublefault.o obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_HPET_TIMER) += hpet.o +obj-$(CONFIG_K8_NB) += k8.o EXTRA_AFLAGS := -traditional @@ -56,7 +58,8 @@ quiet_cmd_syscall = SYSCALL $@ export CPPFLAGS_vsyscall.lds += -P -C -U$(ARCH) -vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 +vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) SYSCFLAGS_vsyscall-sysenter.so = $(vsyscall-flags) SYSCFLAGS_vsyscall-int80.so = $(vsyscall-flags) @@ -76,3 +79,7 @@ SYSCFLAGS_vsyscall-syms.o = -r $(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \ $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE $(call if_changed,syscall) + +k8-y += ../../x86_64/kernel/k8.o +stacktrace-y += ../../x86_64/kernel/stacktrace.o + diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile index 7e9ac99354f4..7f7be01f44e6 100644 --- a/arch/i386/kernel/acpi/Makefile +++ b/arch/i386/kernel/acpi/Makefile @@ -1,5 +1,7 @@ obj-$(CONFIG_ACPI) += boot.o +ifneq ($(CONFIG_PCI),) obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o +endif obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o ifneq ($(CONFIG_ACPI_PROCESSOR),) diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 97ca17189af5..1aaea6ab8c46 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -24,12 +24,14 @@ */ #include <linux/init.h> -#include <linux/config.h> #include <linux/acpi.h> #include <linux/efi.h> +#include <linux/cpumask.h> #include <linux/module.h> #include <linux/dmi.h> #include <linux/irq.h> +#include <linux/bootmem.h> +#include <linux/ioport.h> #include <asm/pgtable.h> #include <asm/io_apic.h> @@ -37,11 +39,17 @@ #include <asm/io.h> #include <asm/mpspec.h> -#ifdef CONFIG_X86_64 +static int __initdata acpi_force = 0; -extern void __init clustered_apic_check(void); +#ifdef CONFIG_ACPI +int acpi_disabled = 0; +#else +int acpi_disabled = 1; +#endif +EXPORT_SYMBOL(acpi_disabled); + +#ifdef CONFIG_X86_64 -extern int gsi_irq_sharing(int gsi); #include <asm/proto.h> static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; } @@ -60,7 +68,7 @@ static inline int gsi_irq_sharing(int gsi) { return gsi; } #define BAD_MADT_ENTRY(entry, end) ( \ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ - ((acpi_table_entry_header *)entry)->length != sizeof(*entry)) + ((acpi_table_entry_header *)entry)->length < sizeof(*entry)) #define PREFIX "ACPI: " @@ -507,16 +515,76 @@ EXPORT_SYMBOL(acpi_register_gsi); #ifdef CONFIG_ACPI_HOTPLUG_CPU int acpi_map_lsapic(acpi_handle handle, int *pcpu) { - /* TBD */ - return -EINVAL; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + struct acpi_table_lapic *lapic; + cpumask_t tmp_map, new_map; + u8 physid; + int cpu; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return -EINVAL; + + if (!buffer.length || !buffer.pointer) + return -EINVAL; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(*lapic)) { + kfree(buffer.pointer); + return -EINVAL; + } + + lapic = (struct acpi_table_lapic *)obj->buffer.pointer; + + if ((lapic->header.type != ACPI_MADT_LAPIC) || + (!lapic->flags.enabled)) { + kfree(buffer.pointer); + return -EINVAL; + } + + physid = lapic->id; + + kfree(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + tmp_map = cpu_present_map; + mp_register_lapic(physid, lapic->flags.enabled); + + /* + * If mp_register_lapic successfully generates a new logical cpu + * number, then the following will get us exactly what was mapped + */ + cpus_andnot(new_map, cpu_present_map, tmp_map); + if (cpus_empty(new_map)) { + printk ("Unable to map lapic to logical cpu number\n"); + return -EINVAL; + } + + cpu = first_cpu(new_map); + + *pcpu = cpu; + return 0; } EXPORT_SYMBOL(acpi_map_lsapic); int acpi_unmap_lsapic(int cpu) { - /* TBD */ - return -EINVAL; + int i; + + for_each_possible_cpu(i) { + if (x86_acpiid_to_apicid[i] == x86_cpu_to_apicid[cpu]) { + x86_acpiid_to_apicid[i] = -1; + break; + } + } + x86_cpu_to_apicid[cpu] = -1; + cpu_clear(cpu, cpu_present_map); + num_processors--; + + return (0); } EXPORT_SYMBOL(acpi_unmap_lsapic); @@ -580,6 +648,8 @@ static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size) static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) { struct acpi_table_hpet *hpet_tbl; + struct resource *hpet_res; + resource_size_t res_start; if (!phys || !size) return -EINVAL; @@ -595,12 +665,26 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) "memory.\n"); return -1; } + +#define HPET_RESOURCE_NAME_SIZE 9 + hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); + if (hpet_res) { + memset(hpet_res, 0, sizeof(*hpet_res)); + hpet_res->name = (void *)&hpet_res[1]; + hpet_res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, + "HPET %u", hpet_tbl->number); + hpet_res->end = (1 * 1024) - 1; + } + #ifdef CONFIG_X86_64 vxtime.hpet_address = hpet_tbl->addr.addrl | ((long)hpet_tbl->addr.addrh << 32); printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, vxtime.hpet_address); + + res_start = vxtime.hpet_address; #else /* X86 */ { extern unsigned long hpet_address; @@ -608,9 +692,17 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) hpet_address = hpet_tbl->addr.addrl; printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", hpet_tbl->id, hpet_address); + + res_start = hpet_address; } #endif /* X86 */ + if (hpet_res) { + hpet_res->start = res_start; + hpet_res->end += res_start; + insert_resource(&iomem_resource, hpet_res); + } + return 0; } #else @@ -861,8 +953,6 @@ static void __init acpi_process_madt(void) return; } -extern int acpi_force; - #ifdef __i386__ static int __init disable_acpi_irq(struct dmi_system_id *d) @@ -1164,3 +1254,75 @@ int __init acpi_boot_init(void) return 0; } + +static int __init parse_acpi(char *arg) +{ + if (!arg) + return -EINVAL; + + /* "acpi=off" disables both ACPI table parsing and interpreter */ + if (strcmp(arg, "off") == 0) { + disable_acpi(); + } + /* acpi=force to over-ride black-list */ + else if (strcmp(arg, "force") == 0) { + acpi_force = 1; + acpi_ht = 1; + acpi_disabled = 0; + } + /* acpi=strict disables out-of-spec workarounds */ + else if (strcmp(arg, "strict") == 0) { + acpi_strict = 1; + } + /* Limit ACPI just to boot-time to enable HT */ + else if (strcmp(arg, "ht") == 0) { + if (!acpi_force) + disable_acpi(); + acpi_ht = 1; + } + /* "acpi=noirq" disables ACPI interrupt routing */ + else if (strcmp(arg, "noirq") == 0) { + acpi_noirq_set(); + } else { + /* Core will printk when we return error. */ + return -EINVAL; + } + return 0; +} +early_param("acpi", parse_acpi); + +/* FIXME: Using pci= for an ACPI parameter is a travesty. */ +static int __init parse_pci(char *arg) +{ + if (arg && strcmp(arg, "noacpi") == 0) + acpi_disable_pci(); + return 0; +} +early_param("pci", parse_pci); + +#ifdef CONFIG_X86_IO_APIC +static int __init parse_acpi_skip_timer_override(char *arg) +{ + acpi_skip_timer_override = 1; + return 0; +} +early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override); +#endif /* CONFIG_X86_IO_APIC */ + +static int __init setup_acpi_sci(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "edge")) + acpi_sci_flags.trigger = 1; + else if (!strcmp(s, "level")) + acpi_sci_flags.trigger = 3; + else if (!strcmp(s, "high")) + acpi_sci_flags.polarity = 1; + else if (!strcmp(s, "low")) + acpi_sci_flags.polarity = 3; + else + return -EINVAL; + return 0; +} +early_param("acpi_sci", setup_acpi_sci); diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index 1649a175a206..fe799b11ac0a 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -48,7 +48,11 @@ void __init check_acpi_pci(void) int num, slot, func; /* Assume the machine supports type 1. If not it will - always read ffffffff and should not have any side effect. */ + always read ffffffff and should not have any side effect. + Actually a few buggy systems can machine check. Allow the user + to disable it by command line option at least -AK */ + if (!early_pci_allowed()) + return; /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S index 9f408eee4e6f..b781b38131c0 100644 --- a/arch/i386/kernel/acpi/wakeup.S +++ b/arch/i386/kernel/acpi/wakeup.S @@ -292,7 +292,10 @@ ENTRY(do_suspend_lowlevel) pushl $3 call acpi_enter_sleep_state addl $4, %esp - ret + +# In case of S3 failure, we'll emerge here. Jump +# to ret_point to recover + jmp ret_point .p2align 4,,7 ret_point: call restore_registers diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 5cbd6f99fb2a..28ab80649764 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -4,27 +4,41 @@ #include <asm/alternative.h> #include <asm/sections.h> -#define DEBUG 0 -#if DEBUG -# define DPRINTK(fmt, args...) printk(fmt, args) -#else -# define DPRINTK(fmt, args...) -#endif +static int no_replacement = 0; +static int smp_alt_once = 0; +static int debug_alternative = 0; + +static int __init noreplacement_setup(char *s) +{ + no_replacement = 1; + return 1; +} +static int __init bootonly(char *str) +{ + smp_alt_once = 1; + return 1; +} +static int __init debug_alt(char *str) +{ + debug_alternative = 1; + return 1; +} + +__setup("noreplacement", noreplacement_setup); +__setup("smp-alt-boot", bootonly); +__setup("debug-alternative", debug_alt); + +#define DPRINTK(fmt, args...) if (debug_alternative) \ + printk(KERN_DEBUG fmt, args) +#ifdef GENERIC_NOP1 /* Use inline assembly to define this because the nops are defined as inline assembly strings in the include files and we cannot get them easily into strings. */ asm("\t.data\nintelnops: " GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 GENERIC_NOP7 GENERIC_NOP8); -asm("\t.data\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); -asm("\t.data\nk7nops: " - K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8); - -extern unsigned char intelnops[], k8nops[], k7nops[]; +extern unsigned char intelnops[]; static unsigned char *intel_nops[ASM_NOP_MAX+1] = { NULL, intelnops, @@ -36,6 +50,13 @@ static unsigned char *intel_nops[ASM_NOP_MAX+1] = { intelnops + 1 + 2 + 3 + 4 + 5 + 6, intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, }; +#endif + +#ifdef K8_NOP1 +asm("\t.data\nk8nops: " + K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 + K8_NOP7 K8_NOP8); +extern unsigned char k8nops[]; static unsigned char *k8_nops[ASM_NOP_MAX+1] = { NULL, k8nops, @@ -47,6 +68,13 @@ static unsigned char *k8_nops[ASM_NOP_MAX+1] = { k8nops + 1 + 2 + 3 + 4 + 5 + 6, k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, }; +#endif + +#ifdef K7_NOP1 +asm("\t.data\nk7nops: " + K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 + K7_NOP7 K7_NOP8); +extern unsigned char k7nops[]; static unsigned char *k7_nops[ASM_NOP_MAX+1] = { NULL, k7nops, @@ -58,6 +86,18 @@ static unsigned char *k7_nops[ASM_NOP_MAX+1] = { k7nops + 1 + 2 + 3 + 4 + 5 + 6, k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, }; +#endif + +#ifdef CONFIG_X86_64 + +extern char __vsyscall_0; +static inline unsigned char** find_nop_table(void) +{ + return k8_nops; +} + +#else /* CONFIG_X86_64 */ + static struct nop { int cpuid; unsigned char **noptable; @@ -67,14 +107,6 @@ static struct nop { { -1, NULL } }; - -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; -extern u8 *__smp_locks[], *__smp_locks_end[]; - -extern u8 __smp_alt_begin[], __smp_alt_end[]; - - static unsigned char** find_nop_table(void) { unsigned char **noptable = intel_nops; @@ -89,6 +121,14 @@ static unsigned char** find_nop_table(void) return noptable; } +#endif /* CONFIG_X86_64 */ + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; +extern u8 *__smp_locks[], *__smp_locks_end[]; + +extern u8 __smp_alt_begin[], __smp_alt_end[]; + /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with self modifying code. This implies that assymetric systems where @@ -99,6 +139,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { unsigned char **noptable = find_nop_table(); struct alt_instr *a; + u8 *instr; int diff, i, k; DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); @@ -106,7 +147,16 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) BUG_ON(a->replacementlen > a->instrlen); if (!boot_cpu_has(a->cpuid)) continue; - memcpy(a->instr, a->replacement, a->replacementlen); + instr = a->instr; +#ifdef CONFIG_X86_64 + /* vsyscall code is not mapped yet. resolve it manually. */ + if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { + instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); + DPRINTK("%s: vsyscall fixup: %p => %p\n", + __FUNCTION__, a->instr, instr); + } +#endif + memcpy(instr, a->replacement, a->replacementlen); diff = a->instrlen - a->replacementlen; /* Pad the rest with nops */ for (i = a->replacementlen; diff > 0; diff -= k, i += k) { @@ -118,6 +168,8 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end) } } +#ifdef CONFIG_SMP + static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; @@ -186,14 +238,6 @@ struct smp_alt_module { static LIST_HEAD(smp_alt_modules); static DEFINE_SPINLOCK(smp_alt); -static int smp_alt_once = 0; -static int __init bootonly(char *str) -{ - smp_alt_once = 1; - return 1; -} -__setup("smp-alt-boot", bootonly); - void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) @@ -201,6 +245,9 @@ void alternatives_smp_module_add(struct module *mod, char *name, struct smp_alt_module *smp; unsigned long flags; + if (no_replacement) + return; + if (smp_alt_once) { if (boot_cpu_has(X86_FEATURE_UP)) alternatives_smp_unlock(locks, locks_end, @@ -235,7 +282,7 @@ void alternatives_smp_module_del(struct module *mod) struct smp_alt_module *item; unsigned long flags; - if (smp_alt_once) + if (no_replacement || smp_alt_once) return; spin_lock_irqsave(&smp_alt, flags); @@ -256,7 +303,17 @@ void alternatives_smp_switch(int smp) struct smp_alt_module *mod; unsigned long flags; - if (smp_alt_once) +#ifdef CONFIG_LOCKDEP + /* + * A not yet fixed binutils section handling bug prevents + * alternatives-replacement from working reliably, so turn + * it off: + */ + printk("lockdep: not fixing up alternatives.\n"); + return; +#endif + + if (no_replacement || smp_alt_once) return; BUG_ON(!smp && (num_online_cpus() > 1)); @@ -283,8 +340,17 @@ void alternatives_smp_switch(int smp) spin_unlock_irqrestore(&smp_alt, flags); } +#endif + void __init alternative_instructions(void) { + if (no_replacement) { + printk(KERN_INFO "(SMP-)alternatives turned off\n"); + free_init_pages("SMP alternatives", + (unsigned long)__smp_alt_begin, + (unsigned long)__smp_alt_end); + return; + } apply_alternatives(__alt_instructions, __alt_instructions_end); /* switch to patch-once-at-boottime-only mode and free the @@ -297,6 +363,7 @@ void __init alternative_instructions(void) smp_alt_once = 1; #endif +#ifdef CONFIG_SMP if (smp_alt_once) { if (1 == num_possible_cpus()) { printk(KERN_INFO "SMP alternatives: switching to UP code\n"); @@ -318,4 +385,5 @@ void __init alternative_instructions(void) _text, _etext); alternatives_smp_switch(0); } +#endif } diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 5ab59c12335b..90faae5c5d30 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -14,7 +14,6 @@ * Mikael Pettersson : PM converted to driver model. */ -#include <linux/config.h> #include <linux/init.h> #include <linux/mm.h> @@ -36,6 +35,7 @@ #include <asm/arch_hooks.h> #include <asm/hpet.h> #include <asm/i8253.h> +#include <asm/nmi.h> #include <mach_apic.h> #include <mach_apicdef.h> @@ -52,7 +52,18 @@ static cpumask_t timer_bcast_ipi; /* * Knob to control our willingness to enable the local APIC. */ -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ +static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ + +static inline void lapic_disable(void) +{ + enable_local_apic = -1; + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); +} + +static inline void lapic_enable(void) +{ + enable_local_apic = 1; +} /* * Debug level @@ -156,7 +167,7 @@ void clear_local_APIC(void) maxlvt = get_maxlvt(); /* - * Masking an LVT entry on a P6 can trigger a local APIC error + * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. */ if (maxlvt >= 3) { @@ -586,8 +597,7 @@ void __devinit setup_local_APIC(void) printk("No ESR for 82489DX.\n"); } - if (nmi_watchdog == NMI_LOCAL_APIC) - setup_apic_nmi_watchdog(); + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } @@ -1117,7 +1127,18 @@ void disable_APIC_timer(void) unsigned long v; v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + /* + * When an illegal vector value (0-15) is written to an LVT + * entry and delivery mode is Fixed, the APIC may signal an + * illegal vector error, with out regard to whether the mask + * bit is set or whether an interrupt is actually seen on input. + * + * Boot sequence might call this function when the LVTT has + * '0' vector value. So make sure vector field is set to + * valid value. + */ + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, v); } } @@ -1362,3 +1383,18 @@ int __init APIC_init_uniprocessor (void) return 0; } + +static int __init parse_lapic(char *arg) +{ + lapic_enable(); + return 0; +} +early_param("lapic", parse_lapic); + +static int __init parse_nolapic(char *arg) +{ + lapic_disable(); + return 0; +} +early_param("nolapic", parse_nolapic); + diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 9e819eb68229..b42f2d914af3 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -201,7 +201,6 @@ * http://www.microsoft.com/hwdev/busbios/amp_12.htm] */ -#include <linux/config.h> #include <linux/module.h> #include <linux/poll.h> @@ -226,6 +225,7 @@ #include <linux/smp_lock.h> #include <linux/dmi.h> #include <linux/suspend.h> +#include <linux/kthread.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -403,8 +403,6 @@ static int realmode_power_off = 1; #else static int realmode_power_off; #endif -static int exit_kapmd __read_mostly; -static int kapmd_running __read_mostly; #ifdef CONFIG_APM_ALLOW_INTS static int allow_ints = 1; #else @@ -420,6 +418,8 @@ static const struct desc_struct bad_bios_desc = { 0, 0x00409200 }; static const char driver_version[] = "1.16ac"; /* no spaces */ +static struct task_struct *kapmd_task; + /* * APM event names taken from the APM 1.2 specification. These are * the message codes that the BIOS uses to tell us about events @@ -764,9 +764,9 @@ static int apm_do_idle(void) int idled = 0; int polling; - polling = test_thread_flag(TIF_POLLING_NRFLAG); + polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); } if (!need_resched()) { @@ -774,7 +774,7 @@ static int apm_do_idle(void) ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); } if (polling) - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; if (!idled) return 0; @@ -1155,9 +1155,11 @@ out: static void set_time(void) { + struct timespec ts; if (got_clock_diff) { /* Must know time zone in order to set clock */ - xtime.tv_sec = get_cmos_time() + clock_cmos_diff; - xtime.tv_nsec = 0; + ts.tv_sec = get_cmos_time() + clock_cmos_diff; + ts.tv_nsec = 0; + do_settimeofday(&ts); } } @@ -1233,13 +1235,8 @@ static int suspend(int vetoable) restore_processor_state(); local_irq_disable(); - write_seqlock(&xtime_lock); - spin_lock(&i8253_lock); - reinit_timer(); set_time(); - - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); + reinit_timer(); if (err == APM_NO_ERROR) err = APM_SUCCESS; @@ -1366,9 +1363,7 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - write_seqlock_irq(&xtime_lock); set_time(); - write_sequnlock_irq(&xtime_lock); device_resume(); pm_send_all(PM_RESUME, (void *)0); queue_event(event, NULL); @@ -1384,9 +1379,7 @@ static void check_events(void) break; case APM_UPDATE_TIME: - write_seqlock_irq(&xtime_lock); set_time(); - write_sequnlock_irq(&xtime_lock); break; case APM_CRITICAL_SUSPEND: @@ -1431,7 +1424,7 @@ static void apm_mainloop(void) set_current_state(TASK_INTERRUPTIBLE); for (;;) { schedule_timeout(APM_CHECK_TIMEOUT); - if (exit_kapmd) + if (kthread_should_stop()) break; /* * Ok, check all events, check for idle (and mark us sleeping @@ -1714,12 +1707,6 @@ static int apm(void *unused) char * power_stat; char * bat_stat; - kapmd_running = 1; - - daemonize("kapmd"); - - current->flags |= PF_NOFREEZE; - #ifdef CONFIG_SMP /* 2002/08/01 - WT * This is to avoid random crashes at boot time during initialization @@ -1829,7 +1816,6 @@ static int apm(void *unused) console_blank_hook = NULL; #endif } - kapmd_running = 0; return 0; } @@ -2228,7 +2214,7 @@ static int __init apm_init(void) { struct proc_dir_entry *apm_proc; struct desc_struct *gdt; - int ret; + int err; dmi_check_system(apm_dmi_table); @@ -2337,11 +2323,17 @@ static int __init apm_init(void) if (apm_proc) apm_proc->owner = THIS_MODULE; - ret = kernel_thread(apm, NULL, CLONE_KERNEL | SIGCHLD); - if (ret < 0) { - printk(KERN_ERR "apm: disabled - Unable to start kernel thread.\n"); - return -ENOMEM; + kapmd_task = kthread_create(apm, NULL, "kapmd"); + if (IS_ERR(kapmd_task)) { + printk(KERN_ERR "apm: disabled - Unable to start kernel " + "thread.\n"); + err = PTR_ERR(kapmd_task); + kapmd_task = NULL; + remove_proc_entry("apm", NULL); + return err; } + kapmd_task->flags |= PF_NOFREEZE; + wake_up_process(kapmd_task); if (num_online_cpus() > 1 && !smp ) { printk(KERN_NOTICE @@ -2349,7 +2341,13 @@ static int __init apm_init(void) return 0; } - misc_register(&apm_device); + /* + * Note we don't actually care if the misc_device cannot be registered. + * this driver can do its job without it, even if userspace can't + * control it. just log the error + */ + if (misc_register(&apm_device)) + printk(KERN_WARNING "apm: Could not register misc device.\n"); if (HZ != 100) idle_period = (idle_period * HZ) / 100; @@ -2385,9 +2383,10 @@ static void __exit apm_exit(void) remove_proc_entry("apm", NULL); if (power_off) pm_power_off = NULL; - exit_kapmd = 1; - while (kapmd_running) - schedule(); + if (kapmd_task) { + kthread_stop(kapmd_task); + kapmd_task = NULL; + } #ifdef CONFIG_PM_LEGACY pm_active = 0; #endif diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c index 36d66e2077d0..c80271f8f084 100644 --- a/arch/i386/kernel/asm-offsets.c +++ b/arch/i386/kernel/asm-offsets.c @@ -4,6 +4,7 @@ * to extract and format the required data. */ +#include <linux/crypto.h> #include <linux/sched.h> #include <linux/signal.h> #include <linux/personality.h> @@ -13,6 +14,7 @@ #include <asm/fixmap.h> #include <asm/processor.h> #include <asm/thread_info.h> +#include <asm/elf.h> #define DEFINE(sym, val) \ asm volatile("\n->" #sym " %0 " #val : : "i" (val)) @@ -53,6 +55,7 @@ void foo(void) OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); OFFSET(TI_restart_block, thread_info, restart_block); + OFFSET(TI_sysenter_return, thread_info, sysenter_return); BLANK(); OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); @@ -68,5 +71,7 @@ void foo(void) sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); - DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL)); + DEFINE(VDSO_PRELINK, VDSO_PRELINK); + + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); } diff --git a/arch/i386/kernel/bootflag.c b/arch/i386/kernel/bootflag.c index 4c30ed01f4e1..0b9860530a6b 100644 --- a/arch/i386/kernel/bootflag.c +++ b/arch/i386/kernel/bootflag.c @@ -3,7 +3,6 @@ */ -#include <linux/config.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 786d1a57048b..e4758095d87a 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -22,7 +22,7 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -static void __init init_amd(struct cpuinfo_x86 *c) +static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; int mbytes = num_physpages >> (20-PAGE_SHIFT); @@ -224,25 +224,29 @@ static void __init init_amd(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_HT /* - * On a AMD dual core setup the lower bits of the APIC id - * distingush the cores. Assumes number of cores is a power - * of two. + * On a AMD multi core setup the lower bits of the APIC id + * distingush the cores. */ if (c->x86_max_cores > 1) { int cpu = smp_processor_id(); - unsigned bits = 0; - while ((1 << bits) < c->x86_max_cores) - bits++; - cpu_core_id[cpu] = phys_proc_id[cpu] & ((1<<bits)-1); - phys_proc_id[cpu] >>= bits; + unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf; + + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1); + c->phys_proc_id >>= bits; printk(KERN_INFO "CPU %d(%d) -> Core %d\n", - cpu, c->x86_max_cores, cpu_core_id[cpu]); + cpu, c->x86_max_cores, c->cpu_core_id); } #endif + if (cpuid_eax(0x80000000) >= 0x80000006) + num_cache_leaves = 3; } -static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) +static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) { /* AMD errata T13 (order #21922) */ if ((c->x86 == 6)) { @@ -255,7 +259,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) return size; } -static struct cpu_dev amd_cpu_dev __initdata = { +static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, .c_models = { @@ -271,7 +275,6 @@ static struct cpu_dev amd_cpu_dev __initdata = { }, }, .c_init = init_amd, - .c_identify = generic_identify, .c_size_cache = amd_size_cache, }; diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c index bd75629dd262..8c25047975c0 100644 --- a/arch/i386/kernel/cpu/centaur.c +++ b/arch/i386/kernel/cpu/centaur.c @@ -9,7 +9,7 @@ #ifdef CONFIG_X86_OOSTORE -static u32 __init power2(u32 x) +static u32 __cpuinit power2(u32 x) { u32 s=1; while(s<=x) @@ -22,7 +22,7 @@ static u32 __init power2(u32 x) * Set up an actual MCR */ -static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key) +static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key) { u32 lo, hi; @@ -40,7 +40,7 @@ static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key) * Shortcut: We know you can't put 4Gig of RAM on a winchip */ -static u32 __init ramtop(void) /* 16388 */ +static u32 __cpuinit ramtop(void) /* 16388 */ { int i; u32 top = 0; @@ -91,7 +91,7 @@ static u32 __init ramtop(void) /* 16388 */ * Compute a set of MCR's to give maximum coverage */ -static int __init centaur_mcr_compute(int nr, int key) +static int __cpuinit centaur_mcr_compute(int nr, int key) { u32 mem = ramtop(); u32 root = power2(mem); @@ -166,7 +166,7 @@ static int __init centaur_mcr_compute(int nr, int key) return ct; } -static void __init centaur_create_optimal_mcr(void) +static void __cpuinit centaur_create_optimal_mcr(void) { int i; /* @@ -189,7 +189,7 @@ static void __init centaur_create_optimal_mcr(void) wrmsr(MSR_IDT_MCR0+i, 0, 0); } -static void __init winchip2_create_optimal_mcr(void) +static void __cpuinit winchip2_create_optimal_mcr(void) { u32 lo, hi; int i; @@ -227,7 +227,7 @@ static void __init winchip2_create_optimal_mcr(void) * Handle the MCR key on the Winchip 2. */ -static void __init winchip2_unprotect_mcr(void) +static void __cpuinit winchip2_unprotect_mcr(void) { u32 lo, hi; u32 key; @@ -239,7 +239,7 @@ static void __init winchip2_unprotect_mcr(void) wrmsr(MSR_IDT_MCR_CTRL, lo, hi); } -static void __init winchip2_protect_mcr(void) +static void __cpuinit winchip2_protect_mcr(void) { u32 lo, hi; @@ -257,7 +257,7 @@ static void __init winchip2_protect_mcr(void) #define RNG_ENABLED (1 << 3) #define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */ -static void __init init_c3(struct cpuinfo_x86 *c) +static void __cpuinit init_c3(struct cpuinfo_x86 *c) { u32 lo, hi; @@ -303,7 +303,7 @@ static void __init init_c3(struct cpuinfo_x86 *c) display_cacheinfo(c); } -static void __init init_centaur(struct cpuinfo_x86 *c) +static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { enum { ECX8=1<<1, @@ -442,7 +442,7 @@ static void __init init_centaur(struct cpuinfo_x86 *c) } } -static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size) +static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size) { /* VIA C3 CPUs (670-68F) need further shifting. */ if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8))) @@ -457,7 +457,7 @@ static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size return size; } -static struct cpu_dev centaur_cpu_dev __initdata = { +static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, .c_init = init_centaur, diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index 44f2c5f2dda1..2799baaadf45 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -36,7 +36,7 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; extern int disable_pse; -static void default_init(struct cpuinfo_x86 * c) +static void __cpuinit default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... */ /* Check if at least it has cpuid */ @@ -49,7 +49,7 @@ static void default_init(struct cpuinfo_x86 * c) } } -static struct cpu_dev default_cpu = { +static struct cpu_dev __cpuinitdata default_cpu = { .c_init = default_init, .c_vendor = "Unknown", }; @@ -265,7 +265,7 @@ static void __init early_cpu_detect(void) } } -void __cpuinit generic_identify(struct cpuinfo_x86 * c) +static void __cpuinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; int ebx; @@ -294,7 +294,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c) if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; c->x86_mask = tfms & 15; -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0); #else c->apicid = (ebx >> 24) & 0xFF; @@ -319,7 +319,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c) early_intel_workaround(c); #ifdef CONFIG_X86_HT - phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff; + c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif } @@ -477,11 +477,9 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; int index_msb, core_bits; - int cpu = smp_processor_id(); cpuid(1, &eax, &ebx, &ecx, &edx); - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) return; @@ -492,16 +490,17 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) } else if (smp_num_siblings > 1 ) { if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); + printk(KERN_WARNING "CPU: Unsupported number of the " + "siblings %d", smp_num_siblings); smp_num_siblings = 1; return; } index_msb = get_count_order(smp_num_siblings); - phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); + c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - phys_proc_id[cpu]); + c->phys_proc_id); smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -509,12 +508,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) core_bits = get_count_order(c->x86_max_cores); - cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & + c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) & ((1 << core_bits) - 1); if (c->x86_max_cores > 1) printk(KERN_INFO "CPU: Processor Core ID: %d\n", - cpu_core_id[cpu]); + c->cpu_core_id); } } #endif @@ -613,6 +612,12 @@ void __cpuinit cpu_init(void) set_in_cr4(X86_CR4_TSD); } + /* The CPU hotplug case */ + if (cpu_gdt_descr->address) { + gdt = (struct desc_struct *)cpu_gdt_descr->address; + memset(gdt, 0, PAGE_SIZE); + goto old_gdt; + } /* * This is a horrible hack to allocate the GDT. The problem * is that cpu_init() is called really early for the boot CPU @@ -631,7 +636,7 @@ void __cpuinit cpu_init(void) local_irq_enable(); } } - +old_gdt: /* * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: @@ -670,7 +675,7 @@ void __cpuinit cpu_init(void) #endif /* Clear %fs and %gs. */ - asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); diff --git a/arch/i386/kernel/cpu/cpu.h b/arch/i386/kernel/cpu/cpu.h index 5a1d4f163e84..2f6432cef6ff 100644 --- a/arch/i386/kernel/cpu/cpu.h +++ b/arch/i386/kernel/cpu/cpu.h @@ -24,7 +24,5 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); -extern void generic_identify(struct cpuinfo_x86 * c); - extern void early_intel_workaround(struct cpuinfo_x86 *c); diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig index e44a4c6a4fe5..ccc1edff5c97 100644 --- a/arch/i386/kernel/cpu/cpufreq/Kconfig +++ b/arch/i386/kernel/cpu/cpufreq/Kconfig @@ -96,6 +96,7 @@ config X86_POWERNOW_K8_ACPI config X86_GX_SUSPMOD tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" + depends on PCI help This add the CPUFreq driver for NatSemi Geode processors which support suspend modulation. @@ -202,7 +203,7 @@ config X86_LONGRUN config X86_LONGHAUL tristate "VIA Cyrix III Longhaul" select CPU_FREQ_TABLE - depends on BROKEN + depends on ACPI_PROCESSOR help This adds the CPUFreq driver for VIA Samuel/CyrixIII, VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c index 05668e3598c0..ea19d091fd41 100644 --- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -24,7 +24,6 @@ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> @@ -33,6 +32,7 @@ #include <linux/seq_file.h> #include <linux/compiler.h> #include <linux/sched.h> /* current */ +#include <linux/dmi.h> #include <asm/io.h> #include <asm/delay.h> #include <asm/uaccess.h> @@ -371,11 +371,11 @@ static int acpi_cpufreq_early_init_acpi(void) dprintk("acpi_cpufreq_early_init\n"); - for_each_cpu(i) { + for_each_possible_cpu(i) { data = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL); if (!data) { - for_each_cpu(j) { + for_each_possible_cpu(j) { kfree(acpi_perf_data[j]); acpi_perf_data[j] = NULL; } @@ -385,10 +385,36 @@ static int acpi_cpufreq_early_init_acpi(void) } /* Do initialization in ACPI core */ - acpi_processor_preregister_performance(acpi_perf_data); + return acpi_processor_preregister_performance(acpi_perf_data); +} + +/* + * Some BIOSes do SW_ANY coordination internally, either set it up in hw + * or do it in BIOS firmware and won't inform about it to OS. If not + * detected, this has a side effect of making CPU run at a different speed + * than OS intended it to run at. Detect it and handle it cleanly. + */ +static int bios_with_sw_any_bug; + +static int __init sw_any_bug_found(struct dmi_system_id *d) +{ + bios_with_sw_any_bug = 1; return 0; } +static struct dmi_system_id __initdata sw_any_bug_dmi_table[] = { + { + .callback = sw_any_bug_found, + .ident = "Supermicro Server X6DLP", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), + DMI_MATCH(DMI_BIOS_VERSION, "080010"), + DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), + }, + }, + { } +}; + static int acpi_cpufreq_cpu_init ( struct cpufreq_policy *policy) @@ -418,8 +444,23 @@ acpi_cpufreq_cpu_init ( goto err_free; perf = data->acpi_data; - policy->cpus = perf->shared_cpu_map; policy->shared_type = perf->shared_type; + /* + * Will let policy->cpus know about dependency only when software + * coordination is required. + */ + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || + policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { + policy->cpus = perf->shared_cpu_map; + } + +#ifdef CONFIG_SMP + dmi_check_system(sw_any_bug_dmi_table); + if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) { + policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; + policy->cpus = cpu_core_map[cpu]; + } +#endif if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; @@ -563,16 +604,11 @@ static struct cpufreq_driver acpi_cpufreq_driver = { static int __init acpi_cpufreq_init (void) { - int result = 0; - dprintk("acpi_cpufreq_init\n"); - result = acpi_cpufreq_early_init_acpi(); + acpi_cpufreq_early_init_acpi(); - if (!result) - result = cpufreq_register_driver(&acpi_cpufreq_driver); - - return (result); + return cpufreq_register_driver(&acpi_cpufreq_driver); } @@ -584,7 +620,7 @@ acpi_cpufreq_exit (void) cpufreq_unregister_driver(&acpi_cpufreq_driver); - for_each_cpu(i) { + for_each_possible_cpu(i) { kfree(acpi_perf_data[i]); acpi_perf_data[i] = NULL; } diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c index 146f607e9c44..f5cc9f5c9bab 100644 --- a/arch/i386/kernel/cpu/cpufreq/longhaul.c +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c @@ -27,13 +27,16 @@ #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/cpufreq.h> +#include <linux/pci.h> #include <linux/slab.h> #include <linux/string.h> -#include <linux/pci.h> #include <asm/msr.h> #include <asm/timex.h> #include <asm/io.h> +#include <asm/acpi.h> +#include <linux/acpi.h> +#include <acpi/processor.h> #include "longhaul.h" @@ -50,16 +53,26 @@ #define CPU_NEHEMIAH 5 static int cpu_model; -static unsigned int numscales=16, numvscales; +static unsigned int numscales=16; static unsigned int fsb; -static int minvid, maxvid; + +static struct mV_pos *vrm_mV_table; +static unsigned char *mV_vrm_table; +struct f_msr { + unsigned char vrm; +}; +static struct f_msr f_msr_table[32]; + +static unsigned int highest_speed, lowest_speed; /* kHz */ static unsigned int minmult, maxmult; static int can_scale_voltage; -static int vrmrev; +static struct acpi_processor *pr = NULL; +static struct acpi_processor_cx *cx = NULL; +static int port22_en; /* Module parameters */ -static int dont_scale_voltage; - +static int scale_voltage; +static int ignore_latency; #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg) @@ -67,7 +80,6 @@ static int dont_scale_voltage; /* Clock ratios multiplied by 10 */ static int clock_ratio[32]; static int eblcr_table[32]; -static int voltage_table[32]; static unsigned int highest_speed, lowest_speed; /* kHz */ static int longhaul_version; static struct cpufreq_frequency_table *longhaul_table; @@ -118,84 +130,67 @@ static int longhaul_get_cpu_mult(void) return eblcr_table[invalue]; } +/* For processor with BCR2 MSR */ -static void do_powersaver(union msr_longhaul *longhaul, - unsigned int clock_ratio_index) +static void do_longhaul1(unsigned int clock_ratio_index) { - struct pci_dev *dev; - unsigned long flags; - unsigned int tmp_mask; - int version; - int i; - u16 pci_cmd; - u16 cmd_state[64]; - - switch (cpu_model) { - case CPU_EZRA_T: - version = 3; - break; - case CPU_NEHEMIAH: - version = 0xf; - break; - default: - return; - } - - rdmsrl(MSR_VIA_LONGHAUL, longhaul->val); - longhaul->bits.SoftBusRatio = clock_ratio_index & 0xf; - longhaul->bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4; - longhaul->bits.EnableSoftBusRatio = 1; - longhaul->bits.RevisionKey = 0; - - preempt_disable(); - local_irq_save(flags); - - /* - * get current pci bus master state for all devices - * and clear bus master bit - */ - dev = NULL; - i = 0; - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (dev != NULL) { - pci_read_config_word(dev, PCI_COMMAND, &pci_cmd); - cmd_state[i++] = pci_cmd; - pci_cmd &= ~PCI_COMMAND_MASTER; - pci_write_config_word(dev, PCI_COMMAND, pci_cmd); - } - } while (dev != NULL); + union msr_bcr2 bcr2; - tmp_mask=inb(0x21); /* works on C3. save mask. */ - outb(0xFE,0x21); /* TMR0 only */ - outb(0xFF,0x80); /* delay */ + rdmsrl(MSR_VIA_BCR2, bcr2.val); + /* Enable software clock multiplier */ + bcr2.bits.ESOFTBF = 1; + bcr2.bits.CLOCKMUL = clock_ratio_index; + /* Sync to timer tick */ safe_halt(); - wrmsrl(MSR_VIA_LONGHAUL, longhaul->val); + /* Change frequency on next halt or sleep */ + wrmsrl(MSR_VIA_BCR2, bcr2.val); + /* Invoke transition */ + ACPI_FLUSH_CPU_CACHE(); halt(); + /* Disable software clock multiplier */ local_irq_disable(); + rdmsrl(MSR_VIA_BCR2, bcr2.val); + bcr2.bits.ESOFTBF = 0; + wrmsrl(MSR_VIA_BCR2, bcr2.val); +} - outb(tmp_mask,0x21); /* restore mask */ +/* For processor with Longhaul MSR */ - /* restore pci bus master state for all devices */ - dev = NULL; - i = 0; - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (dev != NULL) { - pci_cmd = cmd_state[i++]; - pci_write_config_byte(dev, PCI_COMMAND, pci_cmd); - } - } while (dev != NULL); - local_irq_restore(flags); - preempt_enable(); +static void do_powersaver(int cx_address, unsigned int clock_ratio_index) +{ + union msr_longhaul longhaul; + u32 t; + + rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); + longhaul.bits.RevisionKey = longhaul.bits.RevisionID; + longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf; + longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4; + longhaul.bits.EnableSoftBusRatio = 1; + + if (can_scale_voltage) { + longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm; + longhaul.bits.EnableSoftVID = 1; + } - /* disable bus ratio bit */ - rdmsrl(MSR_VIA_LONGHAUL, longhaul->val); - longhaul->bits.EnableSoftBusRatio = 0; - longhaul->bits.RevisionKey = version; - wrmsrl(MSR_VIA_LONGHAUL, longhaul->val); + /* Sync to timer tick */ + safe_halt(); + /* Change frequency on next halt or sleep */ + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 read */ + t = inl(acpi_fadt.xpm_tmr_blk.address); + + /* Disable bus ratio bit */ + local_irq_disable(); + longhaul.bits.RevisionKey = longhaul.bits.RevisionID; + longhaul.bits.EnableSoftBusRatio = 0; + longhaul.bits.EnableSoftBSEL = 0; + longhaul.bits.EnableSoftVID = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); } /** @@ -209,9 +204,9 @@ static void longhaul_setstate(unsigned int clock_ratio_index) { int speed, mult; struct cpufreq_freqs freqs; - union msr_longhaul longhaul; - union msr_bcr2 bcr2; static unsigned int old_ratio=-1; + unsigned long flags; + unsigned int pic1_mask, pic2_mask; if (old_ratio == clock_ratio_index) return; @@ -234,6 +229,23 @@ static void longhaul_setstate(unsigned int clock_ratio_index) dprintk ("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", fsb, mult/10, mult%10, print_speed(speed/1000)); + preempt_disable(); + local_irq_save(flags); + + pic2_mask = inb(0xA1); + pic1_mask = inb(0x21); /* works on C3. save mask. */ + outb(0xFF,0xA1); /* Overkill */ + outb(0xFE,0x21); /* TMR0 only */ + + if (pr->flags.bm_control) { + /* Disable bus master arbitration */ + acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1, + ACPI_MTX_DO_NOT_LOCK); + } else if (port22_en) { + /* Disable AGP and PCI arbiters */ + outb(3, 0x22); + } + switch (longhaul_version) { /* @@ -245,20 +257,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index) */ case TYPE_LONGHAUL_V1: case TYPE_LONGHAUL_V2: - rdmsrl (MSR_VIA_BCR2, bcr2.val); - /* Enable software clock multiplier */ - bcr2.bits.ESOFTBF = 1; - bcr2.bits.CLOCKMUL = clock_ratio_index; - local_irq_disable(); - wrmsrl (MSR_VIA_BCR2, bcr2.val); - safe_halt(); - - /* Disable software clock multiplier */ - rdmsrl (MSR_VIA_BCR2, bcr2.val); - bcr2.bits.ESOFTBF = 0; - local_irq_disable(); - wrmsrl (MSR_VIA_BCR2, bcr2.val); - local_irq_enable(); + do_longhaul1(clock_ratio_index); break; /* @@ -273,10 +272,28 @@ static void longhaul_setstate(unsigned int clock_ratio_index) * to work in practice. */ case TYPE_POWERSAVER: - do_powersaver(&longhaul, clock_ratio_index); + /* Don't allow wakeup */ + acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0, + ACPI_MTX_DO_NOT_LOCK); + do_powersaver(cx->address, clock_ratio_index); break; } + if (pr->flags.bm_control) { + /* Enable bus master arbitration */ + acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0, + ACPI_MTX_DO_NOT_LOCK); + } else if (port22_en) { + /* Enable arbiters */ + outb(0, 0x22); + } + + outb(pic2_mask,0xA1); /* restore mask */ + outb(pic1_mask,0x21); + + local_irq_restore(flags); + preempt_enable(); + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -324,9 +341,11 @@ static int guess_fsb(void) static int __init longhaul_get_ranges(void) { unsigned long invalue; - unsigned int multipliers[32]= { - 50,30,40,100,55,35,45,95,90,70,80,60,120,75,85,65, - -1,110,120,-1,135,115,125,105,130,150,160,140,-1,155,-1,145 }; + unsigned int ezra_t_multipliers[32]= { + 90, 30, 40, 100, 55, 35, 45, 95, + 50, 70, 80, 60, 120, 75, 85, 65, + -1, 110, 120, -1, 135, 115, 125, 105, + 130, 150, 160, 140, -1, 155, -1, 145 }; unsigned int j, k = 0; union msr_longhaul longhaul; unsigned long lo, hi; @@ -355,13 +374,13 @@ static int __init longhaul_get_ranges(void) invalue = longhaul.bits.MaxMHzBR; if (longhaul.bits.MaxMHzBR4) invalue += 16; - maxmult=multipliers[invalue]; + maxmult=ezra_t_multipliers[invalue]; invalue = longhaul.bits.MinMHzBR; if (longhaul.bits.MinMHzBR4 == 1) minmult = 30; else - minmult = multipliers[invalue]; + minmult = ezra_t_multipliers[invalue]; fsb = eblcr_fsb_table_v2[longhaul.bits.MaxMHzFSB]; break; } @@ -446,53 +465,57 @@ static int __init longhaul_get_ranges(void) static void __init longhaul_setup_voltagescaling(void) { union msr_longhaul longhaul; + struct mV_pos minvid, maxvid; + unsigned int j, speed, pos, kHz_step, numvscales; - rdmsrl (MSR_VIA_LONGHAUL, longhaul.val); - - if (!(longhaul.bits.RevisionID & 1)) + rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); + if (!(longhaul.bits.RevisionID & 1)) { + printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n"); return; + } + + if (!longhaul.bits.VRMRev) { + printk (KERN_INFO PFX "VRM 8.5\n"); + vrm_mV_table = &vrm85_mV[0]; + mV_vrm_table = &mV_vrm85[0]; + } else { + printk (KERN_INFO PFX "Mobile VRM\n"); + vrm_mV_table = &mobilevrm_mV[0]; + mV_vrm_table = &mV_mobilevrm[0]; + } - minvid = longhaul.bits.MinimumVID; - maxvid = longhaul.bits.MaximumVID; - vrmrev = longhaul.bits.VRMRev; + minvid = vrm_mV_table[longhaul.bits.MinimumVID]; + maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; + numvscales = maxvid.pos - minvid.pos + 1; + kHz_step = (highest_speed - lowest_speed) / numvscales; - if (minvid == 0 || maxvid == 0) { + if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " "Voltage scaling disabled.\n", - minvid/1000, minvid%1000, maxvid/1000, maxvid%1000); + minvid.mV/1000, minvid.mV%1000, maxvid.mV/1000, maxvid.mV%1000); return; } - if (minvid == maxvid) { + if (minvid.mV == maxvid.mV) { printk (KERN_INFO PFX "Claims to support voltage scaling but min & max are " "both %d.%03d. Voltage scaling disabled\n", - maxvid/1000, maxvid%1000); + maxvid.mV/1000, maxvid.mV%1000); return; } - if (vrmrev==0) { - dprintk ("VRM 8.5\n"); - memcpy (voltage_table, vrm85scales, sizeof(voltage_table)); - numvscales = (voltage_table[maxvid]-voltage_table[minvid])/25; - } else { - dprintk ("Mobile VRM\n"); - memcpy (voltage_table, mobilevrmscales, sizeof(voltage_table)); - numvscales = (voltage_table[maxvid]-voltage_table[minvid])/5; + printk(KERN_INFO PFX "Max VID=%d.%03d Min VID=%d.%03d, %d possible voltage scales\n", + maxvid.mV/1000, maxvid.mV%1000, + minvid.mV/1000, minvid.mV%1000, + numvscales); + + j = 0; + while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { + speed = longhaul_table[j].frequency; + pos = (speed - lowest_speed) / kHz_step + minvid.pos; + f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos]; + j++; } - /* Current voltage isn't readable at first, so we need to - set it to a known value. The spec says to use maxvid */ - longhaul.bits.RevisionKey = longhaul.bits.RevisionID; /* FIXME: This is bad. */ - longhaul.bits.EnableSoftVID = 1; - longhaul.bits.SoftVID = maxvid; - wrmsrl (MSR_VIA_LONGHAUL, longhaul.val); - - minvid = voltage_table[minvid]; - maxvid = voltage_table[maxvid]; - - dprintk ("Min VID=%d.%03d Max VID=%d.%03d, %d possible voltage scales\n", - maxvid/1000, maxvid%1000, minvid/1000, minvid%1000, numvscales); - can_scale_voltage = 1; } @@ -527,6 +550,38 @@ static unsigned int longhaul_get(unsigned int cpu) return calc_speed(longhaul_get_cpu_mult()); } +static acpi_status longhaul_walk_callback(acpi_handle obj_handle, + u32 nesting_level, + void *context, void **return_value) +{ + struct acpi_device *d; + + if ( acpi_bus_get_device(obj_handle, &d) ) { + return 0; + } + *return_value = (void *)acpi_driver_data(d); + return 1; +} + +/* VIA don't support PM2 reg, but have something similar */ +static int enable_arbiter_disable(void) +{ + struct pci_dev *dev; + u8 pci_cmd; + + /* Find PLE133 host bridge */ + dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, NULL); + if (dev != NULL) { + /* Enable access to port 0x22 */ + pci_read_config_byte(dev, 0x78, &pci_cmd); + if ( !(pci_cmd & 1<<7) ) { + pci_cmd |= 1<<7; + pci_write_config_byte(dev, 0x78, pci_cmd); + } + return 1; + } + return 0; +} static int __init longhaul_cpu_init(struct cpufreq_policy *policy) { @@ -534,6 +589,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) char *cpuname=NULL; int ret; + /* Check what we have on this motherboard */ switch (c->x86_model) { case 6: cpu_model = CPU_SAMUEL; @@ -615,12 +671,36 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) break; }; + /* Find ACPI data for processor */ + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, + &longhaul_walk_callback, NULL, (void *)&pr); + if (pr == NULL) + goto err_acpi; + + if (longhaul_version == TYPE_POWERSAVER) { + /* Check ACPI support for C3 state */ + cx = &pr->power.states[ACPI_STATE_C3]; + if (cx->address == 0 || + (cx->latency > 1000 && ignore_latency == 0) ) + goto err_acpi; + + } else { + /* Check ACPI support for bus master arbiter disable */ + if (!pr->flags.bm_control) { + if (!enable_arbiter_disable()) { + printk(KERN_ERR PFX "No ACPI support. No VT8601 host bridge. Aborting.\n"); + return -ENODEV; + } else + port22_en = 1; + } + } + ret = longhaul_get_ranges(); if (ret != 0) return ret; if ((longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) && - (dont_scale_voltage==0)) + (scale_voltage != 0)) longhaul_setup_voltagescaling(); policy->governor = CPUFREQ_DEFAULT_GOVERNOR; @@ -634,6 +714,10 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu); return 0; + +err_acpi: + printk(KERN_ERR PFX "No ACPI support for CPU frequency changes.\n"); + return -ENODEV; } static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy) @@ -666,6 +750,18 @@ static int __init longhaul_init(void) if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6) return -ENODEV; +#ifdef CONFIG_SMP + if (num_online_cpus() > 1) { + return -ENODEV; + printk(KERN_ERR PFX "More than 1 CPU detected, longhaul disabled.\n"); + } +#endif +#ifdef CONFIG_X86_IO_APIC + if (cpu_has_apic) { + printk(KERN_ERR PFX "APIC detected. Longhaul is currently broken in this configuration.\n"); + return -ENODEV; + } +#endif switch (c->x86_model) { case 6 ... 9: return cpufreq_register_driver(&longhaul_driver); @@ -692,13 +788,14 @@ static void __exit longhaul_exit(void) kfree(longhaul_table); } -module_param (dont_scale_voltage, int, 0644); -MODULE_PARM_DESC(dont_scale_voltage, "Don't scale voltage of processor"); +module_param (scale_voltage, int, 0644); +MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); +module_param(ignore_latency, int, 0644); +MODULE_PARM_DESC(ignore_latency, "Skip ACPI C3 latency test"); MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); MODULE_LICENSE ("GPL"); -module_init(longhaul_init); +late_initcall(longhaul_init); module_exit(longhaul_exit); - diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h index d3a95d77ee85..bc4682aad69b 100644 --- a/arch/i386/kernel/cpu/cpufreq/longhaul.h +++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h @@ -450,17 +450,45 @@ static int __initdata nehemiah_c_eblcr[32] = { * Voltage scales. Div/Mod by 1000 to get actual voltage. * Which scale to use depends on the VRM type in use. */ -static int __initdata vrm85scales[32] = { - 1250, 1200, 1150, 1100, 1050, 1800, 1750, 1700, - 1650, 1600, 1550, 1500, 1450, 1400, 1350, 1300, - 1275, 1225, 1175, 1125, 1075, 1825, 1775, 1725, - 1675, 1625, 1575, 1525, 1475, 1425, 1375, 1325, + +struct mV_pos { + unsigned short mV; + unsigned short pos; +}; + +static struct mV_pos __initdata vrm85_mV[32] = { + {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, + {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, + {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, + {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10}, + {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3}, + {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27}, + {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19}, + {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} +}; + +static unsigned char __initdata mV_vrm85[32] = { + 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, + 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, + 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, + 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 +}; + +static struct mV_pos __initdata mobilevrm_mV[32] = { + {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, + {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, + {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, + {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16}, + {975, 15}, {950, 14}, {925, 13}, {900, 12}, + {875, 11}, {850, 10}, {825, 9}, {800, 8}, + {775, 7}, {750, 6}, {725, 5}, {700, 4}, + {675, 3}, {650, 2}, {625, 1}, {600, 0} }; -static int __initdata mobilevrmscales[32] = { - 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, - 1600, 1550, 1500, 1450, 1500, 1350, 1300, -1, - 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, - 1075, 1050, 1025, 1000, 975, 950, 925, -1, +static unsigned char __initdata mV_mobilevrm[32] = { + 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, + 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, + 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, + 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }; diff --git a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c index ab6504efd801..304d2eaa4a1b 100644 --- a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c @@ -20,7 +20,6 @@ * */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c index 694d4793bf6a..54382760983a 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c @@ -12,7 +12,6 @@ * - We disable half multipliers if ACPI is used on A0 stepping CPUs. */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/moduleparam.h> diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c index 31c3a5baaa7f..7a9325349e94 100644 --- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c @@ -17,13 +17,13 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/cpufreq.h> -#include <linux/config.h> #include <linux/sched.h> /* current */ #include <linux/delay.h> #include <linux/compiler.h> #ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI #include <linux/acpi.h> +#include <linux/dmi.h> #include <acpi/processor.h> #endif @@ -361,11 +361,11 @@ static int centrino_cpu_early_init_acpi(void) unsigned int i, j; struct acpi_processor_performance *data; - for_each_cpu(i) { + for_each_possible_cpu(i) { data = kzalloc(sizeof(struct acpi_processor_performance), GFP_KERNEL); if (!data) { - for_each_cpu(j) { + for_each_possible_cpu(j) { kfree(acpi_perf_data[j]); acpi_perf_data[j] = NULL; } @@ -378,6 +378,35 @@ static int centrino_cpu_early_init_acpi(void) return 0; } + +/* + * Some BIOSes do SW_ANY coordination internally, either set it up in hw + * or do it in BIOS firmware and won't inform about it to OS. If not + * detected, this has a side effect of making CPU run at a different speed + * than OS intended it to run at. Detect it and handle it cleanly. + */ +static int bios_with_sw_any_bug; +static int __init sw_any_bug_found(struct dmi_system_id *d) +{ + bios_with_sw_any_bug = 1; + return 0; +} + + +static struct dmi_system_id sw_any_bug_dmi_table[] = { + { + .callback = sw_any_bug_found, + .ident = "Supermicro Server X6DLP", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), + DMI_MATCH(DMI_BIOS_VERSION, "080010"), + DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), + }, + }, + { } +}; + + /* * centrino_cpu_init_acpi - register with ACPI P-States library * @@ -399,8 +428,24 @@ static int centrino_cpu_init_acpi(struct cpufreq_policy *policy) dprintk(PFX "obtaining ACPI data failed\n"); return -EIO; } - policy->cpus = p->shared_cpu_map; + policy->shared_type = p->shared_type; + /* + * Will let policy->cpus know about dependency only when software + * coordination is required. + */ + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || + policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { + policy->cpus = p->shared_cpu_map; + } + +#ifdef CONFIG_SMP + dmi_check_system(sw_any_bug_dmi_table); + if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) { + policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; + policy->cpus = cpu_core_map[cpu]; + } +#endif /* verify the acpi_data */ if (p->state_count <= 1) { @@ -805,7 +850,7 @@ static void __exit centrino_exit(void) cpufreq_unregister_driver(¢rino_driver); #ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI - for_each_cpu(j) { + for_each_possible_cpu(j) { kfree(acpi_perf_data[j]); acpi_perf_data[j] = NULL; } diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index fc32c8028e24..c0c3b59de32c 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -12,7 +12,7 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; unsigned long flags; @@ -52,25 +52,25 @@ static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) * Actually since bugs.h doesn't even reference this perhaps someone should * fix the documentation ??? */ -static unsigned char Cx86_dir0_msb __initdata = 0; +static unsigned char Cx86_dir0_msb __cpuinitdata = 0; -static char Cx86_model[][9] __initdata = { +static char Cx86_model[][9] __cpuinitdata = { "Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ", "M II ", "Unknown" }; -static char Cx486_name[][5] __initdata = { +static char Cx486_name[][5] __cpuinitdata = { "SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx", "SRx2", "DRx2" }; -static char Cx486S_name[][4] __initdata = { +static char Cx486S_name[][4] __cpuinitdata = { "S", "S2", "Se", "S2e" }; -static char Cx486D_name[][4] __initdata = { +static char Cx486D_name[][4] __cpuinitdata = { "DX", "DX2", "?", "?", "?", "DX4" }; -static char Cx86_cb[] __initdata = "?.5x Core/Bus Clock"; -static char cyrix_model_mult1[] __initdata = "12??43"; -static char cyrix_model_mult2[] __initdata = "12233445"; +static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock"; +static char cyrix_model_mult1[] __cpuinitdata = "12??43"; +static char cyrix_model_mult2[] __cpuinitdata = "12233445"; /* * Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old @@ -82,7 +82,7 @@ static char cyrix_model_mult2[] __initdata = "12233445"; extern void calibrate_delay(void) __init; -static void __init check_cx686_slop(struct cpuinfo_x86 *c) +static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c) { unsigned long flags; @@ -107,7 +107,7 @@ static void __init check_cx686_slop(struct cpuinfo_x86 *c) } -static void __init set_cx86_reorder(void) +static void __cpuinit set_cx86_reorder(void) { u8 ccr3; @@ -122,7 +122,7 @@ static void __init set_cx86_reorder(void) setCx86(CX86_CCR3, ccr3); } -static void __init set_cx86_memwb(void) +static void __cpuinit set_cx86_memwb(void) { u32 cr0; @@ -137,7 +137,7 @@ static void __init set_cx86_memwb(void) setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); } -static void __init set_cx86_inc(void) +static void __cpuinit set_cx86_inc(void) { unsigned char ccr3; @@ -158,7 +158,7 @@ static void __init set_cx86_inc(void) * Configure later MediaGX and/or Geode processor. */ -static void __init geode_configure(void) +static void __cpuinit geode_configure(void) { unsigned long flags; u8 ccr3, ccr4; @@ -184,14 +184,14 @@ static void __init geode_configure(void) #ifdef CONFIG_PCI -static struct pci_device_id __initdata cyrix_55x0[] = { +static struct pci_device_id __cpuinitdata cyrix_55x0[] = { { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) }, { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) }, { }, }; #endif -static void __init init_cyrix(struct cpuinfo_x86 *c) +static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; char *buf = c->x86_model_id; @@ -346,7 +346,7 @@ static void __init init_cyrix(struct cpuinfo_x86 *c) /* * Handle National Semiconductor branded processors */ -static void __init init_nsc(struct cpuinfo_x86 *c) +static void __cpuinit init_nsc(struct cpuinfo_x86 *c) { /* There may be GX1 processors in the wild that are branded * NSC and not Cyrix. @@ -354,7 +354,7 @@ static void __init init_nsc(struct cpuinfo_x86 *c) * This function only handles the GX processor, and kicks every * thing else to the Cyrix init function above - that should * cover any processors that might have been branded differently - * after NSC aquired Cyrix. + * after NSC acquired Cyrix. * * If this breaks your GX1 horribly, please e-mail * info-linux@ldcmail.amd.com to tell us. @@ -394,7 +394,7 @@ static inline int test_cyrix_52div(void) return (unsigned char) (test >> 8) == 0x02; } -static void cyrix_identify(struct cpuinfo_x86 * c) +static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c) { /* Detect Cyrix with disabled CPUID */ if ( c->x86 == 4 && test_cyrix_52div() ) { @@ -427,10 +427,9 @@ static void cyrix_identify(struct cpuinfo_x86 * c) local_irq_restore(flags); } } - generic_identify(c); } -static struct cpu_dev cyrix_cpu_dev __initdata = { +static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, .c_init = init_cyrix, @@ -453,11 +452,10 @@ static int __init cyrix_exit_cpu(void) late_initcall(cyrix_exit_cpu); -static struct cpu_dev nsc_cpu_dev __initdata = { +static struct cpu_dev nsc_cpu_dev __cpuinitdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, - .c_identify = generic_identify, }; int __init nsc_init_cpu(void) diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 5386b29bb5a5..94a95aa5227e 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -1,4 +1,3 @@ -#include <linux/config.h> #include <linux/init.h> #include <linux/kernel.h> @@ -122,6 +121,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) select_idle_routine(c); l2 = init_intel_cacheinfo(c); + if (c->cpuid_level > 9 ) { + unsigned eax = cpuid_eax(10); + /* Check for version and the number of counters */ + if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) + set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability); + } /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) @@ -193,7 +198,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } -static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) +static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) { /* Intel PIII Tualatin. This comes in two flavours. * One has 256kb of cache, the other 512. We have no way @@ -258,7 +263,6 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = { }, }, .c_init = init_intel, - .c_identify = generic_identify, .c_size_cache = intel_size_cache, }; diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index c8547a6fa7e6..5c43be47587f 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -4,6 +4,7 @@ * Changes: * Venkatesh Pallipadi : Adding cache identification through cpuid(4) * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. + * Andi Kleen : CPUID4 emulation on AMD. */ #include <linux/init.h> @@ -130,25 +131,111 @@ struct _cpuid4_info { cpumask_t shared_cpu_map; }; -static unsigned short num_cache_leaves; +unsigned short num_cache_leaves; + +/* AMD doesn't have CPUID4. Emulate it here to report the same + information to the user. This makes some assumptions about the machine: + No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs. + + In theory the TLBs could be reported as fake type (they are in "dummy"). + Maybe later */ +union l1_cache { + struct { + unsigned line_size : 8; + unsigned lines_per_tag : 8; + unsigned assoc : 8; + unsigned size_in_kb : 8; + }; + unsigned val; +}; + +union l2_cache { + struct { + unsigned line_size : 8; + unsigned lines_per_tag : 4; + unsigned assoc : 4; + unsigned size_in_kb : 16; + }; + unsigned val; +}; + +static const unsigned short assocs[] = { + [1] = 1, [2] = 2, [4] = 4, [6] = 8, + [8] = 16, + [0xf] = 0xffff // ?? + }; +static const unsigned char levels[] = { 1, 1, 2 }; +static const unsigned char types[] = { 1, 2, 3 }; + +static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, + union _cpuid4_leaf_ecx *ecx) +{ + unsigned dummy; + unsigned line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d; + union l2_cache l2; + + eax->full = 0; + ebx->full = 0; + ecx->full = 0; + + cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); + cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy); + + if (leaf > 2 || !l1d.val || !l1i.val || !l2.val) + return; + + eax->split.is_self_initializing = 1; + eax->split.type = types[leaf]; + eax->split.level = levels[leaf]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; + + if (leaf <= 1) { + union l1_cache *l1 = leaf == 0 ? &l1d : &l1i; + assoc = l1->assoc; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; + } else { + assoc = l2.assoc; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + /* cpu_data has errata corrections for K7 applied */ + size_in_kb = current_cpu_data.x86_cache_size; + } + + if (assoc == 0xf) + eax->split.is_fully_associative = 1; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assocs[assoc] - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; +} static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) { - unsigned int eax, ebx, ecx, edx; - union _cpuid4_leaf_eax cache_eax; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + unsigned edx; - cpuid_count(4, index, &eax, &ebx, &ecx, &edx); - cache_eax.full = eax; - if (cache_eax.split.type == CACHE_TYPE_NULL) + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + amd_cpuid4(index, &eax, &ebx, &ecx); + else + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + if (eax.split.type == CACHE_TYPE_NULL) return -EIO; /* better error ? */ - this_leaf->eax.full = eax; - this_leaf->ebx.full = ebx; - this_leaf->ecx.full = ecx; - this_leaf->size = (this_leaf->ecx.split.number_of_sets + 1) * - (this_leaf->ebx.split.coherency_line_size + 1) * - (this_leaf->ebx.split.physical_line_partition + 1) * - (this_leaf->ebx.split.ways_of_associativity + 1); + this_leaf->eax = eax; + this_leaf->ebx = ebx; + this_leaf->ecx = ecx; + this_leaf->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); return 0; } @@ -174,7 +261,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data); #endif @@ -296,14 +383,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) if (new_l2) { l2 = new_l2; -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT cpu_llc_id[cpu] = l2_id; #endif } if (new_l3) { l3 = new_l3; -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT cpu_llc_id[cpu] = l3_id; #endif } @@ -642,7 +729,7 @@ static void __cpuexit cache_remove_dev(struct sys_device * sys_dev) return; } -static int cacheinfo_cpu_callback(struct notifier_block *nfb, +static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -660,7 +747,7 @@ static int cacheinfo_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block cacheinfo_cpu_notifier = +static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier = { .notifier_call = cacheinfo_cpu_callback, }; @@ -672,7 +759,7 @@ static int __cpuinit cache_sysfs_init(void) if (num_cache_leaves == 0) return 0; - register_cpu_notifier(&cacheinfo_cpu_notifier); + register_hotcpu_notifier(&cacheinfo_cpu_notifier); for_each_online_cpu(i) { cacheinfo_cpu_callback(&cacheinfo_cpu_notifier, CPU_ONLINE, diff --git a/arch/i386/kernel/cpu/mcheck/Makefile b/arch/i386/kernel/cpu/mcheck/Makefile index 30808f3d6715..f1ebe1c1c17a 100644 --- a/arch/i386/kernel/cpu/mcheck/Makefile +++ b/arch/i386/kernel/cpu/mcheck/Makefile @@ -1,2 +1,2 @@ -obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o +obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c index fc5d5215e23d..b0862af595aa 100644 --- a/arch/i386/kernel/cpu/mcheck/k7.c +++ b/arch/i386/kernel/cpu/mcheck/k7.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/config.h> #include <linux/interrupt.h> #include <linux/smp.h> diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index afa0888f9a1e..d555bec0db99 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/config.h> #include <linux/module.h> #include <linux/smp.h> #include <linux/thread_info.h> diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h index dc2416dfef15..84fd4cf7d0fb 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.h +++ b/arch/i386/kernel/cpu/mcheck/mce.h @@ -9,6 +9,6 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c); /* Call the installed machine check handler for this CPU setup. */ extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); -extern int mce_disabled __initdata; +extern int mce_disabled; extern int nr_mce_banks; diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c index 82dffe0d4954..1f9153ae5b03 100644 --- a/arch/i386/kernel/cpu/mcheck/non-fatal.c +++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c @@ -11,7 +11,6 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> -#include <linux/config.h> #include <linux/workqueue.h> #include <linux/interrupt.h> #include <linux/smp.h> diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index fd2c459a31ef..504434a46011 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -5,7 +5,6 @@ #include <linux/init.h> #include <linux/types.h> #include <linux/kernel.h> -#include <linux/config.h> #include <linux/interrupt.h> #include <linux/smp.h> @@ -14,6 +13,8 @@ #include <asm/msr.h> #include <asm/apic.h> +#include <asm/therm_throt.h> + #include "mce.h" /* as supported by the P4/Xeon family */ @@ -45,25 +46,12 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs) /* P4/Xeon Thermal transition interrupt handler */ static void intel_thermal_interrupt(struct pt_regs *regs) { - u32 l, h; - unsigned int cpu = smp_processor_id(); - static unsigned long next[NR_CPUS]; + __u64 msr_val; ack_APIC_irq(); - if (time_after(next[cpu], jiffies)) - return; - - next[cpu] = jiffies + HZ*5; - rdmsr(MSR_IA32_THERM_STATUS, l, h); - if (l & 0x1) { - printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu); - printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", - cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); - } + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + therm_throt_process(msr_val & 0x1); } /* Thermal interrupt handler for this CPU setup */ @@ -123,10 +111,13 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) rdmsr (MSR_IA32_MISC_ENABLE, l, h); wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); - + l = apic_read (APIC_LVTTHMR); apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); return; } #endif /* CONFIG_X86_MCE_P4THERMAL */ diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c new file mode 100644 index 000000000000..4f43047de406 --- /dev/null +++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c @@ -0,0 +1,180 @@ +/* + * linux/arch/i386/kerne/cpu/mcheck/therm_throt.c + * + * Thermal throttle event support code (such as syslog messaging and rate + * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * This allows consistent reporting of CPU thermal throttle events. + * + * Maintains a counter in /sys that keeps track of the number of thermal + * events, such that the user knows how bad the thermal problem might be + * (since the logging to syslog and mcelog is rate limited). + * + * Author: Dmitriy Zavin (dmitriyz@google.com) + * + * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. + * Inspired by Ross Biro's and Al Borchers' counter code. + */ + +#include <linux/percpu.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> +#include <asm/cpu.h> +#include <linux/notifier.h> +#include <asm/therm_throt.h> + +/* How long to wait between reporting thermal events */ +#define CHECK_INTERVAL (300 * HZ) + +static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +atomic_t therm_throt_en = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +#define define_therm_throt_sysdev_one_ro(_name) \ + static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + +#define define_therm_throt_sysdev_show_func(name) \ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_throttle_##name, cpu)); \ + else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ +} + +define_therm_throt_sysdev_show_func(count); +define_therm_throt_sysdev_one_ro(count); + +static struct attribute *thermal_throttle_attrs[] = { + &attr_count.attr, + NULL +}; + +static struct attribute_group thermal_throttle_attr_group = { + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" +}; +#endif /* CONFIG_SYSFS */ + +/*** + * therm_throt_process - Process thermal throttling event from interrupt + * @curr: Whether the condition is current or not (boolean), since the + * thermal interrupt normally gets called both when the thermal + * event begins and once the event has ended. + * + * This function is called by the thermal interrupt after the + * IRQ has been acknowledged. + * + * It will take care of rate limiting and printing messages to the syslog. + * + * Returns: 0 : Event should NOT be further logged, i.e. still in + * "timeout" from previous log message. + * 1 : Event should be logged further, and a message has been + * printed to the syslog. + */ +int therm_throt_process(int curr) +{ + unsigned int cpu = smp_processor_id(); + __u64 tmp_jiffs = get_jiffies_64(); + + if (curr) + __get_cpu_var(thermal_throttle_count)++; + + if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + return 0; + + __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; + + /* if we just entered the thermal event */ + if (curr) { + printk(KERN_CRIT "CPU%d: Temperature above threshold, " + "cpu clock throttled (total events = %lu)\n", cpu, + __get_cpu_var(thermal_throttle_count)); + + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + } + + return 1; +} + +#ifdef CONFIG_SYSFS +/* Add/Remove thermal_throttle interface for CPU device */ +static __cpuinit int thermal_throttle_add_dev(struct sys_device * sys_dev) +{ + sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static __cpuinit int thermal_throttle_remove_dev(struct sys_device * sys_dev) +{ + sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + return 0; +} + +/* Mutex protecting device creation against CPU hotplug */ +static DEFINE_MUTEX(therm_cpu_lock); + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + mutex_lock(&therm_cpu_lock); + switch (action) { + case CPU_ONLINE: + thermal_throttle_add_dev(sys_dev); + break; + case CPU_DEAD: + thermal_throttle_remove_dev(sys_dev); + break; + } + mutex_unlock(&therm_cpu_lock); + return NOTIFY_OK; +} + +static struct notifier_block thermal_throttle_cpu_notifier = +{ + .notifier_call = thermal_throttle_cpu_callback, +}; +#endif /* CONFIG_HOTPLUG_CPU */ + +static __init int thermal_throttle_init_device(void) +{ + unsigned int cpu = 0; + + if (!atomic_read(&therm_throt_en)) + return 0; + + register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + +#ifdef CONFIG_HOTPLUG_CPU + mutex_lock(&therm_cpu_lock); +#endif + /* connect live CPUs to sysfs */ + for_each_online_cpu(cpu) + thermal_throttle_add_dev(get_cpu_sysdev(cpu)); +#ifdef CONFIG_HOTPLUG_CPU + mutex_unlock(&therm_cpu_lock); +#endif + + return 0; +} + +device_initcall(thermal_throttle_init_device); +#endif /* CONFIG_SYSFS */ diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index 169ac8e0db68..0b61eed8bbd8 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -243,7 +243,7 @@ static DEFINE_SPINLOCK(set_atomicity_lock); * has been called. */ -static void prepare_set(void) +static void prepare_set(void) __acquires(set_atomicity_lock) { unsigned long cr0; @@ -274,7 +274,7 @@ static void prepare_set(void) mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); } -static void post_set(void) +static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ __flush_tlb(); diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c index ad87fa58058d..8bf23cc80c63 100644 --- a/arch/i386/kernel/cpu/nexgen.c +++ b/arch/i386/kernel/cpu/nexgen.c @@ -10,7 +10,7 @@ * to have CPUID. (Thanks to Herbert Oppmann) */ -static int __init deep_magic_nexgen_probe(void) +static int __cpuinit deep_magic_nexgen_probe(void) { int ret; @@ -27,21 +27,20 @@ static int __init deep_magic_nexgen_probe(void) return ret; } -static void __init init_nexgen(struct cpuinfo_x86 * c) +static void __cpuinit init_nexgen(struct cpuinfo_x86 * c) { c->x86_cache_size = 256; /* A few had 1 MB... */ } -static void __init nexgen_identify(struct cpuinfo_x86 * c) +static void __cpuinit nexgen_identify(struct cpuinfo_x86 * c) { /* Detect NexGen with old hypercode */ if ( deep_magic_nexgen_probe() ) { strcpy(c->x86_vendor_id, "NexGenDriven"); } - generic_identify(c); } -static struct cpu_dev nexgen_cpu_dev __initdata = { +static struct cpu_dev nexgen_cpu_dev __cpuinitdata = { .c_vendor = "Nexgen", .c_ident = { "NexGenDriven" }, .c_models = { diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index a19fcb262dbb..76aac088a323 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -18,7 +18,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) * applications want to get the raw CPUID data, they should access * /dev/cpu/<cpu_nr>/cpuid instead. */ - static char *x86_cap_flags[] = { + static const char * const x86_cap_flags[] = { /* Intel-defined */ "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", @@ -46,8 +46,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ @@ -62,7 +62,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; - static char *x86_power_flags[] = { + static const char * const x86_power_flags[] = { "ts", /* temperature sensor */ "fid", /* frequency id control */ "vid", /* voltage id control */ @@ -109,9 +109,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); #ifdef CONFIG_X86_HT if (c->x86_max_cores * smp_num_siblings > 1) { - seq_printf(m, "physical id\t: %d\n", phys_proc_id[n]); + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[n])); - seq_printf(m, "core id\t\t: %d\n", cpu_core_id[n]); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } #endif diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c index d08d5a2811c8..9317f7414989 100644 --- a/arch/i386/kernel/cpu/rise.c +++ b/arch/i386/kernel/cpu/rise.c @@ -5,7 +5,7 @@ #include "cpu.h" -static void __init init_rise(struct cpuinfo_x86 *c) +static void __cpuinit init_rise(struct cpuinfo_x86 *c) { printk("CPU: Rise iDragon"); if (c->x86_model > 2) @@ -28,7 +28,7 @@ static void __init init_rise(struct cpuinfo_x86 *c) set_bit(X86_FEATURE_CX8, c->x86_capability); } -static struct cpu_dev rise_cpu_dev __initdata = { +static struct cpu_dev rise_cpu_dev __cpuinitdata = { .c_vendor = "Rise", .c_ident = { "RiseRiseRise" }, .c_models = { diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index 7214c9b577ab..4056fb7d2cdf 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -5,7 +5,7 @@ #include <asm/msr.h> #include "cpu.h" -static void __init init_transmeta(struct cpuinfo_x86 *c) +static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; @@ -85,10 +85,9 @@ static void __init init_transmeta(struct cpuinfo_x86 *c) #endif } -static void __init transmeta_identify(struct cpuinfo_x86 * c) +static void __cpuinit transmeta_identify(struct cpuinfo_x86 * c) { u32 xlvl; - generic_identify(c); /* Transmeta-defined flags: level 0x80860001 */ xlvl = cpuid_eax(0x80860000); @@ -98,7 +97,7 @@ static void __init transmeta_identify(struct cpuinfo_x86 * c) } } -static struct cpu_dev transmeta_cpu_dev __initdata = { +static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, .c_init = init_transmeta, diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c index 2cd988f6dc55..1bf3f87e9c5b 100644 --- a/arch/i386/kernel/cpu/umc.c +++ b/arch/i386/kernel/cpu/umc.c @@ -5,12 +5,8 @@ /* UMC chips appear to be only either 386 or 486, so no special init takes place. */ -static void __init init_umc(struct cpuinfo_x86 * c) -{ - -} -static struct cpu_dev umc_cpu_dev __initdata = { +static struct cpu_dev umc_cpu_dev __cpuinitdata = { .c_vendor = "UMC", .c_ident = { "UMC UMC UMC" }, .c_models = { @@ -21,7 +17,6 @@ static struct cpu_dev umc_cpu_dev __initdata = { } }, }, - .c_init = init_umc, }; int __init umc_init_cpu(void) diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c index 1d9a4abcdfc7..fde8bea85cee 100644 --- a/arch/i386/kernel/cpuid.c +++ b/arch/i386/kernel/cpuid.c @@ -24,7 +24,6 @@ */ #include <linux/module.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/errno.h> @@ -168,6 +167,7 @@ static int cpuid_class_device_create(int i) return err; } +#ifdef CONFIG_HOTPLUG_CPU static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -183,10 +183,11 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long ac return NOTIFY_OK; } -static struct notifier_block cpuid_class_cpu_notifier = +static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier = { .notifier_call = cpuid_class_cpu_callback, }; +#endif /* !CONFIG_HOTPLUG_CPU */ static int __init cpuid_init(void) { @@ -209,7 +210,7 @@ static int __init cpuid_init(void) if (err != 0) goto out_class; } - register_cpu_notifier(&cpuid_class_cpu_notifier); + register_hotcpu_notifier(&cpuid_class_cpu_notifier); err = 0; goto out; @@ -234,7 +235,7 @@ static void __exit cpuid_exit(void) class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); class_destroy(cpuid_class); unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); - unregister_cpu_notifier(&cpuid_class_cpu_notifier); + unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); } module_init(cpuid_init); diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c index 2b0cfce24a61..67d297dc1003 100644 --- a/arch/i386/kernel/crash.c +++ b/arch/i386/kernel/crash.c @@ -22,6 +22,8 @@ #include <asm/nmi.h> #include <asm/hw_irq.h> #include <asm/apic.h> +#include <asm/kdebug.h> + #include <mach_ipi.h> @@ -90,19 +92,28 @@ static void crash_save_self(struct pt_regs *regs) crash_save_this_cpu(regs, cpu); } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static atomic_t waiting_for_crash_ipi; -static int crash_nmi_callback(struct pt_regs *regs, int cpu) +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) { + struct pt_regs *regs; struct pt_regs fixed_regs; + int cpu; + + if (val != DIE_NMI_IPI) + return NOTIFY_OK; + + regs = ((struct die_args *)data)->regs; + cpu = raw_smp_processor_id(); /* Don't do anything if this handler is invoked on crashing cpu. * Otherwise, system will completely hang. Crashing cpu can get * an NMI if system was initially booted with nmi_watchdog parameter. */ if (cpu == crashing_cpu) - return 1; + return NOTIFY_STOP; local_irq_disable(); if (!user_mode_vm(regs)) { @@ -114,28 +125,29 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu) atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ halt(); - for(;;); + for (;;) + cpu_relax(); return 1; } -/* - * By using the NMI code instead of a vector we just sneak thru the - * word generator coming out with just what we want. AND it does - * not matter if clustered_apic_mode is set or not. - */ static void smp_send_nmi_allbutself(void) { - send_IPI_allbutself(APIC_DM_NMI); + send_IPI_allbutself(NMI_VECTOR); } +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + static void nmi_shootdown_cpus(void) { unsigned long msecs; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); /* Would it be better to replace the trap vector here? */ - set_nmi_callback(crash_nmi_callback); + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ /* Ensure the new callback function is set before sending * out the NMI */ @@ -162,7 +174,7 @@ static void nmi_shootdown_cpus(void) void machine_crash_shutdown(struct pt_regs *regs) { /* This function is only called after the system - * has paniced or is otherwise in a critical state. + * has panicked or is otherwise in a critical state. * The minimum amount of code to allow a kexec'd kernel * to run successfully needs to happen here. * diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c index 5edb1d379add..b4d14c2eb345 100644 --- a/arch/i386/kernel/doublefault.c +++ b/arch/i386/kernel/doublefault.c @@ -44,7 +44,8 @@ static void doublefault_fn(void) } } - for (;;) /* nothing */; + for (;;) + cpu_relax(); } struct tss_struct doublefault_tss __cacheline_aligned = { diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index 9202b67c4b2e..f9436989473c 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -19,7 +19,6 @@ * Skip non-WB memory and ignore empty memory ranges. */ -#include <linux/config.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/mm.h> @@ -66,7 +65,7 @@ static unsigned long efi_rt_eflags; static DEFINE_SPINLOCK(efi_rt_lock); static pgd_t efi_bak_pg_dir_pointer[2]; -static void efi_call_phys_prelog(void) +static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) { unsigned long cr4; unsigned long temp; @@ -110,7 +109,7 @@ static void efi_call_phys_prelog(void) load_gdt(cpu_gdt_descr); } -static void efi_call_phys_epilog(void) +static void efi_call_phys_epilog(void) __releases(efi_rt_lock) { unsigned long cr4; struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); @@ -601,8 +600,10 @@ efi_initialize_iomem_resources(struct resource *code_resource, res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1); res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) - printk(KERN_ERR PFX "Failed to allocate res %s : 0x%lx-0x%lx\n", - res->name, res->start, res->end); + printk(KERN_ERR PFX "Failed to allocate res %s : " + "0x%llx-0x%llx\n", res->name, + (unsigned long long)res->start, + (unsigned long long)res->end); /* * We don't know which region contains kernel data so we try * it repeatedly and let the resource manager test it. diff --git a/arch/i386/kernel/efi_stub.S b/arch/i386/kernel/efi_stub.S index 08c0312d9b6c..ef00bb77d7e4 100644 --- a/arch/i386/kernel/efi_stub.S +++ b/arch/i386/kernel/efi_stub.S @@ -5,10 +5,8 @@ * turned off. */ -#include <linux/config.h> #include <linux/linkage.h> #include <asm/page.h> -#include <asm/pgtable.h> /* * efi_call_phys(void *, ...) is a function with variable parameters. diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index cfc683f153b9..5a63d6fdb70e 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -40,14 +40,15 @@ * "current" is in register %ebx during any slow entries. */ -#include <linux/config.h> #include <linux/linkage.h> #include <asm/thread_info.h> +#include <asm/irqflags.h> #include <asm/errno.h> #include <asm/segment.h> #include <asm/smp.h> #include <asm/page.h> #include <asm/desc.h> +#include <asm/dwarf2.h> #include "irq_vectors.h" #define nr_syscalls ((syscall_table_size)/4) @@ -75,41 +76,99 @@ DF_MASK = 0x00000400 NT_MASK = 0x00004000 VM_MASK = 0x00020000 +/* These are replaces for paravirtualization */ +#define DISABLE_INTERRUPTS cli +#define ENABLE_INTERRUPTS sti +#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit +#define INTERRUPT_RETURN iret +#define GET_CR0_INTO_EAX movl %cr0, %eax + #ifdef CONFIG_PREEMPT -#define preempt_stop cli +#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF #else #define preempt_stop #define resume_kernel restore_nocheck #endif +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $IF_MASK,EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +#ifdef CONFIG_VM86 +#define resume_userspace_sig check_userspace +#else +#define resume_userspace_sig resume_userspace +#endif + #define SAVE_ALL \ cld; \ pushl %es; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET es, 0;*/\ pushl %ds; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET ds, 0;*/\ pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET eax, 0;\ pushl %ebp; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebp, 0;\ pushl %edi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edi, 0;\ pushl %esi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET esi, 0;\ pushl %edx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edx, 0;\ pushl %ecx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ecx, 0;\ pushl %ebx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebx, 0;\ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ movl %edx, %es; #define RESTORE_INT_REGS \ popl %ebx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebx;\ popl %ecx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ecx;\ popl %edx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edx;\ popl %esi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE esi;\ popl %edi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edi;\ popl %ebp; \ - popl %eax + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebp;\ + popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE eax #define RESTORE_REGS \ RESTORE_INT_REGS; \ 1: popl %ds; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE ds;*/\ 2: popl %es; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE es;*/\ .section .fixup,"ax"; \ 3: movl $0,(%esp); \ jmp 1b; \ @@ -122,13 +181,50 @@ VM_MASK = 0x00020000 .long 2b,4b; \ .previous +#define RING0_INT_FRAME \ + CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ + CFI_DEF_CFA esp, 3*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_EC_FRAME \ + CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ + CFI_DEF_CFA esp, 4*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_PTREGS_FRAME \ + CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ + CFI_DEF_CFA esp, OLDESP-EBX;\ + /*CFI_OFFSET cs, CS-OLDESP;*/\ + CFI_OFFSET eip, EIP-OLDESP;\ + /*CFI_OFFSET es, ES-OLDESP;*/\ + /*CFI_OFFSET ds, DS-OLDESP;*/\ + CFI_OFFSET eax, EAX-OLDESP;\ + CFI_OFFSET ebp, EBP-OLDESP;\ + CFI_OFFSET edi, EDI-OLDESP;\ + CFI_OFFSET esi, ESI-OLDESP;\ + CFI_OFFSET edx, EDX-OLDESP;\ + CFI_OFFSET ecx, ECX-OLDESP;\ + CFI_OFFSET ebx, EBX-OLDESP ENTRY(ret_from_fork) + CFI_STARTPROC pushl %eax + CFI_ADJUST_CFA_OFFSET 4 call schedule_tail GET_THREAD_INFO(%ebp) popl %eax + CFI_ADJUST_CFA_OFFSET -4 + pushl $0x0202 # Reset kernel eflags + CFI_ADJUST_CFA_OFFSET 4 + popfl + CFI_ADJUST_CFA_OFFSET -4 jmp syscall_exit + CFI_ENDPROC /* * Return to user mode is not as complex as all this looks, @@ -139,16 +235,19 @@ ENTRY(ret_from_fork) # userspace resumption stub bypassing syscall exit tracing ALIGN + RING0_PTREGS_FRAME ret_from_exception: preempt_stop ret_from_intr: GET_THREAD_INFO(%ebp) +check_userspace: movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al - testl $(VM_MASK | 3), %eax - jz resume_kernel + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret movl TI_flags(%ebp), %ecx @@ -159,7 +258,7 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) - cli + DISABLE_INTERRUPTS cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: @@ -171,20 +270,43 @@ need_resched: call preempt_schedule_irq jmp need_resched #endif + CFI_ENDPROC /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub ENTRY(sysenter_entry) + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp movl TSS_sysenter_esp0(%esp),%esp sysenter_past_esp: - sti + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ + ENABLE_INTERRUPTS pushl $(__USER_DS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ss, 0*/ pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esp, 0 pushfl + CFI_ADJUST_CFA_OFFSET 4 pushl $(__USER_CS) - pushl $SYSENTER_RETURN + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 /* * Load the potential sixth argument from user stack. @@ -199,6 +321,7 @@ sysenter_past_esp: .previous pushl %eax + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) @@ -209,7 +332,8 @@ sysenter_past_esp: jae syscall_badsys call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) - cli + DISABLE_INTERRUPTS + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work @@ -217,13 +341,16 @@ sysenter_past_esp: movl EIP(%esp), %edx movl OLDESP(%esp), %ecx xorl %ebp,%ebp - sti - sysexit + TRACE_IRQS_ON + ENABLE_INTERRUPTS_SYSEXIT + CFI_ENDPROC # system call handler stub ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) testl $TF_MASK,EFLAGS(%esp) @@ -240,9 +367,10 @@ syscall_call: call *sys_call_table(,%eax,4) movl %eax,EAX(%esp) # store the return value syscall_exit: - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work @@ -254,16 +382,21 @@ restore_all: # See comments in process.c:copy_thread() for details. movb OLDSS(%esp), %ah movb CS(%esp), %al - andl $(VM_MASK | (4 << 8) | 3), %eax - cmpl $((4 << 8) | 3), %eax + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS restore_nocheck: + TRACE_IRQS_IRET +restore_nocheck_notrace: RESTORE_REGS addl $4, %esp -1: iret + CFI_ADJUST_CFA_OFFSET -4 +1: INTERRUPT_RETURN .section .fixup,"ax" iret_exc: - sti + TRACE_IRQS_ON + ENABLE_INTERRUPTS pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -273,6 +406,7 @@ iret_exc: .long 1b,iret_exc .previous + CFI_RESTORE_STATE ldt_ss: larl OLDSS(%esp), %eax jnz restore_nocheck @@ -285,29 +419,36 @@ ldt_ss: * CPUs, which we can try to work around to make * dosemu and wine happy. */ subl $8, %esp # reserve space for switch16 pointer - cli + CFI_ADJUST_CFA_OFFSET 8 + DISABLE_INTERRUPTS + TRACE_IRQS_OFF movl %esp, %eax /* Set up the 16bit stack frame with switch32 pointer on top, * and a switch16 pointer on top of the current frame. */ call setup_x86_bogus_stack + CFI_ADJUST_CFA_OFFSET -8 # frame has moved + TRACE_IRQS_IRET RESTORE_REGS lss 20+4(%esp), %esp # switch to 16bit stack -1: iret +1: INTERRUPT_RETURN .section __ex_table,"a" .align 4 .long 1b,iret_exc .previous + CFI_ENDPROC # perform work that needs to be done immediately before resumption ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: call schedule - cli # make sure we don't miss an interrupt + DISABLE_INTERRUPTS # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? @@ -323,18 +464,20 @@ work_notifysig: # deal with pending signals and # vm86-space xorl %edx, %edx call do_notify_resume - jmp resume_userspace + jmp resume_userspace_sig ALIGN work_notifysig_v86: #ifdef CONFIG_VM86 pushl %ecx # save ti_flags for do_notify_resume + CFI_ADJUST_CFA_OFFSET 4 call save_v86_state # %eax contains pt_regs pointer popl %ecx + CFI_ADJUST_CFA_OFFSET -4 movl %eax, %esp xorl %edx, %edx call do_notify_resume - jmp resume_userspace + jmp resume_userspace_sig #endif # perform syscall exit tracing @@ -357,25 +500,28 @@ syscall_trace_entry: syscall_exit_work: testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl jz work_pending - sti # could let do_syscall_trace() call + TRACE_IRQS_ON + ENABLE_INTERRUPTS # could let do_syscall_trace() call # schedule() instead movl %esp, %eax movl $1, %edx call do_syscall_trace jmp resume_userspace + CFI_ENDPROC - ALIGN + RING0_INT_FRAME # can't unwind into user space anyway syscall_fault: pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) movl $-EFAULT,EAX(%esp) jmp resume_userspace - ALIGN syscall_badsys: movl $-ENOSYS,EAX(%esp) jmp resume_userspace + CFI_ENDPROC #define FIXUP_ESPFIX_STACK \ movl %esp, %eax; \ @@ -387,16 +533,21 @@ syscall_badsys: movl %eax, %esp; #define UNWIND_ESPFIX_STACK \ pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4; \ movl %ss, %eax; \ /* see if on 16bit stack */ \ cmpw $__ESPFIX_SS, %ax; \ - jne 28f; \ - movl $__KERNEL_DS, %edx; \ - movl %edx, %ds; \ - movl %edx, %es; \ + je 28f; \ +27: popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4; \ +.section .fixup,"ax"; \ +28: movl $__KERNEL_DS, %eax; \ + movl %eax, %ds; \ + movl %eax, %es; \ /* switch to 32bit stack */ \ - FIXUP_ESPFIX_STACK \ -28: popl %eax; + FIXUP_ESPFIX_STACK; \ + jmp 27b; \ +.previous /* * Build the entry stubs and pointer table with @@ -408,9 +559,14 @@ ENTRY(interrupt) vector=0 ENTRY(irq_entries_start) + RING0_INT_FRAME .rept NR_IRQS ALIGN -1: pushl $vector-256 + .if vector + CFI_ADJUST_CFA_OFFSET -4 + .endif +1: pushl $~(vector) + CFI_ADJUST_CFA_OFFSET 4 jmp common_interrupt .data .long 1b @@ -418,68 +574,112 @@ ENTRY(irq_entries_start) vector=vector+1 .endr +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ ALIGN common_interrupt: SAVE_ALL + TRACE_IRQS_OFF movl %esp,%eax call do_IRQ jmp ret_from_intr + CFI_ENDPROC #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ - pushl $nr-256; \ - SAVE_ALL \ + RING0_INT_FRAME; \ + pushl $~(nr); \ + CFI_ADJUST_CFA_OFFSET 4; \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ movl %esp,%eax; \ call smp_/**/name; \ - jmp ret_from_intr; + jmp ret_from_intr; \ + CFI_ENDPROC /* The include is where all of the SMP etc. interrupts come from */ #include "entry_arch.h" -ENTRY(divide_error) - pushl $0 # no error code - pushl $do_divide_error +KPROBE_ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 xorl %eax, %eax pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 decl %eax # eax = -1 pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 cld pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ UNWIND_ESPFIX_STACK popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ movl ES(%esp), %edi # get the function address movl ORIG_EAX(%esp), %edx # get the error code movl %eax, ORIG_EAX(%esp) movl %ecx, ES(%esp) + /*CFI_REL_OFFSET es, ES*/ movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception + CFI_ENDPROC +KPROBE_END(page_fault) ENTRY(coprocessor_error) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(simd_coprocessor_error) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_simd_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(device_not_available) + RING0_INT_FRAME pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL - movl %cr0, %eax + GET_CR0_INTO_EAX testl $0x4, %eax # EM (math emulation bit) jne device_not_available_emulate preempt_stop @@ -487,9 +687,12 @@ ENTRY(device_not_available) jmp ret_from_exception device_not_available_emulate: pushl $0 # temporary storage for ORIG_EIP + CFI_ADJUST_CFA_OFFSET 4 call math_emulate addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 jmp ret_from_exception + CFI_ENDPROC /* * Debug traps and NMI can happen at the one SYSENTER instruction @@ -509,22 +712,32 @@ device_not_available_emulate: jne ok; \ label: \ movl TSS_sysenter_esp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ pushl $__KERNEL_CS; \ - pushl $sysenter_past_esp + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 KPROBE_ENTRY(debug) + RING0_INT_FRAME cmpl $sysenter_entry,(%esp) jne debug_stack_correct FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) debug_stack_correct: pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug jmp ret_from_exception - .previous .text + CFI_ENDPROC +KPROBE_END(debug) + /* * NMI is doubly nasty. It can happen _while_ we're handling * a debug fault, and the debug fault hasn't yet been able to @@ -533,15 +746,19 @@ debug_stack_correct: * check whether we got an NMI on the debug path where the debug * fault happened on the sysenter path. */ -ENTRY(nmi) +KPROBE_ENTRY(nmi) + RING0_INT_FRAME pushl %eax + CFI_ADJUST_CFA_OFFSET 4 movl %ss, %eax cmpw $__ESPFIX_SS, %ax popl %eax + CFI_ADJUST_CFA_OFFSET -4 je nmi_16bit_stack cmpl $sysenter_entry,(%esp) je nmi_stack_fixup pushl %eax + CFI_ADJUST_CFA_OFFSET 4 movl %esp,%eax /* Do not access memory above the end of our stack page, * it might not exist. @@ -549,21 +766,28 @@ ENTRY(nmi) andl $(THREAD_SIZE-1),%eax cmpl $(THREAD_SIZE-20),%eax popl %eax + CFI_ADJUST_CFA_OFFSET -4 jae nmi_stack_correct cmpl $sysenter_entry,12(%esp) je nmi_debug_stack_check nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ pushl %eax + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi - jmp restore_all + jmp restore_nocheck_notrace + CFI_ENDPROC nmi_stack_fixup: + RING0_INT_FRAME FIX_STACK(12,nmi_stack_correct, 1) jmp nmi_stack_correct + nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ cmpw $__KERNEL_CS,16(%esp) jne nmi_stack_correct cmpl $debug,(%esp) @@ -574,94 +798,194 @@ nmi_debug_stack_check: jmp nmi_stack_correct nmi_16bit_stack: - /* create the pointer to lss back */ + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ pushl %ss + CFI_ADJUST_CFA_OFFSET 4 pushl %esp + CFI_ADJUST_CFA_OFFSET 4 movzwl %sp, %esp addw $4, (%esp) /* copy the iret frame of 12 bytes */ .rept 3 pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 .endr pushl %eax + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL FIXUP_ESPFIX_STACK # %eax == %esp + CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved xorl %edx,%edx # zero error code call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to 16bit stack -1: iret +1: INTERRUPT_RETURN + CFI_ENDPROC .section __ex_table,"a" .align 4 .long 1b,iret_exc .previous +KPROBE_END(nmi) KPROBE_ENTRY(int3) + RING0_INT_FRAME pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 jmp ret_from_exception - .previous .text + CFI_ENDPROC +KPROBE_END(int3) ENTRY(overflow) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_overflow + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(bounds) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_bounds + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(invalid_op) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_invalid_op + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_coprocessor_segment_overrun + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(invalid_TSS) + RING0_EC_FRAME pushl $do_invalid_TSS + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(segment_not_present) + RING0_EC_FRAME pushl $do_segment_not_present + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC ENTRY(stack_segment) + RING0_EC_FRAME pushl $do_stack_segment + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC KPROBE_ENTRY(general_protection) + RING0_EC_FRAME pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 jmp error_code - .previous .text + CFI_ENDPROC +KPROBE_END(general_protection) ENTRY(alignment_check) + RING0_EC_FRAME pushl $do_alignment_check + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC -KPROBE_ENTRY(page_fault) - pushl $do_page_fault +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error + CFI_ADJUST_CFA_OFFSET 4 jmp error_code - .previous .text + CFI_ENDPROC #ifdef CONFIG_X86_MCE ENTRY(machine_check) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl machine_check_vector + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC #endif ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME pushl $0 + CFI_ADJUST_CFA_OFFSET 4 pushl $do_spurious_interrupt_bug + CFI_ADJUST_CFA_OFFSET 4 jmp error_code + CFI_ENDPROC + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movl 4(%esp), %edx + movl (%esp), %ecx + leal 4(%esp), %eax + movl %ebx, EBX(%edx) + xorl %ebx, %ebx + movl %ebx, ECX(%edx) + movl %ebx, EDX(%edx) + movl %esi, ESI(%edx) + movl %edi, EDI(%edx) + movl %ebp, EBP(%edx) + movl %ebx, EAX(%edx) + movl $__USER_DS, DS(%edx) + movl $__USER_DS, ES(%edx) + movl %ebx, ORIG_EAX(%edx) + movl %ecx, EIP(%edx) + movl 12(%esp), %ecx + movl $__KERNEL_CS, CS(%edx) + movl %ebx, EFLAGS(%edx) + movl %eax, OLDESP(%edx) + movl 8(%esp), %eax + movl %ecx, 8(%esp) + movl EBX(%edx), %ebx + movl $__KERNEL_DS, OLDSS(%edx) + jmpl *%eax + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif + +ENTRY(kernel_thread_helper) + pushl $0 # fake return address for unwinder + CFI_STARTPROC + movl %edx,%eax + push %edx + CFI_ADJUST_CFA_OFFSET 4 + call *%ebx + push %eax + CFI_ADJUST_CFA_OFFSET 4 + call do_exit + CFI_ENDPROC +ENDPROC(kernel_thread_helper) .section .rodata,"a" #include "syscall_table.S" diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index 3debc2e26542..be9d883c62ce 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -8,7 +8,6 @@ */ .text -#include <linux/config.h> #include <linux/threads.h> #include <linux/linkage.h> #include <asm/segment.h> @@ -318,20 +317,14 @@ is386: movl $2,%ecx # set MP movl %eax,%gs lldt %ax cld # gcc2 wants the direction flag cleared at all times + pushl %eax # fake return address #ifdef CONFIG_SMP movb ready, %cl movb $1, ready - cmpb $0,%cl - je 1f # the first CPU calls start_kernel - # all other CPUs call initialize_secondary - call initialize_secondary - jmp L6 -1: + cmpb $0,%cl # the first CPU calls start_kernel + jne initialize_secondary # all other CPUs call initialize_secondary #endif /* CONFIG_SMP */ - call start_kernel -L6: - jmp L6 # main should never return here, but - # just in case, we know what happens. + jmp start_kernel /* * We depend on ET to be correct. This checks for 287/387. @@ -378,8 +371,65 @@ rp_sidt: addl $8,%edi dec %ecx jne rp_sidt + +.macro set_early_handler handler,trapno + lea \handler,%edx + movl $(__KERNEL_CS << 16),%eax + movw %dx,%ax + movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ + lea idt_table,%edi + movl %eax,8*\trapno(%edi) + movl %edx,8*\trapno+4(%edi) +.endm + + set_early_handler handler=early_divide_err,trapno=0 + set_early_handler handler=early_illegal_opcode,trapno=6 + set_early_handler handler=early_protection_fault,trapno=13 + set_early_handler handler=early_page_fault,trapno=14 + ret +early_divide_err: + xor %edx,%edx + pushl $0 /* fake errcode */ + jmp early_fault + +early_illegal_opcode: + movl $6,%edx + pushl $0 /* fake errcode */ + jmp early_fault + +early_protection_fault: + movl $13,%edx + jmp early_fault + +early_page_fault: + movl $14,%edx + jmp early_fault + +early_fault: + cld +#ifdef CONFIG_PRINTK + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag + movl %cr2,%eax + pushl %eax + pushl %edx /* trapno */ + pushl $fault_msg +#ifdef CONFIG_EARLY_PRINTK + call early_printk +#else + call printk +#endif +#endif +hlt_loop: + hlt + jmp hlt_loop + /* This is the default interrupt "handler" :-) */ ALIGN ignore_int: @@ -393,6 +443,9 @@ ignore_int: movl $(__KERNEL_DS),%eax movl %eax,%ds movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag pushl 16(%esp) pushl 24(%esp) pushl 32(%esp) @@ -438,9 +491,16 @@ ENTRY(stack_start) ready: .byte 0 +early_recursion_flag: + .long 0 + int_msg: .asciz "Unknown interrupt or fault at EIP %p %p %p\n" +fault_msg: + .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" + .asciz "Stack: %p %p %p %p %p %p %p %p\n" + /* * The IDT and GDT 'descriptors' are a strange 48-bit object * only used by the lidt and lgdt instructions. They are not diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c new file mode 100644 index 000000000000..17647a530b2f --- /dev/null +++ b/arch/i386/kernel/hpet.c @@ -0,0 +1,67 @@ +#include <linux/clocksource.h> +#include <linux/errno.h> +#include <linux/hpet.h> +#include <linux/init.h> + +#include <asm/hpet.h> +#include <asm/io.h> + +#define HPET_MASK CLOCKSOURCE_MASK(32) +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem* hpet_base; + u64 tmp; + + if (!is_hpet_enabled()) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * aproximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 036a9857936f..e3d4b73bfdb0 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -1,4 +1,3 @@ -#include <linux/config.h> #include <linux/module.h> #include <asm/checksum.h> #include <asm/desc.h> diff --git a/arch/i386/kernel/i387.c b/arch/i386/kernel/i387.c index c4351972d9af..665847281ed2 100644 --- a/arch/i386/kernel/i387.c +++ b/arch/i386/kernel/i387.c @@ -8,7 +8,6 @@ * Gareth Hughes <gareth@valinux.com>, May 2000 */ -#include <linux/config.h> #include <linux/sched.h> #include <linux/module.h> #include <asm/processor.h> diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c new file mode 100644 index 000000000000..477b24daff53 --- /dev/null +++ b/arch/i386/kernel/i8253.c @@ -0,0 +1,118 @@ +/* + * i8253.c 8253/PIT functions + * + */ +#include <linux/clocksource.h> +#include <linux/spinlock.h> +#include <linux/jiffies.h> +#include <linux/sysdev.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <asm/smp.h> +#include <asm/delay.h> +#include <asm/i8253.h> +#include <asm/io.h> + +#include "io_ports.h" + +DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); + +void setup_pit_timer(void) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + udelay(10); + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); +} + +/* + * Since the PIT overflows every tick, its not very useful + * to just read by itself. So use jiffies to emulate a free + * running counter: + */ +static cycle_t pit_read(void) +{ + unsigned long flags; + int count; + u32 jifs; + static int old_count; + static u32 old_jifs; + + spin_lock_irqsave(&i8253_lock, flags); + /* + * Although our caller may have the read side of xtime_lock, + * this is now a seqlock, and we are cheating in this routine + * by having side effects on state that we cannot undo if + * there is a collision on the seqlock and our caller has to + * retry. (Namely, old_jifs and old_count.) So we must treat + * jiffies as volatile despite the lock. We read jiffies + * before latching the timer count to guarantee that although + * the jiffies value might be older than the count (that is, + * the counter may underflow between the last point where + * jiffies was incremented and the point where we latch the + * count), it cannot be newer. + */ + jifs = jiffies; + outb_p(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_p(PIT_CH0); /* read the latched count */ + count |= inb_p(PIT_CH0) << 8; + + /* VIA686a test code... reset the latch if count > max + 1 */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + + /* + * It's possible for count to appear to go the wrong way for a + * couple of reasons: + * + * 1. The timer counter underflows, but we haven't handled the + * resulting interrupt and incremented jiffies yet. + * 2. Hardware problem with the timer, not giving us continuous time, + * the counter does small "jumps" upwards on some Pentium systems, + * (see c't 95/10 page 335 for Neptun bug.) + * + * Previous attempts to handle these cases intelligently were + * buggy, so we just do the simple thing now. + */ + if (count > old_count && jifs == old_jifs) { + count = old_count; + } + old_count = count; + old_jifs = jifs; + + spin_unlock_irqrestore(&i8253_lock, flags); + + count = (LATCH - 1) - count; + + return (cycle_t)(jifs * LATCH) + count; +} + +static struct clocksource clocksource_pit = { + .name = "pit", + .rating = 110, + .read = pit_read, + .mask = CLOCKSOURCE_MASK(32), + .mult = 0, + .shift = 20, +}; + +static int __init init_pit_clocksource(void) +{ + if (num_possible_cpus() > 4) /* PIT does not scale! */ + return 0; + + clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); + return clocksource_register(&clocksource_pit); +} +module_init(init_pit_clocksource); diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index b7636b96e104..ea5f4e7958d8 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -1,4 +1,3 @@ -#include <linux/config.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/sched.h> @@ -46,6 +45,8 @@ static void end_8259A_irq (unsigned int irq) #define shutdown_8259A_irq disable_8259A_irq +static int i8259A_auto_eoi; + static void mask_and_ack_8259A(unsigned int); unsigned int startup_8259A_irq(unsigned int irq) @@ -132,7 +133,7 @@ void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); - irq_desc[irq].handler = &i8259A_irq_type; + irq_desc[irq].chip = &i8259A_irq_type; enable_irq(irq); } @@ -175,7 +176,7 @@ static void mask_and_ack_8259A(unsigned int irq) * Lightweight spurious IRQ detection. We do not want * to overdo spurious IRQ handling - it's usually a sign * of hardware problems, so we only do the checks we can - * do without slowing down good hardware unnecesserily. + * do without slowing down good hardware unnecessarily. * * Note that IRQ7 and IRQ15 (the two spurious IRQs * usually resulting from the 8259A-1|2 PICs) occur @@ -254,7 +255,7 @@ static void save_ELCR(char *trigger) static int i8259A_resume(struct sys_device *dev) { - init_8259A(0); + init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); return 0; } @@ -302,6 +303,8 @@ void init_8259A(int auto_eoi) { unsigned long flags; + i8259A_auto_eoi = auto_eoi; + spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ @@ -386,12 +389,12 @@ void __init init_ISA_irqs (void) /* * 16 old-style INTA-cycle interrupts: */ - irq_desc[i].handler = &i8259A_irq_type; + irq_desc[i].chip = &i8259A_irq_type; } else { /* * 'high' PCI IRQs filled in on demand */ - irq_desc[i].handler = &no_irq_type; + irq_desc[i].chip = &no_irq_type; } } } diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index a62df3e764c5..fd0df75cfbda 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -25,7 +25,6 @@ #include <linux/init.h> #include <linux/delay.h> #include <linux/sched.h> -#include <linux/config.h> #include <linux/smp_lock.h> #include <linux/mc146818rtc.h> #include <linux/compiler.h> @@ -38,8 +37,10 @@ #include <asm/desc.h> #include <asm/timer.h> #include <asm/i8259.h> +#include <asm/nmi.h> #include <mach_apic.h> +#include <mach_apicdef.h> #include "io_ports.h" @@ -50,6 +51,7 @@ atomic_t irq_mis_count; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); int timer_over_8254 __initdata = 1; @@ -64,7 +66,7 @@ int sis_apic_bug = -1; */ int nr_ioapic_registers[MAX_IO_APICS]; -int disable_timer_pin_1 __initdata; +static int disable_timer_pin_1 __initdata; /* * Rough estimation of how many shared IRQs there are, can @@ -92,6 +94,34 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; #define vector_to_irq(vector) (vector) #endif + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + union entry_union eu; + eu.entry = e; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -199,13 +229,9 @@ static void unmask_IO_APIC_irq (unsigned int irq) static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) { struct IO_APIC_route_entry entry; - unsigned long flags; /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; @@ -214,10 +240,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) */ memset(&entry, 0, sizeof(entry)); entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); } static void clear_IO_APIC (void) @@ -579,7 +602,7 @@ static int balanced_irq(void *unused) /* push everything to CPU 0 to give us a starting point. */ for (i = 0 ; i < NR_IRQS ; i++) { - pending_irq_cpumask[i] = cpumask_of_cpu(0); + irq_desc[i].pending_mask = cpumask_of_cpu(0); set_pending_irq(i, cpumask_of_cpu(0)); } @@ -1161,10 +1184,17 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; int assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; + unsigned long flags; + int vector; - BUG_ON(irq >= NR_IRQ_VECTORS); - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) + BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); + + spin_lock_irqsave(&vector_lock, flags); + + if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { + spin_unlock_irqrestore(&vector_lock, flags); return IO_APIC_VECTOR(irq); + } next: current_vector += 8; if (current_vector == SYSCALL_VECTOR) @@ -1172,16 +1202,21 @@ next: if (current_vector >= FIRST_SYSTEM_VECTOR) { offset++; - if (!(offset%8)) + if (!(offset%8)) { + spin_unlock_irqrestore(&vector_lock, flags); return -ENOSPC; + } current_vector = FIRST_DEVICE_VECTOR + offset; } - vector_irq[current_vector] = irq; + vector = current_vector; + vector_irq[vector] = irq; if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = current_vector; + IO_APIC_VECTOR(irq) = vector; - return current_vector; + spin_unlock_irqrestore(&vector_lock, flags); + + return vector; } static struct hw_interrupt_type ioapic_level_type; @@ -1191,23 +1226,18 @@ static struct hw_interrupt_type ioapic_edge_type; #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 -static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) +static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - if (use_pci_vector() && !platform_legacy_irq(irq)) { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[vector].handler = &ioapic_level_type; - else - irq_desc[vector].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[vector]); - } else { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[irq].handler = &ioapic_level_type; - else - irq_desc[irq].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[irq]); - } + unsigned idx; + + idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + irq_desc[idx].chip = &ioapic_level_type; + else + irq_desc[idx].chip = &ioapic_edge_type; + set_intr_gate(vector, interrupt[idx]); } static void __init setup_IO_APIC_irqs(void) @@ -1275,9 +1305,8 @@ static void __init setup_IO_APIC_irqs(void) if (!apic && (irq < 16)) disable_8259A_irq(irq); } + ioapic_write_entry(apic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1293,7 +1322,6 @@ static void __init setup_IO_APIC_irqs(void) static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry,0,sizeof(entry)); @@ -1318,15 +1346,12 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... */ - irq_desc[0].handler = &ioapic_edge_type; + irq_desc[0].chip = &ioapic_edge_type; /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry); enable_8259A_irq(0); } @@ -1436,10 +1461,7 @@ void __init print_IO_APIC(void) for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, i); printk(KERN_DEBUG " %02x %03X %02X ", i, @@ -1658,10 +1680,7 @@ static void __init enable_IO_APIC(void) /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { struct IO_APIC_route_entry entry; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry = ioapic_read_entry(apic, pin); /* If the interrupt line is enabled and in ExtInt mode @@ -1718,7 +1737,6 @@ void disable_IO_APIC(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; - unsigned long flags; memset(&entry, 0, sizeof(entry)); entry.mask = 0; /* Enabled */ @@ -1735,12 +1753,7 @@ void disable_IO_APIC(void) /* * Add it to the IO-APIC irq-routing table: */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin, - *(((int *)&entry)+1)); - io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin, - *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } disconnect_bsp_APIC(ioapic_i8259.pin != -1); } @@ -2062,6 +2075,13 @@ static void set_ioapic_affinity_vector (unsigned int vector, #endif #endif +static int ioapic_retrigger(unsigned int irq) +{ + send_IPI_self(IO_APIC_VECTOR(irq)); + + return 1; +} + /* * Level and edge triggered IO-APIC interrupts need different handling, * so we use two separate IRQ descriptors. Edge triggered IRQs can be @@ -2081,6 +2101,7 @@ static struct hw_interrupt_type ioapic_edge_type __read_mostly = { #ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity, #endif + .retrigger = ioapic_retrigger, }; static struct hw_interrupt_type ioapic_level_type __read_mostly = { @@ -2094,6 +2115,7 @@ static struct hw_interrupt_type ioapic_level_type __read_mostly = { #ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity, #endif + .retrigger = ioapic_retrigger, }; static inline void init_IO_APIC_traps(void) @@ -2128,7 +2150,7 @@ static inline void init_IO_APIC_traps(void) make_8259A_irq(irq); else /* Strange. Oh, well.. */ - irq_desc[irq].handler = &no_irq_type; + irq_desc[irq].chip = &no_irq_type; } } } @@ -2196,17 +2218,13 @@ static inline void unlock_ExtINT_logic(void) int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; - unsigned long flags; pin = find_isa_irq_pin(8, mp_INT); apic = find_isa_irq_apic(8, mp_INT); if (pin == -1) return; - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); @@ -2219,10 +2237,7 @@ static inline void unlock_ExtINT_logic(void) entry1.trigger = 0; entry1.vector = 0; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry1); save_control = CMOS_READ(RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); @@ -2241,10 +2256,7 @@ static inline void unlock_ExtINT_logic(void) CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); clear_IO_APIC_pin(apic, pin); - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); + ioapic_write_entry(apic, pin, entry0); } int timer_uses_ioapic_pin_0; @@ -2344,7 +2356,7 @@ static inline void check_timer(void) printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); disable_8259A_irq(0); - irq_desc[0].handler = &lapic_irq_type; + irq_desc[0].chip = &lapic_irq_type; apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2444,17 +2456,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { struct IO_APIC_route_entry *entry; struct sysfs_ioapic_data *data; - unsigned long flags; int i; data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + entry[i] = ioapic_read_entry(dev->id, i); return 0; } @@ -2476,11 +2483,9 @@ static int ioapic_resume(struct sys_device *dev) reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; io_apic_write(dev->id, 0, reg_00.raw); } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + ioapic_write_entry(dev->id, i, entry[i]); return 0; } @@ -2677,9 +2682,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a if (!ioapic && (irq < 16)) disable_8259A_irq(irq); + ioapic_write_entry(ioapic, pin, entry); spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2687,3 +2691,25 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a } #endif /* CONFIG_ACPI */ + +static int __init parse_disable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", parse_disable_timer_pin_1); + +static int __init parse_enable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = -1; + return 0; +} +early_param("enable_timer_pin_1", parse_enable_timer_pin_1); + +static int __init parse_noapic(char *arg) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c index 79026f026b85..498e8bc197d5 100644 --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -79,6 +79,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) memset(bitmap, 0xff, IO_BITMAP_BYTES); t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); } /* diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 248e922ee13a..5fe547cd8f9f 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -53,13 +53,19 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; */ fastcall unsigned int do_IRQ(struct pt_regs *regs) { - /* high bits used in ret_from_ code */ - int irq = regs->orig_eax & 0xff; + /* high bit used in ret_from_ code */ + int irq = ~regs->orig_eax; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; u32 *isp; #endif + if (unlikely((unsigned)irq >= NR_IRQS)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d\n", + __FUNCTION__, irq); + BUG(); + } + irq_enter(); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ @@ -95,6 +101,14 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) irqctx->tinfo.task = curctx->tinfo.task; irqctx->tinfo.previous_esp = current_stack_pointer; + /* + * Copy the softirq bits in preempt_count so that the + * softirq checks work in the hardirq context. + */ + irqctx->tinfo.preempt_count = + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | + (curctx->tinfo.preempt_count & SOFTIRQ_MASK); + asm volatile( " xchgl %%ebx,%%esp \n" " call __do_IRQ \n" @@ -147,7 +161,7 @@ void irq_ctx_init(int cpu) irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; - irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET; + irqctx->tinfo.preempt_count = 0; irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); softirq_ctx[cpu] = irqctx; @@ -192,6 +206,10 @@ asmlinkage void do_softirq(void) : "0"(isp) : "memory", "cc", "edx", "ecx", "eax" ); + /* + * Shouldnt happen, we returned above if in_interrupt(): + */ + WARN_ON_ONCE(softirq_count()); } local_irq_restore(flags); @@ -219,7 +237,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) - seq_printf(p, "CPU%d ",j); + seq_printf(p, "CPU%-8d",j); seq_putc(p, '\n'); } @@ -235,7 +253,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].handler->typename); + seq_printf(p, " %14s", irq_desc[i].chip->typename); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) @@ -277,13 +295,13 @@ void fixup_irqs(cpumask_t map) if (irq == 2) continue; - cpus_and(mask, irq_affinity[irq], map); + cpus_and(mask, irq_desc[irq].affinity, map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); mask = map; } - if (irq_desc[irq].handler->set_affinity) - irq_desc[irq].handler->set_affinity(irq, mask); + if (irq_desc[irq].chip->set_affinity) + irq_desc[irq].chip->set_affinity(irq, mask); else if (irq_desc[irq].action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 395a9a6dff88..afe6505ca0b3 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -28,7 +28,6 @@ * <prasanna@in.ibm.com> added function-return probes. */ -#include <linux/config.h> #include <linux/kprobes.h> #include <linux/ptrace.h> #include <linux/preempt.h> @@ -57,34 +56,85 @@ static __always_inline void set_jmp_op(void *from, void *to) /* * returns non-zero if opcodes can be boosted. */ -static __always_inline int can_boost(kprobe_opcode_t opcode) +static __always_inline int can_boost(kprobe_opcode_t *opcodes) { - switch (opcode & 0xf0 ) { +#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + /* + * Undefined/reserved opcodes, conditional jump, Opcode Extension + * Groups, and some special opcodes can not be boost. + */ + static const unsigned long twobyte_is_boostable[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */ + W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */ + W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */ + W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ + W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */ + W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */ + W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */ + W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */ + W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */ + W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */ + W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */ + W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */ + W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */ + W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + }; +#undef W + kprobe_opcode_t opcode; + kprobe_opcode_t *orig_opcodes = opcodes; +retry: + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + opcode = *(opcodes++); + + /* 2nd-byte opcode */ + if (opcode == 0x0f) { + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + return test_bit(*opcodes, twobyte_is_boostable); + } + + switch (opcode & 0xf0) { + case 0x60: + if (0x63 < opcode && opcode < 0x67) + goto retry; /* prefixes */ + /* can't boost Address-size override and bound */ + return (opcode != 0x62 && opcode != 0x67); case 0x70: return 0; /* can't boost conditional jump */ - case 0x90: - /* can't boost call and pushf */ - return opcode != 0x9a && opcode != 0x9c; case 0xc0: - /* can't boost undefined opcodes and soft-interruptions */ - return (0xc1 < opcode && opcode < 0xc6) || - (0xc7 < opcode && opcode < 0xcc) || opcode == 0xcf; + /* can't boost software-interruptions */ + return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; case 0xd0: /* can boost AA* and XLAT */ return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); case 0xe0: - /* can boost in/out and (may be) jmps */ - return (0xe3 < opcode && opcode != 0xe8); + /* can boost in/out and absolute jmps */ + return ((opcode & 0x04) || opcode == 0xea); case 0xf0: + if ((opcode & 0x0c) == 0 && opcode != 0xf1) + goto retry; /* lock/rep(ne) prefix */ /* clear and set flags can be boost */ return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); default: - /* currently, can't boost 2 bytes opcodes */ - return opcode != 0x0f; + if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) + goto retry; /* prefixes */ + /* can't boost CS override and call */ + return (opcode != 0x2e && opcode != 0x9a); } } - /* * returns non-zero if opcode modifies the interrupt flag. */ @@ -109,7 +159,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); p->opcode = *p->addr; - if (can_boost(p->opcode)) { + if (can_boost(p->addr)) { p->ainsn.boostable = 0; } else { p->ainsn.boostable = -1; @@ -206,9 +256,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) int ret = 0; kprobe_opcode_t *addr; struct kprobe_ctlblk *kcb; -#ifdef CONFIG_PREEMPT - unsigned pre_preempt_count = preempt_count(); -#endif /* CONFIG_PREEMPT */ addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); @@ -285,22 +332,16 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) /* handler has already set things up, so skip ss setup */ return 1; - if (p->ainsn.boostable == 1 && -#ifdef CONFIG_PREEMPT - !(pre_preempt_count) && /* - * This enables booster when the direct - * execution path aren't preempted. - */ -#endif /* CONFIG_PREEMPT */ - !p->post_handler && !p->break_handler ) { +ss_probe: +#ifndef CONFIG_PREEMPT + if (p->ainsn.boostable == 1 && !p->post_handler){ /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->eip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return 1; } - -ss_probe: +#endif prepare_singlestep(p, regs); kcb->kprobe_status = KPROBE_HIT_SS; return 1; diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c index f73d7374a2ba..91966bafb3dc 100644 --- a/arch/i386/kernel/machine_kexec.c +++ b/arch/i386/kernel/machine_kexec.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/kexec.h> #include <linux/delay.h> +#include <linux/init.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> @@ -20,70 +21,13 @@ #include <asm/system.h> #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) - -#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define L2_ATTR (_PAGE_PRESENT) - -#define LEVEL0_SIZE (1UL << 12UL) - -#ifndef CONFIG_X86_PAE -#define LEVEL1_SIZE (1UL << 22UL) -static u32 pgtable_level1[1024] PAGE_ALIGNED; - -static void identity_map_page(unsigned long address) -{ - unsigned long level1_index, level2_index; - u32 *pgtable_level2; - - /* Find the current page table */ - pgtable_level2 = __va(read_cr3()); - - /* Find the indexes of the physical address to identity map */ - level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; - level2_index = address / LEVEL1_SIZE; - - /* Identity map the page table entry */ - pgtable_level1[level1_index] = address | L0_ATTR; - pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; - - /* Flush the tlb so the new mapping takes effect. - * Global tlb entries are not flushed but that is not an issue. - */ - load_cr3(pgtable_level2); -} - -#else -#define LEVEL1_SIZE (1UL << 21UL) -#define LEVEL2_SIZE (1UL << 30UL) -static u64 pgtable_level1[512] PAGE_ALIGNED; -static u64 pgtable_level2[512] PAGE_ALIGNED; - -static void identity_map_page(unsigned long address) -{ - unsigned long level1_index, level2_index, level3_index; - u64 *pgtable_level3; - - /* Find the current page table */ - pgtable_level3 = __va(read_cr3()); - - /* Find the indexes of the physical address to identity map */ - level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; - level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; - level3_index = address / LEVEL2_SIZE; - - /* Identity map the page table entry */ - pgtable_level1[level1_index] = address | L0_ATTR; - pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; - set_64bit(&pgtable_level3[level3_index], - __pa(pgtable_level2) | L2_ATTR); - - /* Flush the tlb so the new mapping takes effect. - * Global tlb entries are not flushed but that is not an issue. - */ - load_cr3(pgtable_level3); -} +static u32 kexec_pgd[1024] PAGE_ALIGNED; +#ifdef CONFIG_X86_PAE +static u32 kexec_pmd0[1024] PAGE_ALIGNED; +static u32 kexec_pmd1[1024] PAGE_ALIGNED; #endif +static u32 kexec_pte0[1024] PAGE_ALIGNED; +static u32 kexec_pte1[1024] PAGE_ALIGNED; static void set_idt(void *newidt, __u16 limit) { @@ -127,16 +71,6 @@ static void load_segments(void) #undef __STR } -typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( - unsigned long indirection_page, - unsigned long reboot_code_buffer, - unsigned long start_address, - unsigned int has_pae) ATTRIB_NORET; - -const extern unsigned char relocate_new_kernel[]; -extern void relocate_new_kernel_end(void); -const extern unsigned int relocate_new_kernel_size; - /* * A architecture hook called to validate the * proposed image and prepare the control pages @@ -169,34 +103,35 @@ void machine_kexec_cleanup(struct kimage *image) */ NORET_TYPE void machine_kexec(struct kimage *image) { - unsigned long page_list; - unsigned long reboot_code_buffer; - - relocate_new_kernel_t rnk; + unsigned long page_list[PAGES_NR]; + void *control_page; /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); - /* Compute some offsets */ - reboot_code_buffer = page_to_pfn(image->control_code_page) - << PAGE_SHIFT; - page_list = image->head; - - /* Set up an identity mapping for the reboot_code_buffer */ - identity_map_page(reboot_code_buffer); - - /* copy it out */ - memcpy((void *)reboot_code_buffer, relocate_new_kernel, - relocate_new_kernel_size); - - /* The segment registers are funny things, they are - * automatically loaded from a table, in memory wherever you - * set them to a specific selector, but this table is never - * accessed again you set the segment to a different selector. - * - * The more common model is are caches where the behide - * the scenes work is done, but is also dropped at arbitrary - * times. + control_page = page_address(image->control_code_page); + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[PA_PGD] = __pa(kexec_pgd); + page_list[VA_PGD] = (unsigned long)kexec_pgd; +#ifdef CONFIG_X86_PAE + page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; + page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; +#endif + page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + + /* The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. * * I take advantage of this here by force loading the * segments, before I zap the gdt with an invalid value. @@ -209,6 +144,28 @@ NORET_TYPE void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0),0); /* now call it */ - rnk = (relocate_new_kernel_t) reboot_code_buffer; - (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start, cpu_has_pae); +} + +/* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never sets it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ +static int __init parse_crashkernel(char *arg) +{ + unsigned long size, base; + size = memparse(arg, &arg); + if (*arg == '@') { + base = memparse(arg+1, &arg); + /* FIXME: Do I want a sanity check + * to validate the memory range? + */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + return 0; } +early_param("crashkernel", parse_crashkernel); diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c index 558bb207720f..eb57a851789d 100644 --- a/arch/i386/kernel/mca.c +++ b/arch/i386/kernel/mca.c @@ -42,11 +42,11 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mca.h> +#include <linux/kprobes.h> #include <asm/system.h> #include <asm/io.h> #include <linux/proc_fs.h> #include <linux/mman.h> -#include <linux/config.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/ioport.h> @@ -415,7 +415,8 @@ subsys_initcall(mca_init); /*--------------------------------------------------------------------*/ -static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) +static __kprobes void +mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) { int slot = mca_dev->slot; @@ -445,7 +446,7 @@ static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) /*--------------------------------------------------------------------*/ -static int mca_handle_nmi_callback(struct device *dev, void *data) +static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data) { struct mca_device *mca_dev = to_mca_device(dev); unsigned char pos5; @@ -463,7 +464,7 @@ static int mca_handle_nmi_callback(struct device *dev, void *data) return 0; } -void mca_handle_nmi(void) +void __kprobes mca_handle_nmi(void) { /* First try - scan the various adapters and see if a specific * adapter was responsible for the error. diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c index 0a865889b2a9..9b9479768d5e 100644 --- a/arch/i386/kernel/microcode.c +++ b/arch/i386/kernel/microcode.c @@ -2,6 +2,7 @@ * Intel CPU Microcode Update Driver for Linux * * Copyright (C) 2000-2004 Tigran Aivazian + * 2006 Shaohua Li <shaohua.li@intel.com> * * This driver allows to upgrade microcode on Intel processors * belonging to IA-32 family - PentiumPro, Pentium II, @@ -82,6 +83,9 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/cpu.h> +#include <linux/firmware.h> +#include <linux/platform_device.h> #include <asm/msr.h> #include <asm/uaccess.h> @@ -91,9 +95,6 @@ MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>"); MODULE_LICENSE("GPL"); -static int verbose; -module_param(verbose, int, 0644); - #define MICROCODE_VERSION "1.14a" #define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ @@ -120,55 +121,40 @@ static DEFINE_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); -static void __user *user_buffer; /* user area microcode data buffer */ -static unsigned int user_buffer_size; /* it's size */ - -typedef enum mc_error_code { - MC_SUCCESS = 0, - MC_IGNORED = 1, - MC_NOTFOUND = 2, - MC_MARKED = 3, - MC_ALLOCATED = 4, -} mc_error_code_t; - static struct ucode_cpu_info { + int valid; unsigned int sig; - unsigned int pf, orig_pf; + unsigned int pf; unsigned int rev; - unsigned int cksum; - mc_error_code_t err; microcode_t *mc; } ucode_cpu_info[NR_CPUS]; - -static int microcode_open (struct inode *unused1, struct file *unused2) -{ - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} -static void collect_cpu_info (void *unused) +static void collect_cpu_info(int cpu_num) { - int cpu_num = smp_processor_id(); struct cpuinfo_x86 *c = cpu_data + cpu_num; struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; unsigned int val[2]; - uci->sig = uci->pf = uci->rev = uci->cksum = 0; - uci->err = MC_NOTFOUND; + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu_num); + uci->pf = uci->rev = 0; uci->mc = NULL; + uci->valid = 1; if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { - printk(KERN_ERR "microcode: CPU%d not a capable Intel processor\n", cpu_num); + printk(KERN_ERR "microcode: CPU%d not a capable Intel " + "processor\n", cpu_num); + uci->valid = 0; return; - } else { - uci->sig = cpuid_eax(0x00000001); + } - if ((c->x86_model >= 5) || (c->x86 > 6)) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - uci->pf = 1 << ((val[1] >> 18) & 7); - } - uci->orig_pf = uci->pf; + uci->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + uci->pf = 1 << ((val[1] >> 18) & 7); } wrmsr(MSR_IA32_UCODE_REV, 0, 0); @@ -180,218 +166,159 @@ static void collect_cpu_info (void *unused) uci->sig, uci->pf, uci->rev); } -static inline void mark_microcode_update (int cpu_num, microcode_header_t *mc_header, int sig, int pf, int cksum) +static inline int microcode_update_match(int cpu_num, + microcode_header_t *mc_header, int sig, int pf) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - pr_debug("Microcode Found.\n"); - pr_debug(" Header Revision 0x%x\n", mc_header->hdrver); - pr_debug(" Loader Revision 0x%x\n", mc_header->ldrver); - pr_debug(" Revision 0x%x \n", mc_header->rev); - pr_debug(" Date %x/%x/%x\n", - ((mc_header->date >> 24 ) & 0xff), - ((mc_header->date >> 16 ) & 0xff), - (mc_header->date & 0xFFFF)); - pr_debug(" Signature 0x%x\n", sig); - pr_debug(" Type 0x%x Family 0x%x Model 0x%x Stepping 0x%x\n", - ((sig >> 12) & 0x3), - ((sig >> 8) & 0xf), - ((sig >> 4) & 0xf), - ((sig & 0xf))); - pr_debug(" Processor Flags 0x%x\n", pf); - pr_debug(" Checksum 0x%x\n", cksum); - - if (mc_header->rev < uci->rev) { - if (uci->err == MC_NOTFOUND) { - uci->err = MC_IGNORED; - uci->cksum = mc_header->rev; - } else if (uci->err == MC_IGNORED && uci->cksum < mc_header->rev) - uci->cksum = mc_header->rev; - } else if (mc_header->rev == uci->rev) { - if (uci->err < MC_MARKED) { - /* notify the caller of success on this cpu */ - uci->err = MC_SUCCESS; - } - } else if (uci->err != MC_ALLOCATED || mc_header->rev > uci->mc->hdr.rev) { - pr_debug("microcode: CPU%d found a matching microcode update with " - " revision 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev); - uci->cksum = cksum; - uci->pf = pf; /* keep the original mc pf for cksum calculation */ - uci->err = MC_MARKED; /* found the match */ - for_each_online_cpu(cpu_num) { - if (ucode_cpu_info + cpu_num != uci - && ucode_cpu_info[cpu_num].mc == uci->mc) { - uci->mc = NULL; - break; - } - } - if (uci->mc != NULL) { - vfree(uci->mc); - uci->mc = NULL; - } - } - return; + if (!sigmatch(sig, uci->sig, pf, uci->pf) + || mc_header->rev <= uci->rev) + return 0; + return 1; } -static int find_matching_ucodes (void) +static int microcode_sanity_check(void *mc) { - int cursor = 0; - int error = 0; - - while (cursor + MC_HEADER_SIZE < user_buffer_size) { - microcode_header_t mc_header; - void *newmc = NULL; - int i, sum, cpu_num, allocated_flag, total_size, data_size, ext_table_size; + microcode_header_t *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + struct extended_signature *ext_sig; + unsigned long total_size, data_size, ext_table_size; + int sum, orig_sum, ext_sigcount = 0, i; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + if (data_size + MC_HEADER_SIZE > total_size) { + printk(KERN_ERR "microcode: error! " + "Bad data size in microcode data file\n"); + return -EINVAL; + } - if (copy_from_user(&mc_header, user_buffer + cursor, MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - error = -EFAULT; - goto out; + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + printk(KERN_ERR "microcode: error! " + "Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + printk(KERN_ERR "microcode: error! " + "Small exttable size in microcode data file\n"); + return -EINVAL; } - - total_size = get_totalsize(&mc_header); - if ((cursor + total_size > user_buffer_size) || (total_size < DEFAULT_UCODE_TOTALSIZE)) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - error = -EINVAL; - goto out; + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + printk(KERN_ERR "microcode: error! " + "Bad exttable size in microcode data file\n"); + return -EFAULT; } + ext_sigcount = ext_header->count; + } - data_size = get_datasize(&mc_header); - if ((data_size + MC_HEADER_SIZE > total_size) || (data_size < DEFAULT_UCODE_DATASIZE)) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - error = -EINVAL; - goto out; + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int *ext_tablep = (int *)ext_header; + + i = ext_table_size / DWSIZE; + while (i--) + ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + printk(KERN_WARNING "microcode: aborting, " + "bad extended signature table checksum\n"); + return -EINVAL; } + } - if (mc_header.ldrver != 1 || mc_header.hdrver != 1) { - printk(KERN_ERR "microcode: error! Unknown microcode update format\n"); - error = -EINVAL; - goto out; + /* calculate the checksum */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) + orig_sum += ((int *)mc)[i]; + if (orig_sum) { + printk(KERN_ERR "microcode: aborting, bad checksum\n"); + return -EINVAL; + } + if (!ext_table_size) + return 0; + /* check extended signature checksum */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (struct extended_signature *)((void *)ext_header + + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i); + sum = orig_sum + - (mc_header->sig + mc_header->pf + mc_header->cksum) + + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + printk(KERN_ERR "microcode: aborting, bad checksum\n"); + return -EINVAL; } + } + return 0; +} - for_each_online_cpu(cpu_num) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - - if (sigmatch(mc_header.sig, uci->sig, mc_header.pf, uci->orig_pf)) - mark_microcode_update(cpu_num, &mc_header, mc_header.sig, mc_header.pf, mc_header.cksum); - } +/* + * return 0 - no update found + * return 1 - found update + * return < 0 - error + */ +static int get_maching_microcode(void *mc, int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + microcode_header_t *mc_header = mc; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + void *new_mc; + + if (microcode_update_match(cpu, mc_header, + mc_header->sig, mc_header->pf)) + goto find; + + if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + return 0; + + ext_header = (struct extended_sigtable *)(mc + + get_datasize(mc_header) + MC_HEADER_SIZE); + ext_sigcount = ext_header->count; + ext_sig = (struct extended_signature *)((void *)ext_header + + EXT_HEADER_SIZE); + for (i = 0; i < ext_sigcount; i++) { + if (microcode_update_match(cpu, mc_header, + ext_sig->sig, ext_sig->pf)) + goto find; + ext_sig++; + } + return 0; +find: + pr_debug("microcode: CPU %d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev); + new_mc = vmalloc(total_size); + if (!new_mc) { + printk(KERN_ERR "microcode: error! Can not allocate memory\n"); + return -ENOMEM; + } - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - struct extended_sigtable ext_header; - struct extended_signature ext_sig; - int ext_sigcount; + /* free previous update file */ + vfree(uci->mc); - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - error = -EINVAL; - goto out; - } - if (copy_from_user(&ext_header, user_buffer + cursor - + MC_HEADER_SIZE + data_size, EXT_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - error = -EFAULT; - goto out; - } - if (ext_table_size != exttable_size(&ext_header)) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - error = -EFAULT; - goto out; - } - - ext_sigcount = ext_header.count; - - for (i = 0; i < ext_sigcount; i++) { - if (copy_from_user(&ext_sig, user_buffer + cursor + MC_HEADER_SIZE + data_size + EXT_HEADER_SIZE - + EXT_SIGNATURE_SIZE * i, EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - error = -EFAULT; - goto out; - } - for_each_online_cpu(cpu_num) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - - if (sigmatch(ext_sig.sig, uci->sig, ext_sig.pf, uci->orig_pf)) { - mark_microcode_update(cpu_num, &mc_header, ext_sig.sig, ext_sig.pf, ext_sig.cksum); - } - } - } - } - /* now check if any cpu has matched */ - allocated_flag = 0; - sum = 0; - for_each_online_cpu(cpu_num) { - if (ucode_cpu_info[cpu_num].err == MC_MARKED) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - if (!allocated_flag) { - allocated_flag = 1; - newmc = vmalloc(total_size); - if (!newmc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - error = -ENOMEM; - goto out; - } - if (copy_from_user(newmc + MC_HEADER_SIZE, - user_buffer + cursor + MC_HEADER_SIZE, - total_size - MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - vfree(newmc); - error = -EFAULT; - goto out; - } - memcpy(newmc, &mc_header, MC_HEADER_SIZE); - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int * ext_tablep = (((void *) newmc) + MC_HEADER_SIZE + data_size); - i = ext_table_size / DWSIZE; - while (i--) ext_table_sum += ext_tablep[i]; - if (ext_table_sum) { - printk(KERN_WARNING "microcode: aborting, bad extended signature table checksum\n"); - vfree(newmc); - error = -EINVAL; - goto out; - } - } - - /* calculate the checksum */ - i = (MC_HEADER_SIZE + data_size) / DWSIZE; - while (i--) sum += ((int *)newmc)[i]; - sum -= (mc_header.sig + mc_header.pf + mc_header.cksum); - } - ucode_cpu_info[cpu_num].mc = newmc; - ucode_cpu_info[cpu_num].err = MC_ALLOCATED; /* mc updated */ - if (sum + uci->sig + uci->pf + uci->cksum != 0) { - printk(KERN_ERR "microcode: CPU%d aborting, bad checksum\n", cpu_num); - error = -EINVAL; - goto out; - } - } - } - cursor += total_size; /* goto the next update patch */ - } /* end of while */ -out: - return error; + memcpy(new_mc, mc, total_size); + uci->mc = new_mc; + return 1; } -static void do_update_one (void * unused) +static void apply_microcode(int cpu) { unsigned long flags; unsigned int val[2]; - int cpu_num = smp_processor_id(); + int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - if (uci->mc == NULL) { - if (verbose) { - if (uci->err == MC_SUCCESS) - printk(KERN_INFO "microcode: CPU%d already at revision 0x%x\n", - cpu_num, uci->rev); - else - printk(KERN_INFO "microcode: No new microcode data for CPU%d\n", cpu_num); - } + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (uci->mc == NULL) return; - } /* serialize access to the physical write to MSR 0x79 */ spin_lock_irqsave(µcode_update_lock, flags); @@ -408,68 +335,107 @@ static void do_update_one (void * unused) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - /* notify the caller of success on this cpu */ - uci->err = MC_SUCCESS; spin_unlock_irqrestore(µcode_update_lock, flags); - printk(KERN_INFO "microcode: CPU%d updated from revision " + if (val[1] != uci->mc->hdr.rev) { + printk(KERN_ERR "microcode: CPU%d updated from revision " + "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); + return; + } + pr_debug("microcode: CPU%d updated from revision " "0x%x to 0x%x, date = %08x \n", cpu_num, uci->rev, val[1], uci->mc->hdr.date); - return; + uci->rev = val[1]; } -static int do_microcode_update (void) -{ - int i, error; +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static void __user *user_buffer; /* user area microcode data buffer */ +static unsigned int user_buffer_size; /* it's size */ - if (on_each_cpu(collect_cpu_info, NULL, 1, 1) != 0) { - printk(KERN_ERR "microcode: Error! Could not run on all processors\n"); - error = -EIO; - goto out; +static long get_next_ucode(void **mc, long offset) +{ + microcode_header_t mc_header; + unsigned long total_size; + + /* No more data */ + if (offset >= user_buffer_size) + return 0; + if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + return -EFAULT; } - - if ((error = find_matching_ucodes())) { - printk(KERN_ERR "microcode: Error in the microcode data\n"); - goto out_free; + total_size = get_totalsize(&mc_header); + if (offset + total_size > user_buffer_size) { + printk(KERN_ERR "microcode: error! Bad total size in microcode " + "data file\n"); + return -EINVAL; } - - if (on_each_cpu(do_update_one, NULL, 1, 1) != 0) { - printk(KERN_ERR "microcode: Error! Could not run on all processors\n"); - error = -EIO; + *mc = vmalloc(total_size); + if (!*mc) + return -ENOMEM; + if (copy_from_user(*mc, user_buffer + offset, total_size)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + vfree(*mc); + return -EFAULT; } + return offset + total_size; +} + +static int do_microcode_update (void) +{ + long cursor = 0; + int error = 0; + void *new_mc; + int cpu; + cpumask_t old; + + old = current->cpus_allowed; -out_free: - for_each_online_cpu(i) { - if (ucode_cpu_info[i].mc) { - int j; - void *tmp = ucode_cpu_info[i].mc; - vfree(tmp); - for_each_online_cpu(j) { - if (ucode_cpu_info[j].mc == tmp) - ucode_cpu_info[j].mc = NULL; - } + while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { + error = microcode_sanity_check(new_mc); + if (error) + goto out; + /* + * It's possible the data file has multiple matching ucode, + * lets keep searching till the latest version + */ + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!uci->valid) + continue; + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + error = get_maching_microcode(new_mc, cpu); + if (error < 0) + goto out; + if (error == 1) + apply_microcode(cpu); } - if (ucode_cpu_info[i].err == MC_IGNORED && verbose) - printk(KERN_WARNING "microcode: CPU%d not 'upgrading' to earlier revision" - " 0x%x (current=0x%x)\n", i, ucode_cpu_info[i].cksum, ucode_cpu_info[i].rev); + vfree(new_mc); } out: + if (cursor > 0) + vfree(new_mc); + if (cursor < 0) + error = cursor; + set_cpus_allowed(current, old); return error; } +static int microcode_open (struct inode *unused1, struct file *unused2) +{ + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) { ssize_t ret; - if (len < DEFAULT_UCODE_TOTALSIZE) { - printk(KERN_ERR "microcode: not enough data\n"); - return -EINVAL; - } - if ((len >> PAGE_SHIFT) > num_physpages) { printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); return -EINVAL; } + lock_cpu_hotplug(); mutex_lock(µcode_mutex); user_buffer = (void __user *) buf; @@ -480,6 +446,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_ ret = (ssize_t)len; mutex_unlock(µcode_mutex); + unlock_cpu_hotplug(); return ret; } @@ -493,11 +460,10 @@ static struct file_operations microcode_fops = { static struct miscdevice microcode_dev = { .minor = MICROCODE_MINOR, .name = "microcode", - .devfs_name = "cpu/microcode", .fops = µcode_fops, }; -static int __init microcode_init (void) +static int __init microcode_dev_init (void) { int error; @@ -509,6 +475,280 @@ static int __init microcode_init (void) return error; } + return 0; +} + +static void __exit microcode_dev_exit (void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while(0) +#endif + +static long get_next_ucode_from_buffer(void **mc, void *buf, + unsigned long size, long offset) +{ + microcode_header_t *mc_header; + unsigned long total_size; + + /* No more data */ + if (offset >= size) + return 0; + mc_header = (microcode_header_t *)(buf + offset); + total_size = get_totalsize(mc_header); + + if (offset + total_size > size) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + return -EINVAL; + } + + *mc = vmalloc(total_size); + if (!*mc) { + printk(KERN_ERR "microcode: error! Can not allocate memory\n"); + return -ENOMEM; + } + memcpy(*mc, buf + offset, total_size); + return offset + total_size; +} + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +static int cpu_request_microcode(int cpu) +{ + char name[30]; + struct cpuinfo_x86 *c = cpu_data + cpu; + const struct firmware *firmware; + void *buf; + unsigned long size; + long offset = 0; + int error; + void *mc; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + sprintf(name,"intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + error = request_firmware(&firmware, name, µcode_pdev->dev); + if (error) { + pr_debug("ucode data file %s load failed\n", name); + return error; + } + buf = (void *)firmware->data; + size = firmware->size; + while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) + > 0) { + error = microcode_sanity_check(mc); + if (error) + break; + error = get_maching_microcode(mc, cpu); + if (error < 0) + break; + /* + * It's possible the data file has multiple matching ucode, + * lets keep searching till the latest version + */ + if (error == 1) { + apply_microcode(cpu); + error = 0; + } + vfree(mc); + } + if (offset > 0) + vfree(mc); + if (offset < 0) + error = offset; + release_firmware(firmware); + + return error; +} + +static void microcode_init_cpu(int cpu) +{ + cpumask_t old; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + old = current->cpus_allowed; + + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + mutex_lock(µcode_mutex); + collect_cpu_info(cpu); + if (uci->valid) + cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + set_cpus_allowed(current, old); +} + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + mutex_lock(µcode_mutex); + uci->valid = 0; + vfree(uci->mc); + uci->mc = NULL; + mutex_unlock(µcode_mutex); +} + +static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + char *end; + unsigned long val = simple_strtoul(buf, &end, 0); + int err = 0; + int cpu = dev->id; + + if (end == buf) + return -EINVAL; + if (val == 1) { + cpumask_t old; + + old = current->cpus_allowed; + + lock_cpu_hotplug(); + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + + mutex_lock(µcode_mutex); + if (uci->valid) + err = cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + unlock_cpu_hotplug(); + set_cpus_allowed(current, old); + } + if (err) + return err; + return sz; +} + +static ssize_t version_show(struct sys_device *dev, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->rev); +} + +static ssize_t pf_show(struct sys_device *dev, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->pf); +} + +static SYSDEV_ATTR(reload, 0200, NULL, reload_store); +static SYSDEV_ATTR(version, 0400, version_show, NULL); +static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &attr_reload.attr, + &attr_version.attr, + &attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static int mc_sysdev_add(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!cpu_online(cpu)) + return 0; + pr_debug("Microcode:CPU %d added\n", cpu); + memset(uci, 0, sizeof(*uci)); + sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + + microcode_init_cpu(cpu); + return 0; +} + +static int mc_sysdev_remove(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + + if (!cpu_online(cpu)) + return 0; + pr_debug("Microcode:CPU %d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return 0; +} + +static int mc_sysdev_resume(struct sys_device *dev) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + pr_debug("Microcode:CPU %d resumed\n", cpu); + /* only CPU 0 will apply ucode here */ + apply_microcode(0); + return 0; +} + +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, +}; + +#ifdef CONFIG_HOTPLUG_CPU +static __cpuinit int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + mc_sysdev_add(sys_dev); + break; + case CPU_DOWN_PREPARE: + mc_sysdev_remove(sys_dev); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; +#endif + +static int __init microcode_init (void) +{ + int error; + + error = microcode_dev_init(); + if (error) + return error; + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + microcode_dev_exit(); + return PTR_ERR(microcode_pdev); + } + + lock_cpu_hotplug(); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + unlock_cpu_hotplug(); + if (error) { + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + return error; + } + + register_hotcpu_notifier(&mc_cpu_notifier); + printk(KERN_INFO "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n"); return 0; @@ -516,9 +756,16 @@ static int __init microcode_init (void) static void __exit microcode_exit (void) { - misc_deregister(µcode_dev); + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + + lock_cpu_hotplug(); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + unlock_cpu_hotplug(); + + platform_device_unregister(microcode_pdev); } module_init(microcode_init) module_exit(microcode_exit) -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 6b1392d33ed5..442aaf8c77eb 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -17,7 +17,6 @@ #include <linux/init.h> #include <linux/acpi.h> #include <linux/delay.h> -#include <linux/config.h> #include <linux/bootmem.h> #include <linux/smp_lock.h> #include <linux/kernel_stat.h> @@ -31,6 +30,7 @@ #include <asm/io_apic.h> #include <mach_apic.h> +#include <mach_apicdef.h> #include <mach_mpparse.h> #include <bios_ebda.h> @@ -69,7 +69,7 @@ unsigned int def_to_bigsmp = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ -static unsigned int __devinitdata num_processors; +unsigned int __cpuinitdata num_processors; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; @@ -229,12 +229,14 @@ static void __init MP_bus_info (struct mpc_config_bus *m) mpc_oem_bus_info(m, str, translation_table[mpc_record]); +#if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { printk(KERN_WARNING "MP table busid value (%d) for bustype %s " " is too large, max. supported is %d\n", m->mpc_busid, str, MAX_MP_BUSSES - 1); return; } +#endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; @@ -294,19 +296,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. - */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); } #ifdef CONFIG_X86_NUMAQ @@ -823,8 +812,7 @@ int es7000_plat; #ifdef CONFIG_ACPI -void __init mp_register_lapic_address ( - u64 address) +void __init mp_register_lapic_address(u64 address) { mp_lapic_addr = (unsigned long) address; @@ -836,13 +824,10 @@ void __init mp_register_lapic_address ( Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); } - -void __devinit mp_register_lapic ( - u8 id, - u8 enabled) +void __devinit mp_register_lapic (u8 id, u8 enabled) { struct mpc_config_processor processor; - int boot_cpu = 0; + int boot_cpu = 0; if (MAX_APICS - id <= 0) { printk(KERN_WARNING "Processor #%d invalid (max %d)\n", @@ -879,11 +864,9 @@ static struct mp_ioapic_routing { u32 pin_programmed[4]; } mp_ioapic_routing[MAX_IO_APICS]; - -static int mp_find_ioapic ( - int gsi) +static int mp_find_ioapic (int gsi) { - int i = 0; + int i = 0; /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { @@ -896,15 +879,11 @@ static int mp_find_ioapic ( return -1; } - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { - int idx = 0; - int tmpid; + int idx = 0; + int tmpid; if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " @@ -950,16 +929,10 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end); - - return; } - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { struct mpc_config_intsrc intsrc; int ioapic = -1; @@ -997,15 +970,13 @@ void __init mp_override_legacy_irq ( mp_irqs[mp_irq_entries] = intsrc; if (++mp_irq_entries == MAX_IRQ_SOURCES) panic("Max # of irq sources exceeded!\n"); - - return; } void __init mp_config_acpi_legacy_irqs (void) { struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; + int i = 0; + int ioapic = -1; /* * Fabricate the legacy ISA bus (bus #31). @@ -1074,12 +1045,12 @@ void __init mp_config_acpi_legacy_irqs (void) #define MAX_GSI_NUM 4096 -int mp_register_gsi (u32 gsi, int triggering, int polarity) +int mp_register_gsi(u32 gsi, int triggering, int polarity) { - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - static int pci_irq = 16; + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; /* * Mapping between Global System Interrups, which * represent all possible interrupts, and IRQs diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c index 7a328230e540..d535cdbbfd25 100644 --- a/arch/i386/kernel/msr.c +++ b/arch/i386/kernel/msr.c @@ -24,7 +24,6 @@ */ #include <linux/module.h> -#include <linux/config.h> #include <linux/types.h> #include <linux/errno.h> @@ -251,7 +250,9 @@ static int msr_class_device_create(int i) return err; } -static int msr_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +#ifdef CONFIG_HOTPLUG_CPU +static int msr_class_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -266,10 +267,11 @@ static int msr_class_cpu_callback(struct notifier_block *nfb, unsigned long acti return NOTIFY_OK; } -static struct notifier_block msr_class_cpu_notifier = +static struct notifier_block __cpuinitdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; +#endif static int __init msr_init(void) { @@ -292,7 +294,7 @@ static int __init msr_init(void) if (err != 0) goto out_class; } - register_cpu_notifier(&msr_class_cpu_notifier); + register_hotcpu_notifier(&msr_class_cpu_notifier); err = 0; goto out; @@ -315,7 +317,7 @@ static void __exit msr_exit(void) class_device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); class_destroy(msr_class); unregister_chrdev(MSR_MAJOR, "cpu/msr"); - unregister_cpu_notifier(&msr_class_cpu_notifier); + unregister_hotcpu_notifier(&msr_class_cpu_notifier); } module_init(msr_init); diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index d43b498ec745..0fc4997fb143 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -14,91 +14,184 @@ */ #include <linux/config.h> -#include <linux/mm.h> #include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> #include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> #include <linux/module.h> #include <linux/nmi.h> #include <linux/sysdev.h> #include <linux/sysctl.h> +#include <linux/percpu.h> +#include <linux/dmi.h> +#include <linux/kprobes.h> #include <asm/smp.h> -#include <asm/div64.h> #include <asm/nmi.h> +#include <asm/kdebug.h> +#include <asm/intel_arch_perfmon.h> #include "mach_traps.h" -unsigned int nmi_watchdog = NMI_NONE; -extern int unknown_nmi_panic; -static unsigned int nmi_hz = HZ; -static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ -static unsigned int nmi_p4_cccr_val; -extern void show_registers(struct pt_regs *regs); +int unknown_nmi_panic; +int nmi_watchdog_enabled; -/* - * lapic_nmi_owner tracks the ownership of the lapic NMI hardware: - * - it may be reserved by some other driver, or not - * - when not reserved by some other driver, it may be used for - * the NMI watchdog, or not - * - * This is maintained separately from nmi_active because the NMI - * watchdog may also be driven from the I/O APIC timer. +/* perfctr_nmi_owner tracks the ownership of the perfctr registers: + * evtsel_nmi_owner tracks the ownership of the event selection + * - different performance counters/ event selection may be reserved for + * different subsystems this reservation system just tries to coordinate + * things a little */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); -static unsigned int lapic_nmi_owner; -#define LAPIC_NMI_WATCHDOG (1<<0) -#define LAPIC_NMI_RESERVED (1<<1) +static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner); +static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); + +/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's + * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) + */ +#define NMI_MAX_COUNTER_BITS 66 /* nmi_active: - * +1: the lapic NMI watchdog is active, but can be disabled - * 0: the lapic NMI watchdog has not been set up, and cannot + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot * be enabled - * -1: the lapic NMI watchdog is disabled, but can be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled */ -int nmi_active; +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING +unsigned int nmi_watchdog = NMI_DEFAULT; +static unsigned int nmi_hz = HZ; -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED +struct nmi_watchdog_ctlblk { + int enabled; + u64 check_bit; + unsigned int cccr_msr; + unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ + unsigned int evntsel_msr; /* the MSR to select the events to handle */ +}; +static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); -#define MSR_P4_MISC_ENABLE 0x1A0 -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12) -#define MSR_P4_PERFCTR0 0x300 -#define MSR_P4_CCCR0 0x360 -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ -#define MSR_P4_IQ_COUNTER0 0x30C -#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR) -#define P4_NMI_IQ_CCCR0 \ - (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \ - P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE) +/* local prototypes */ +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); + +extern void show_registers(struct pt_regs *regs); +extern int unknown_nmi_panic; + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the performance counter register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_PERFCTR0); + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_PERFCTR0); + + switch (boot_cpu_data.x86) { + case 6: + return (msr - MSR_P6_PERFCTR0); + case 15: + return (msr - MSR_P4_BPU_PERFCTR0); + } + } + return 0; +} + +/* converts an msr to an appropriate reservation bit */ +static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) +{ + /* returns the bit offset of the event selection register */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return (msr - MSR_K7_EVNTSEL0); + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return (msr - MSR_ARCH_PERFMON_EVENTSEL0); + + switch (boot_cpu_data.x86) { + case 6: + return (msr - MSR_P6_EVNTSEL0); + case 15: + return (msr - MSR_P4_BSU_ESCR0); + } + } + return 0; +} + +/* checks for a bit availability (hack for oprofile) */ +int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) +{ + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +/* checks the an msr for availability */ +int avail_to_resrv_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner))); +} + +int reserve_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner))) + return 1; + return 0; +} + +void release_perfctr_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_perfctr_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner)); +} + +int reserve_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0])) + return 1; + return 0; +} + +void release_evntsel_nmi(unsigned int msr) +{ + unsigned int counter; + + counter = nmi_evntsel_msr_to_bit(msr); + BUG_ON(counter > NMI_MAX_COUNTER_BITS); + + clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]); +} + +static __cpuinit inline int nmi_known_cpu(void) +{ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return 1; + else + return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); + } + return 0; +} #ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when @@ -108,7 +201,7 @@ int nmi_active; static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; - local_irq_enable(); + local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -126,7 +219,18 @@ static int __init check_nmi_watchdog(void) unsigned int *prev_nmi_count; int cpu; - if (nmi_watchdog == NMI_NONE) + /* Enable NMI watchdog for newer systems. + Actually it should be safe for most systems before 2004 too except + for some IBM systems that corrupt registers when NMI happens + during SMM. Unfortunately we don't have more exact information + on these and use this coarse check. */ + if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004) + nmi_watchdog = NMI_LOCAL_APIC; + + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) + return 0; + + if (!atomic_read(&nmi_active)) return 0; prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); @@ -150,25 +254,45 @@ static int __init check_nmi_watchdog(void) if (!cpu_isset(cpu, cpu_callin_map)) continue; #endif + if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + continue; if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - endflag = 1; printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, prev_nmi_count[cpu], nmi_count(cpu)); - nmi_active = 0; - lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG; - kfree(prev_nmi_count); - return -1; + per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + atomic_dec(&nmi_active); } } + if (!atomic_read(&nmi_active)) { + kfree(prev_nmi_count); + atomic_set(&nmi_active, -1); + return -1; + } endflag = 1; printk("OK.\n"); /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) + if (nmi_watchdog == NMI_LOCAL_APIC) { + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + nmi_hz = 1; + /* + * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter + * are writable, with higher bits sign extending from bit 31. + * So, we can only program the counter with 31 bit values and + * 32nd bit should be 1, for 33.. to be 1. + * Find the appropriate nmi_hz + */ + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 && + ((u64)cpu_khz * 1000) > 0x7fffffffULL) { + u64 count = (u64)cpu_khz * 1000; + do_div(count, 0x7fffffffUL); + nmi_hz = count + 1; + } + } kfree(prev_nmi_count); return 0; @@ -182,31 +306,16 @@ static int __init setup_nmi_watchdog(char *str) get_option(&str, &nmi); - if (nmi >= NMI_INVALID) + if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) return 0; - if (nmi == NMI_NONE) - nmi_watchdog = nmi; /* * If any other x86 CPU has a local APIC, then * please test the NMI stuff there and send me the * missing bits. Right now Intel P6/P4 and AMD K7 only. */ - if ((nmi == NMI_LOCAL_APIC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15)) - nmi_watchdog = nmi; - if ((nmi == NMI_LOCAL_APIC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && - (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15)) - nmi_watchdog = nmi; - /* - * We can enable the IO-APIC watchdog - * unconditionally. - */ - if (nmi == NMI_IO_APIC) { - nmi_active = 1; - nmi_watchdog = nmi; - } + if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0)) + return 0; /* no lapic support */ + nmi_watchdog = nmi; return 1; } @@ -214,86 +323,53 @@ __setup("nmi_watchdog=", setup_nmi_watchdog); static void disable_lapic_nmi_watchdog(void) { - if (nmi_active <= 0) + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); + + if (atomic_read(&nmi_active) <= 0) return; - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - wrmsr(MSR_K7_EVNTSEL0, 0, 0); - break; - case X86_VENDOR_INTEL: - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 0xd) - break; - wrmsr(MSR_P6_EVNTSEL0, 0, 0); - break; - case 15: - if (boot_cpu_data.x86_model > 0x4) - break; + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - wrmsr(MSR_P4_IQ_CCCR0, 0, 0); - wrmsr(MSR_P4_CRU_ESCR0, 0, 0); - break; - } - break; - } - nmi_active = -1; - /* tell do_nmi() and others that we're not active any more */ - nmi_watchdog = 0; + BUG_ON(atomic_read(&nmi_active) != 0); } static void enable_lapic_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_LOCAL_APIC; - setup_apic_nmi_watchdog(); - } -} - -int reserve_lapic_nmi(void) -{ - unsigned int old_owner; + BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - spin_lock(&lapic_nmi_owner_lock); - old_owner = lapic_nmi_owner; - lapic_nmi_owner |= LAPIC_NMI_RESERVED; - spin_unlock(&lapic_nmi_owner_lock); - if (old_owner & LAPIC_NMI_RESERVED) - return -EBUSY; - if (old_owner & LAPIC_NMI_WATCHDOG) - disable_lapic_nmi_watchdog(); - return 0; -} + /* are we already enabled */ + if (atomic_read(&nmi_active) != 0) + return; -void release_lapic_nmi(void) -{ - unsigned int new_owner; + /* are we lapic aware */ + if (nmi_known_cpu() <= 0) + return; - spin_lock(&lapic_nmi_owner_lock); - new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED; - lapic_nmi_owner = new_owner; - spin_unlock(&lapic_nmi_owner_lock); - if (new_owner & LAPIC_NMI_WATCHDOG) - enable_lapic_nmi_watchdog(); + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + touch_nmi_watchdog(); } void disable_timer_nmi_watchdog(void) { - if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0)) + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) <= 0) return; - unset_nmi_callback(); - nmi_active = -1; - nmi_watchdog = NMI_NONE; + disable_irq(0); + on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + + BUG_ON(atomic_read(&nmi_active) != 0); } void enable_timer_nmi_watchdog(void) { - if (nmi_active < 0) { - nmi_watchdog = NMI_IO_APIC; + BUG_ON(nmi_watchdog != NMI_IO_APIC); + + if (atomic_read(&nmi_active) == 0) { touch_nmi_watchdog(); - nmi_active = 1; + on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + enable_irq(0); } } @@ -303,15 +379,20 @@ static int nmi_pm_active; /* nmi_active before suspend */ static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) { - nmi_pm_active = nmi_active; - disable_lapic_nmi_watchdog(); + /* only CPU0 goes here, other CPUs should be offline */ + nmi_pm_active = atomic_read(&nmi_active); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); return 0; } static int lapic_nmi_resume(struct sys_device *dev) { - if (nmi_pm_active > 0) - enable_lapic_nmi_watchdog(); + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } return 0; } @@ -331,7 +412,13 @@ static int __init init_lapic_nmi_sysfs(void) { int error; - if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC) + /* should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if ( atomic_read(&nmi_active) < 0 ) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -349,138 +436,415 @@ late_initcall(init_lapic_nmi_sysfs); * Original code written by Keith Owens. */ -static void clear_msr_range(unsigned int base, unsigned int n) -{ - unsigned int i; - - for(i = 0; i < n; ++i) - wrmsr(base+i, 0, 0); -} - -static void write_watchdog_counter(const char *descr) +static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr) { u64 count = (u64)cpu_khz * 1000; do_div(count, nmi_hz); if(descr) Dprintk("setting %s to -0x%08Lx\n", descr, count); - wrmsrl(nmi_perfctr_msr, 0 - count); + wrmsrl(perfctr_msr, 0 - count); } -static void setup_k7_watchdog(void) +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define K7_EVNTSEL_ENABLE (1 << 22) +#define K7_EVNTSEL_INT (1 << 20) +#define K7_EVNTSEL_OS (1 << 17) +#define K7_EVNTSEL_USR (1 << 16) +#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 +#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING + +static int setup_k7_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr; unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - nmi_perfctr_msr = MSR_K7_PERFCTR0; + perfctr_msr = MSR_K7_PERFCTR0; + evntsel_msr = MSR_K7_EVNTSEL0; + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; - clear_msr_range(MSR_K7_EVNTSEL0, 4); - clear_msr_range(MSR_K7_PERFCTR0, 4); + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + wrmsrl(perfctr_msr, 0UL); evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS | K7_EVNTSEL_USR | K7_NMI_EVENT; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - write_watchdog_counter("K7_PERFCTR0"); + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "K7_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL<<63; + return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; } -static void setup_p6_watchdog(void) +static void stop_k7_watchdog(void) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +#define P6_EVNTSEL0_ENABLE (1 << 22) +#define P6_EVNTSEL_INT (1 << 20) +#define P6_EVNTSEL_OS (1 << 17) +#define P6_EVNTSEL_USR (1 << 16) +#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 +#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED + +static int setup_p6_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr; unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - nmi_perfctr_msr = MSR_P6_PERFCTR0; + perfctr_msr = MSR_P6_PERFCTR0; + evntsel_msr = MSR_P6_EVNTSEL0; + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; - clear_msr_range(MSR_P6_EVNTSEL0, 2); - clear_msr_range(MSR_P6_PERFCTR0, 2); + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + wrmsrl(perfctr_msr, 0UL); evntsel = P6_EVNTSEL_INT | P6_EVNTSEL_OS | P6_EVNTSEL_USR | P6_NMI_EVENT; - wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); - write_watchdog_counter("P6_PERFCTR0"); + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "P6_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL<<39; + return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; } +static void stop_p6_watchdog(void) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +/* Note that these events don't tick when the CPU idles. This means + the frequency varies with CPU load. */ + +#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) +#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) +#define P4_ESCR_OS (1<<3) +#define P4_ESCR_USR (1<<2) +#define P4_CCCR_OVF_PMI0 (1<<26) +#define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_THRESHOLD(N) ((N)<<20) +#define P4_CCCR_COMPLEMENT (1<<19) +#define P4_CCCR_COMPARE (1<<18) +#define P4_CCCR_REQUIRED (3<<16) +#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) +#define P4_CCCR_ENABLE (1<<12) +#define P4_CCCR_OVF (1<<31) +/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter + CRU_ESCR0 (with any non-null event selector) through a complemented + max threshold. [IA32-Vol3, Section 14.9.9] */ + static int setup_p4_watchdog(void) { + unsigned int perfctr_msr, evntsel_msr, cccr_msr; + unsigned int evntsel, cccr_val; unsigned int misc_enable, dummy; + unsigned int ht_num; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy); + rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) return 0; - nmi_perfctr_msr = MSR_P4_IQ_COUNTER0; - nmi_p4_cccr_val = P4_NMI_IQ_CCCR0; #ifdef CONFIG_SMP - if (smp_num_siblings == 2) - nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1; + /* detect which hyperthread we are on */ + if (smp_num_siblings == 2) { + unsigned int ebx, apicid; + + ebx = cpuid_ebx(1); + apicid = (ebx >> 24) & 0xff; + ht_num = apicid & 1; + } else #endif + ht_num = 0; - if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL)) - clear_msr_range(0x3F1, 2); - /* MSR 0x3F0 seems to have a default value of 0xFC00, but current - docs doesn't fully define it, so leave it alone for now. */ - if (boot_cpu_data.x86_model >= 0x3) { - /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */ - clear_msr_range(0x3A0, 26); - clear_msr_range(0x3BC, 3); + /* performance counters are shared resources + * assign each hyperthread its own set + * (re-use the ESCR0 register, seems safe + * and keeps the cccr_val the same) + */ + if (!ht_num) { + /* logical cpu 0 */ + perfctr_msr = MSR_P4_IQ_PERFCTR0; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR0; + cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); } else { - clear_msr_range(0x3A0, 31); + /* logical cpu 1 */ + perfctr_msr = MSR_P4_IQ_PERFCTR1; + evntsel_msr = MSR_P4_CRU_ESCR0; + cccr_msr = MSR_P4_IQ_CCCR1; + cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); } - clear_msr_range(0x3C0, 6); - clear_msr_range(0x3C8, 6); - clear_msr_range(0x3E0, 2); - clear_msr_range(MSR_P4_CCCR0, 18); - clear_msr_range(MSR_P4_PERFCTR0, 18); - - wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); - wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - write_watchdog_counter("P4_IQ_COUNTER0"); + + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + evntsel = P4_ESCR_EVENT_SELECT(0x3F) + | P4_ESCR_OS + | P4_ESCR_USR; + + cccr_val |= P4_CCCR_THRESHOLD(15) + | P4_CCCR_COMPLEMENT + | P4_CCCR_COMPARE + | P4_CCCR_REQUIRED; + + wrmsr(evntsel_msr, evntsel, 0); + wrmsr(cccr_msr, cccr_val, 0); + write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0"); apic_write(APIC_LVTPC, APIC_DM_NMI); - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = cccr_msr; + wd->check_bit = 1ULL<<39; return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; } -void setup_apic_nmi_watchdog (void) +static void stop_p4_watchdog(void) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) - return; - setup_k7_watchdog(); - break; - case X86_VENDOR_INTEL: - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 0xd) - return; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - setup_p6_watchdog(); - break; - case 15: - if (boot_cpu_data.x86_model > 0x4) + wrmsr(wd->cccr_msr, 0, 0); + wrmsr(wd->evntsel_msr, 0, 0); + + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL +#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK + +static int setup_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int perfctr_msr, evntsel_msr; + unsigned int evntsel; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + goto fail; + + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + + if (!reserve_perfctr_nmi(perfctr_msr)) + goto fail; + + if (!reserve_evntsel_nmi(evntsel_msr)) + goto fail1; + + wrmsrl(perfctr_msr, 0UL); + + evntsel = ARCH_PERFMON_EVENTSEL_INT + | ARCH_PERFMON_EVENTSEL_OS + | ARCH_PERFMON_EVENTSEL_USR + | ARCH_PERFMON_NMI_EVENT_SEL + | ARCH_PERFMON_NMI_EVENT_UMASK; + + /* setup the timer */ + wrmsr(evntsel_msr, evntsel, 0); + write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0"); + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + + wd->perfctr_msr = perfctr_msr; + wd->evntsel_msr = evntsel_msr; + wd->cccr_msr = 0; //unused + wd->check_bit = 1ULL << (eax.split.bit_width - 1); + return 1; +fail1: + release_perfctr_nmi(perfctr_msr); +fail: + return 0; +} + +static void stop_intel_arch_watchdog(void) +{ + unsigned int ebx; + union cpuid10_eax eax; + unsigned int unused; + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* + * Check whether the Architectural PerfMon supports + * Unhalted Core Cycles Event or not. + * NOTE: Corresponding bit = 0 in ebx indicates event present. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || + (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) + return; + + wrmsr(wd->evntsel_msr, 0, 0); + release_evntsel_nmi(wd->evntsel_msr); + release_perfctr_nmi(wd->perfctr_msr); +} + +void setup_apic_nmi_watchdog (void *unused) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (wd->enabled == 1) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) + return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) + return; + if (!setup_k7_watchdog()) return; + break; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + if (!setup_intel_arch_watchdog()) + return; + break; + } + switch (boot_cpu_data.x86) { + case 6: + if (boot_cpu_data.x86_model > 0xd) + return; + + if (!setup_p6_watchdog()) + return; + break; + case 15: + if (boot_cpu_data.x86_model > 0x4) + return; - if (!setup_p4_watchdog()) + if (!setup_p4_watchdog()) + return; + break; + default: return; + } break; default: return; } - break; - default: + } + wd->enabled = 1; + atomic_inc(&nmi_active); +} + +void stop_apic_nmi_watchdog(void *unused) +{ + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + + if (wd->enabled == 0) return; + + if (nmi_watchdog == NMI_LOCAL_APIC) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + stop_k7_watchdog(); + break; + case X86_VENDOR_INTEL: + if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + stop_intel_arch_watchdog(); + break; + } + switch (boot_cpu_data.x86) { + case 6: + if (boot_cpu_data.x86_model > 0xd) + break; + stop_p6_watchdog(); + break; + case 15: + if (boot_cpu_data.x86_model > 0x4) + break; + stop_p4_watchdog(); + break; + } + break; + default: + return; + } } - lapic_nmi_owner = LAPIC_NMI_WATCHDOG; - nmi_active = 1; + wd->enabled = 0; + atomic_dec(&nmi_active); } /* @@ -518,10 +882,11 @@ void touch_nmi_watchdog (void) */ touch_softlockup_watchdog(); } +EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs) +__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { /* @@ -530,11 +895,23 @@ void nmi_watchdog_tick (struct pt_regs * regs) * smp_processor_id(). */ unsigned int sum; + int touched = 0; int cpu = smp_processor_id(); + struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + u64 dummy; + int rc=0; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + rc = 1; + touched = 1; + } sum = per_cpu(irq_stat, cpu).apic_timer_irqs; - if (last_irq_sums[cpu] == sum) { + /* if the apic timer isn't firing, this cpu isn't doing much */ + if (!touched && last_irq_sums[cpu] == sum) { /* * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... @@ -549,26 +926,59 @@ void nmi_watchdog_tick (struct pt_regs * regs) last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; } - if (nmi_perfctr_msr) { - if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. + /* see if the nmi watchdog went off */ + if (wd->enabled) { + if (nmi_watchdog == NMI_LOCAL_APIC) { + rdmsrl(wd->perfctr_msr, dummy); + if (dummy & wd->check_bit){ + /* this wasn't a watchdog timer interrupt */ + goto done; + } + + /* only Intel P4 uses the cccr msr */ + if (wd->cccr_msr != 0) { + /* + * P4 quirks: + * - An overflown perfctr will assert its interrupt + * until the OVF flag in its CCCR is cleared. + * - LVTPC is masked on interrupt and must be + * unmasked by the LVTPC handler. + */ + rdmsrl(wd->cccr_msr, dummy); + dummy &= ~P4_CCCR_OVF; + wrmsrl(wd->cccr_msr, dummy); + apic_write(APIC_LVTPC, APIC_DM_NMI); + } + else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || + wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + /* P6 based Pentium M need to re-unmask + * the apic vector but it doesn't hurt + * other P6 variant. + * ArchPerfom/Core Duo also needs this */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + } + /* start the cycle over again */ + write_watchdog_counter(wd->perfctr_msr, NULL); + rc = 1; + } else if (nmi_watchdog == NMI_IO_APIC) { + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. */ - wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); - apic_write(APIC_LVTPC, APIC_DM_NMI); + rc = 1; } - else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) { - /* Only P6 based Pentium M need to re-unmask - * the apic vector but it doesn't hurt - * other P6 variant */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - } - write_watchdog_counter(NULL); } +done: + return rc; +} + +int do_nmi_callback(struct pt_regs * regs, int cpu) +{ +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; } #ifdef CONFIG_SYSCTL @@ -578,36 +988,46 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) unsigned char reason = get_nmi_reason(); char buf[64]; - if (!(reason & 0xc0)) { - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(regs, buf); - } + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(regs, buf); return 0; } /* - * proc handler for /proc/sys/kernel/unknown_nmi_panic + * proc handler for /proc/sys/kernel/nmi */ -int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file, +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { int old_state; - old_state = unknown_nmi_panic; + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; proc_dointvec(table, write, file, buffer, length, ppos); - if (!!old_state == !!unknown_nmi_panic) + if (!!old_state == !!nmi_watchdog_enabled) return 0; - if (unknown_nmi_panic) { - if (reserve_lapic_nmi() < 0) { - unknown_nmi_panic = 0; - return -EBUSY; - } else { - set_nmi_callback(unknown_nmi_panic_callback); - } + if (atomic_read(&nmi_active) < 0) { + printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EIO; + } + + if (nmi_watchdog == NMI_DEFAULT) { + if (nmi_known_cpu() > 0) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; + } + + if (nmi_watchdog == NMI_LOCAL_APIC) { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); } else { - release_lapic_nmi(); - unset_nmi_callback(); + printk( KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; } return 0; } @@ -616,7 +1036,11 @@ int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file, EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(reserve_lapic_nmi); -EXPORT_SYMBOL(release_lapic_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); +EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); +EXPORT_SYMBOL(reserve_perfctr_nmi); +EXPORT_SYMBOL(release_perfctr_nmi); +EXPORT_SYMBOL(reserve_evntsel_nmi); +EXPORT_SYMBOL(release_evntsel_nmi); EXPORT_SYMBOL(disable_timer_nmi_watchdog); EXPORT_SYMBOL(enable_timer_nmi_watchdog); diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c index 5f5b075f860a..9000d82c6dc0 100644 --- a/arch/i386/kernel/numaq.c +++ b/arch/i386/kernel/numaq.c @@ -23,7 +23,6 @@ * Send feedback to <gone@us.ibm.com> */ -#include <linux/config.h> #include <linux/mm.h> #include <linux/bootmem.h> #include <linux/mmzone.h> @@ -79,10 +78,12 @@ int __init get_memcfg_numaq(void) return 1; } -static int __init numaq_dsc_disable(void) +static int __init numaq_tsc_disable(void) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - tsc_disable = 1; + if (num_online_nodes() > 1) { + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + tsc_disable = 1; + } return 0; } -core_initcall(numaq_dsc_disable); +arch_initcall(numaq_tsc_disable); diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 6259afea46d1..8c190ca7ae44 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -28,7 +28,6 @@ #include <linux/user.h> #include <linux/a.out.h> #include <linux/interrupt.h> -#include <linux/config.h> #include <linux/utsname.h> #include <linux/delay.h> #include <linux/reboot.h> @@ -38,6 +37,7 @@ #include <linux/kallsyms.h> #include <linux/ptrace.h> #include <linux/random.h> +#include <linux/personality.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -102,7 +102,7 @@ void default_idle(void) local_irq_enable(); if (!hlt_counter && boot_cpu_data.hlt_works_ok) { - clear_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); while (!need_resched()) { local_irq_disable(); @@ -111,7 +111,7 @@ void default_idle(void) else local_irq_enable(); } - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; } else { while (!need_resched()) cpu_relax(); @@ -174,7 +174,7 @@ void cpu_idle(void) { int cpu = smp_processor_id(); - set_thread_flag(TIF_POLLING_NRFLAG); + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { @@ -312,7 +312,7 @@ void show_regs(struct pt_regs * regs) cr3 = read_cr3(); cr4 = read_cr4_safe(); printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); - show_trace(NULL, ®s->esp); + show_trace(NULL, regs, ®s->esp); } /* @@ -321,15 +321,6 @@ void show_regs(struct pt_regs * regs) * the "args". */ extern void kernel_thread_helper(void); -__asm__(".section .text\n" - ".align 4\n" - "kernel_thread_helper:\n\t" - "movl %edx,%eax\n\t" - "pushl %edx\n\t" - "call *%ebx\n\t" - "pushl %eax\n\t" - "call do_exit\n" - ".previous"); /* * Create a kernel thread @@ -347,7 +338,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) regs.xes = __USER_DS; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS; + regs.xcs = __KERNEL_CS | get_kernel_rpl(); regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ @@ -360,16 +351,16 @@ EXPORT_SYMBOL(kernel_thread); */ void exit_thread(void) { - struct task_struct *tsk = current; - struct thread_struct *t = &tsk->thread; - /* The process may have allocated an io port bitmap... nuke it. */ - if (unlikely(NULL != t->io_bitmap_ptr)) { + if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { + struct task_struct *tsk = current; + struct thread_struct *t = &tsk->thread; int cpu = get_cpu(); struct tss_struct *tss = &per_cpu(init_tss, cpu); kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ @@ -388,6 +379,7 @@ void flush_thread(void) memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + clear_tsk_thread_flag(tsk, TIF_DEBUG); /* * Forget coprocessor state.. */ @@ -432,7 +424,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, savesegment(gs,p->thread.gs); tsk = current; - if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) { + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; @@ -440,6 +432,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, } memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, IO_BITMAP_BYTES); + set_tsk_thread_flag(p, TIF_IO_BITMAP); } /* @@ -534,10 +527,24 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) return 1; } -static inline void -handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) +static noinline void __switch_to_xtra(struct task_struct *next_p, + struct tss_struct *tss) { - if (!next->io_bitmap_ptr) { + struct thread_struct *next; + + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + set_debugreg(next->debugreg[0], 0); + set_debugreg(next->debugreg[1], 1); + set_debugreg(next->debugreg[2], 2); + set_debugreg(next->debugreg[3], 3); + /* no 4 and 5 */ + set_debugreg(next->debugreg[6], 6); + set_debugreg(next->debugreg[7], 7); + } + + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache * the previous bitmap owner and the IO bitmap contents: @@ -545,6 +552,7 @@ handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; return; } + if (likely(next == tss->io_bitmap_owner)) { /* * Previous owner of the bitmap (hence the bitmap content) @@ -672,20 +680,11 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas set_iopl_mask(next->iopl); /* - * Now maybe reload the debug registers + * Now maybe handle debug registers and/or IO bitmaps */ - if (unlikely(next->debugreg[7])) { - set_debugreg(next->debugreg[0], 0); - set_debugreg(next->debugreg[1], 1); - set_debugreg(next->debugreg[2], 2); - set_debugreg(next->debugreg[3], 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg[6], 6); - set_debugreg(next->debugreg[7], 7); - } - - if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) - handle_io_bitmap(next, tss); + if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW) + || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))) + __switch_to_xtra(next_p, tss); disable_tsc(prev_p, next_p); @@ -898,7 +897,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) unsigned long arch_align_stack(unsigned long sp) { - if (randomize_va_space) + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index fd7eaf7866e0..775f50e9395b 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -185,17 +185,17 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_ return addr; } -static inline int is_at_popf(struct task_struct *child, struct pt_regs *regs) +static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) { int i, copied; - unsigned char opcode[16]; + unsigned char opcode[15]; unsigned long addr = convert_eip_to_linear(child, regs); copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); for (i = 0; i < copied; i++) { switch (opcode[i]) { - /* popf */ - case 0x9d: + /* popf and iret */ + case 0x9d: case 0xcf: return 1; /* opcode and address size prefixes */ case 0x66: case 0x67: @@ -247,7 +247,7 @@ static void set_singlestep(struct task_struct *child) * don't mark it as being "us" that set it, so that we * won't clear it by hand later. */ - if (is_at_popf(child, regs)) + if (is_setting_trap_flag(child, regs)) return; child->ptrace |= PT_DTRACE; @@ -468,8 +468,11 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) for(i=0; i<4; i++) if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1) goto out_tsk; + if (data) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); } - addr -= (long) &dummy->u_debugreg; addr = addr >> 2; child->thread.debugreg[addr] = data; diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index 87ccdac84928..9f6ab1789bb0 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -1,7 +1,6 @@ /* * This file contains work-arounds for x86 and x86_64 platform bugs. */ -#include <linux/config.h> #include <linux/pci.h> #include <linux/irq.h> diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index d207242976d3..84278e0093a2 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -2,7 +2,6 @@ * linux/arch/i386/kernel/reboot.c */ -#include <linux/config.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/delay.h> @@ -146,14 +145,10 @@ real_mode_gdt_entries [3] = 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ }; -static struct -{ - unsigned short size __attribute__ ((packed)); - unsigned long long * base __attribute__ ((packed)); -} -real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, NULL }, -no_idt = { 0, NULL }; +static struct Xgt_desc_struct +real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, +real_mode_idt = { 0x3ff, 0 }, +no_idt = { 0, 0 }; /* This is 16-bit protected mode code to disable paging and the cache, diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S index d312616effa1..f151d6fae462 100644 --- a/arch/i386/kernel/relocate_kernel.S +++ b/arch/i386/kernel/relocate_kernel.S @@ -7,16 +7,138 @@ */ #include <linux/linkage.h> +#include <asm/page.h> +#include <asm/kexec.h> + +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 2) +#define PAGE_ALIGNED (1 << PAGE_SHIFT) +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ +#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */ + + .text + .align PAGE_ALIGNED + .globl relocate_kernel +relocate_kernel: + movl 8(%esp), %ebp /* list of pages */ + +#ifdef CONFIG_X86_PAE + /* map the control page at its virtual address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0xc0000000, %eax + shrl $27, %eax + addl %edi, %eax + + movl PTR(PA_PMD_0)(%ebp), %edx + orl $PAE_PGD_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PMD_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x3fe00000, %eax + shrl $18, %eax + addl %edi, %eax + + movl PTR(PA_PTE_0)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x001ff000, %eax + shrl $9, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + /* identity map the control page at its physical address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0xc0000000, %eax + shrl $27, %eax + addl %edi, %eax + + movl PTR(PA_PMD_1)(%ebp), %edx + orl $PAE_PGD_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PMD_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x3fe00000, %eax + shrl $18, %eax + addl %edi, %eax + + movl PTR(PA_PTE_1)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x001ff000, %eax + shrl $9, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) +#else + /* map the control page at its virtual address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0xffc00000, %eax + shrl $20, %eax + addl %edi, %eax + + movl PTR(PA_PTE_0)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x003ff000, %eax + shrl $10, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + /* identity map the control page at its physical address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0xffc00000, %eax + shrl $20, %eax + addl %edi, %eax + + movl PTR(PA_PTE_1)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x003ff000, %eax + shrl $10, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) +#endif - /* - * Must be relocatable PIC code callable as a C function, that once - * it starts can not use the previous processes stack. - */ - .globl relocate_new_kernel relocate_new_kernel: /* read the arguments and say goodbye to the stack */ movl 4(%esp), %ebx /* page_list */ - movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 8(%esp), %ebp /* list of pages */ movl 12(%esp), %edx /* start address */ movl 16(%esp), %ecx /* cpu_has_pae */ @@ -24,11 +146,26 @@ relocate_new_kernel: pushl $0 popfl - /* set a new stack at the bottom of our page... */ - lea 4096(%ebp), %esp + /* get physical address of control page now */ + /* this is impossible after page table switch */ + movl PTR(PA_CONTROL_PAGE)(%ebp), %edi - /* store the parameters back on the stack */ - pushl %edx /* store the start address */ + /* switch to new set of page tables */ + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea 4096(%edi), %esp + + /* jump to identity mapped page */ + movl %edi, %eax + addl $(identity_mapped - relocate_kernel), %eax + pushl %eax + ret + +identity_mapped: + /* store the start address on the stack */ + pushl %edx /* Set cr0 to a known state: * 31 0 == Paging disabled @@ -113,8 +250,3 @@ relocate_new_kernel: xorl %edi, %edi xorl %ebp, %ebp ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .long relocate_new_kernel_end - relocate_new_kernel diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c index 321f5fd26e75..c7d3df23f589 100644 --- a/arch/i386/kernel/scx200.c +++ b/arch/i386/kernel/scx200.c @@ -4,11 +4,11 @@ National Semiconductor SCx200 support. */ -#include <linux/config.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/init.h> +#include <linux/mutex.h> #include <linux/pci.h> #include <linux/scx200.h> @@ -45,11 +45,19 @@ static struct pci_driver scx200_pci_driver = { .probe = scx200_probe, }; -static DEFINE_SPINLOCK(scx200_gpio_config_lock); +static DEFINE_MUTEX(scx200_gpio_config_lock); -static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +static void __devinit scx200_init_shadow(void) { int bank; + + /* read the current values driven on the GPIO signals */ + for (bank = 0; bank < 2; ++bank) + scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); +} + +static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ unsigned base; if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || @@ -63,10 +71,7 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_ } scx200_gpio_base = base; - - /* read the current values driven on the GPIO signals */ - for (bank = 0; bank < 2; ++bank) - scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); + scx200_init_shadow(); } else { /* find the base of the Configuration Block */ @@ -87,12 +92,11 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_ return 0; } -u32 scx200_gpio_configure(int index, u32 mask, u32 bits) +u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits) { u32 config, new_config; - unsigned long flags; - spin_lock_irqsave(&scx200_gpio_config_lock, flags); + mutex_lock(&scx200_gpio_config_lock); outl(index, scx200_gpio_base + 0x20); config = inl(scx200_gpio_base + 0x24); @@ -100,45 +104,11 @@ u32 scx200_gpio_configure(int index, u32 mask, u32 bits) new_config = (config & mask) | bits; outl(new_config, scx200_gpio_base + 0x24); - spin_unlock_irqrestore(&scx200_gpio_config_lock, flags); + mutex_unlock(&scx200_gpio_config_lock); return config; } -#if 0 -void scx200_gpio_dump(unsigned index) -{ - u32 config = scx200_gpio_configure(index, ~0, 0); - printk(KERN_DEBUG "GPIO%02u: 0x%08lx", index, (unsigned long)config); - - if (config & 1) - printk(" OE"); /* output enabled */ - else - printk(" TS"); /* tristate */ - if (config & 2) - printk(" PP"); /* push pull */ - else - printk(" OD"); /* open drain */ - if (config & 4) - printk(" PUE"); /* pull up enabled */ - else - printk(" PUD"); /* pull up disabled */ - if (config & 8) - printk(" LOCKED"); /* locked */ - if (config & 16) - printk(" LEVEL"); /* level input */ - else - printk(" EDGE"); /* edge input */ - if (config & 32) - printk(" HI"); /* trigger on rising edge */ - else - printk(" LO"); /* trigger on falling edge */ - if (config & 64) - printk(" DEBOUNCE"); /* debounce */ - printk("\n"); -} -#endif /* 0 */ - static int __init scx200_init(void) { printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); @@ -159,10 +129,3 @@ EXPORT_SYMBOL(scx200_gpio_base); EXPORT_SYMBOL(scx200_gpio_shadow); EXPORT_SYMBOL(scx200_gpio_configure); EXPORT_SYMBOL(scx200_cb_base); - -/* - Local variables: - compile-command: "make -k -C ../../.. SUBDIRS=arch/i386/kernel modules" - c-basic-offset: 8 - End: -*/ diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c deleted file mode 100644 index 967dc74df9ee..000000000000 --- a/arch/i386/kernel/semaphore.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * i386 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> - */ -#include <linux/config.h> -#include <asm/semaphore.h> - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. - * - * %eax contains the semaphore pointer on entry. Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax whish is either a return - * value or just clobbered.. - */ -asm( -".section .sched.text\n" -".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" -#if defined(CONFIG_FRAME_POINTER) - "pushl %ebp\n\t" - "movl %esp,%ebp\n\t" -#endif - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __down\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" -#if defined(CONFIG_FRAME_POINTER) - "movl %ebp,%esp\n\t" - "popl %ebp\n\t" -#endif - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __down_failed_interruptible\n" -"__down_failed_interruptible:\n\t" -#if defined(CONFIG_FRAME_POINTER) - "pushl %ebp\n\t" - "movl %esp,%ebp\n\t" -#endif - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __down_interruptible\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" -#if defined(CONFIG_FRAME_POINTER) - "movl %ebp,%esp\n\t" - "popl %ebp\n\t" -#endif - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __down_failed_trylock\n" -"__down_failed_trylock:\n\t" -#if defined(CONFIG_FRAME_POINTER) - "pushl %ebp\n\t" - "movl %esp,%ebp\n\t" -#endif - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __down_trylock\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" -#if defined(CONFIG_FRAME_POINTER) - "movl %ebp,%esp\n\t" - "popl %ebp\n\t" -#endif - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __up_wakeup\n" -"__up_wakeup:\n\t" - "pushl %edx\n\t" - "pushl %ecx\n\t" - "call __up\n\t" - "popl %ecx\n\t" - "popl %edx\n\t" - "ret" -); - -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" -".globl __write_lock_failed\n" -"__write_lock_failed:\n\t" - LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK_PREFIX "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK_PREFIX "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index e6023970aa40..000cf03751fe 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -23,11 +23,10 @@ * This file handles the architecture-dependent parts of initialization */ -#include <linux/config.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/mmzone.h> -#include <linux/tty.h> +#include <linux/screen_info.h> #include <linux/ioport.h> #include <linux/acpi.h> #include <linux/apm_bios.h> @@ -48,20 +47,20 @@ #include <linux/crash_dump.h> #include <linux/dmi.h> #include <linux/pfn.h> -#include <linux/suspend.h> #include <video/edid.h> #include <asm/apic.h> #include <asm/e820.h> #include <asm/mpspec.h> +#include <asm/mmzone.h> #include <asm/setup.h> #include <asm/arch_hooks.h> #include <asm/sections.h> #include <asm/io_apic.h> #include <asm/ist.h> #include <asm/io.h> -#include "setup_arch_pre.h" +#include <setup_arch.h> #include <bios_ebda.h> /* Forward Declaration. */ @@ -91,18 +90,6 @@ EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; -#ifdef CONFIG_ACPI - int acpi_disabled = 0; -#else - int acpi_disabled = 1; -#endif -EXPORT_SYMBOL(acpi_disabled); - -#ifdef CONFIG_ACPI -int __initdata acpi_force = 0; -extern acpi_interrupt_flags acpi_sci_flags; -#endif - /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; #ifdef CONFIG_MCA @@ -150,7 +137,6 @@ EXPORT_SYMBOL(ist_info); struct e820map e820; extern void early_cpu_init(void); -extern void generic_apic_probe(char *); extern int root_mountflags; unsigned long saved_videomode; @@ -223,9 +209,6 @@ static struct resource adapter_rom_resources[] = { { .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM } }; -#define ADAPTER_ROM_RESOURCES \ - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) - static struct resource video_rom_resource = { .name = "Video ROM", .start = 0xc0000, @@ -287,9 +270,6 @@ static struct resource standard_io_resources[] = { { .flags = IORESOURCE_BUSY | IORESOURCE_IO } }; -#define STANDARD_IO_RESOURCES \ - (sizeof standard_io_resources / sizeof standard_io_resources[0]) - #define romsignature(x) (*(unsigned short *)(x) == 0xaa55) static int __init romchecksum(unsigned char *rom, unsigned long length) @@ -346,7 +326,7 @@ static void __init probe_roms(void) } /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { rom = isa_bus_to_virt(start); if (!romsignature(rom)) continue; @@ -411,8 +391,8 @@ static void __init limit_regions(unsigned long long size) } } -static void __init add_memory_region(unsigned long long start, - unsigned long long size, int type) +void __init add_memory_region(unsigned long long start, + unsigned long long size, int type) { int x; @@ -475,7 +455,7 @@ static struct change_member *change_point[2*E820MAX] __initdata; static struct e820entry *overlap_list[E820MAX] __initdata; static struct e820entry new_bios[E820MAX] __initdata; -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) { struct change_member *change_tmp; unsigned long current_type, last_type; @@ -644,7 +624,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) * thinkpad 560x, for example, does not cooperate with the memory * detection code.) */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +int __init copy_e820_map(struct e820entry * biosmap, int nr_map) { /* Only one memory region (or negative)? Ignore it */ if (nr_map < 2) @@ -702,244 +682,150 @@ static inline void copy_edd(void) } #endif +static int __initdata user_defined_memmap = 0; + /* - * Do NOT EVER look at the BIOS memory size location. - * It does not work on many machines. + * "mem=nopentium" disables the 4MB page tables. + * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM + * to <mem>, overriding the bios size. + * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from + * <start> to <start>+<mem>, overriding the bios size. + * + * HPA tells me bootloaders need to parse mem=, so no new + * option should be mem= [also see Documentation/i386/boot.txt] */ -#define LOWMEMSIZE() (0x9f000) - -static void __init parse_cmdline_early (char ** cmdline_p) +static int __init parse_mem(char *arg) { - char c = ' ', *to = command_line, *from = saved_command_line; - int len = 0; - int userdef = 0; + if (!arg) + return -EINVAL; - /* Save unparsed command line copy for /proc/cmdline */ - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; - - for (;;) { - if (c != ' ') - goto next_char; - /* - * "mem=nopentium" disables the 4MB page tables. - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM - * to <mem>, overriding the bios size. - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from - * <start> to <start>+<mem>, overriding the bios size. - * - * HPA tells me bootloaders need to parse mem=, so no new - * option should be mem= [also see Documentation/i386/boot.txt] + if (strcmp(arg, "nopentium") == 0) { + clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + disable_pse = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. */ - if (!memcmp(from, "mem=", 4)) { - if (to != command_line) - to--; - if (!memcmp(from+4, "nopentium", 9)) { - from += 9+4; - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long mem_size; - - mem_size = memparse(from+4, &from); - limit_regions(mem_size); - userdef=1; - } - } - - else if (!memcmp(from, "memmap=", 7)) { - if (to != command_line) - to--; - if (!memcmp(from+7, "exactmap", 8)) { -#ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is - * reset. - */ - find_max_pfn(); - saved_max_pfn = max_pfn; -#endif - from += 8+7; - e820.nr_map = 0; - userdef = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long start_at, mem_size; + unsigned long long mem_size; - mem_size = memparse(from+7, &from); - if (*from == '@') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*from == '#') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*from == '$') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - userdef=1; - } - } - } - - else if (!memcmp(from, "noexec=", 7)) - noexec_setup(from + 7); + mem_size = memparse(arg, &arg); + limit_regions(mem_size); + user_defined_memmap = 1; + } + return 0; +} +early_param("mem", parse_mem); +static int __init parse_memmap(char *arg) +{ + if (!arg) + return -EINVAL; -#ifdef CONFIG_X86_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. + if (strcmp(arg, "exactmap") == 0) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } + find_max_pfn(); + saved_max_pfn = max_pfn; #endif - -#ifdef CONFIG_ACPI - /* "acpi=off" disables both ACPI table parsing and interpreter */ - else if (!memcmp(from, "acpi=off", 8)) { - disable_acpi(); - } - - /* acpi=force to over-ride black-list */ - else if (!memcmp(from, "acpi=force", 10)) { - acpi_force = 1; - acpi_ht = 1; - acpi_disabled = 0; - } - - /* acpi=strict disables out-of-spec workarounds */ - else if (!memcmp(from, "acpi=strict", 11)) { - acpi_strict = 1; - } - - /* Limit ACPI just to boot-time to enable HT */ - else if (!memcmp(from, "acpi=ht", 7)) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - - /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ - else if (!memcmp(from, "pci=noacpi", 10)) { - acpi_disable_pci(); - } - /* "acpi=noirq" disables ACPI interrupt routing */ - else if (!memcmp(from, "acpi=noirq", 10)) { - acpi_noirq_set(); + e820.nr_map = 0; + user_defined_memmap = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long start_at, mem_size; + + mem_size = memparse(arg, &arg); + if (*arg == '@') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*arg == '#') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*arg == '$') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + user_defined_memmap = 1; } + } + return 0; +} +early_param("memmap", parse_memmap); - else if (!memcmp(from, "acpi_sci=edge", 13)) - acpi_sci_flags.trigger = 1; - - else if (!memcmp(from, "acpi_sci=level", 14)) - acpi_sci_flags.trigger = 3; - - else if (!memcmp(from, "acpi_sci=high", 13)) - acpi_sci_flags.polarity = 1; +#ifdef CONFIG_PROC_VMCORE +/* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. + */ +static int __init parse_elfcorehdr(char *arg) +{ + if (!arg) + return -EINVAL; - else if (!memcmp(from, "acpi_sci=low", 12)) - acpi_sci_flags.polarity = 3; + elfcorehdr_addr = memparse(arg, &arg); + return 0; +} +early_param("elfcorehdr", parse_elfcorehdr); +#endif /* CONFIG_PROC_VMCORE */ -#ifdef CONFIG_X86_IO_APIC - else if (!memcmp(from, "acpi_skip_timer_override", 24)) - acpi_skip_timer_override = 1; +/* + * highmem=size forces highmem to be exactly 'size' bytes. + * This works even on boxes that have no highmem otherwise. + * This also works to reduce highmem size on bigger boxes. + */ +static int __init parse_highmem(char *arg) +{ + if (!arg) + return -EINVAL; - if (!memcmp(from, "disable_timer_pin_1", 19)) - disable_timer_pin_1 = 1; - if (!memcmp(from, "enable_timer_pin_1", 18)) - disable_timer_pin_1 = -1; + highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; + return 0; +} +early_param("highmem", parse_highmem); - /* disable IO-APIC */ - else if (!memcmp(from, "noapic", 6)) - disable_ioapic_setup(); -#endif /* CONFIG_X86_IO_APIC */ -#endif /* CONFIG_ACPI */ +/* + * vmalloc=size forces the vmalloc area to be exactly 'size' + * bytes. This can be used to increase (or decrease) the + * vmalloc area - the default is 128m. + */ +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; -#ifdef CONFIG_X86_LOCAL_APIC - /* enable local APIC */ - else if (!memcmp(from, "lapic", 5)) - lapic_enable(); + __VMALLOC_RESERVE = memparse(arg, &arg); + return 0; +} +early_param("vmalloc", parse_vmalloc); - /* disable local APIC */ - else if (!memcmp(from, "nolapic", 6)) - lapic_disable(); -#endif /* CONFIG_X86_LOCAL_APIC */ +/* + * reservetop=size reserves a hole at the top of the kernel address space which + * a hypervisor can load into later. Needed for dynamically loaded hypervisors, + * so relocating the fixmap can be done before paging initialization. + */ +static int __init parse_reservetop(char *arg) +{ + unsigned long address; -#ifdef CONFIG_KEXEC - /* crashkernel=size@addr specifies the location to reserve for - * a crash kernel. By reserving this memory we guarantee - * that linux never set's it up as a DMA target. - * Useful for holding code to do something appropriate - * after a kernel panic. - */ - else if (!memcmp(from, "crashkernel=", 12)) { - unsigned long size, base; - size = memparse(from+12, &from); - if (*from == '@') { - base = memparse(from+1, &from); - /* FIXME: Do I want a sanity check - * to validate the memory range? - */ - crashk_res.start = base; - crashk_res.end = base + size - 1; - } - } -#endif -#ifdef CONFIG_PROC_VMCORE - /* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. - */ - else if (!memcmp(from, "elfcorehdr=", 11)) - elfcorehdr_addr = memparse(from+11, &from); -#endif + if (!arg) + return -EINVAL; - /* - * highmem=size forces highmem to be exactly 'size' bytes. - * This works even on boxes that have no highmem otherwise. - * This also works to reduce highmem size on bigger boxes. - */ - else if (!memcmp(from, "highmem=", 8)) - highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; - - /* - * vmalloc=size forces the vmalloc area to be exactly 'size' - * bytes. This can be used to increase (or decrease) the - * vmalloc area - the default is 128m. - */ - else if (!memcmp(from, "vmalloc=", 8)) - __VMALLOC_RESERVE = memparse(from+8, &from); - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; - *cmdline_p = command_line; - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); - } + address = memparse(arg, &arg); + reserve_top_address(address); + return 0; } +early_param("reservetop", parse_reservetop); /* * Callback for efi_memory_walk. @@ -1178,6 +1064,14 @@ static unsigned long __init setup_memory(void) } printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif +#ifdef CONFIG_FLATMEM + max_mapnr = num_physpages; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); @@ -1189,22 +1083,20 @@ static unsigned long __init setup_memory(void) void __init zone_sizes_init(void) { - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned int max_dma, low; - - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - low = max_low_pfn; - - if (low < max_dma) - zones_size[ZONE_DMA] = low; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; #ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = highend_pfn - low; + unsigned long max_zone_pfns[MAX_NR_ZONES] = { + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT, + max_low_pfn, + highend_pfn}; + add_active_range(0, 0, highend_pfn); +#else + unsigned long max_zone_pfns[MAX_NR_ZONES] = { + virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT, + max_low_pfn}; + add_active_range(0, 0, max_low_pfn); #endif - } - free_area_init(zones_size); + + free_area_init_nodes(max_zone_pfns); } #else extern unsigned long __init setup_memory(void); @@ -1266,7 +1158,7 @@ void __init setup_bootmem_allocator(void) */ find_smp_config(); #endif - + numa_kva_reserve(); #ifdef CONFIG_BLK_DEV_INITRD if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { @@ -1321,8 +1213,10 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat probe_roms(); for (i = 0; i < e820.nr_map; i++) { struct resource *res; +#ifndef CONFIG_RESOURCES_64BIT if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) continue; +#endif res = kzalloc(sizeof(struct resource), GFP_ATOMIC); switch (e820.map[i].type) { case E820_RAM: res->name = "System RAM"; break; @@ -1333,7 +1227,10 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat res->start = e820.map[i].addr; res->end = res->start + e820.map[i].size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - request_resource(&iomem_resource, res); + if (request_resource(&iomem_resource, res)) { + kfree(res); + continue; + } if (e820.map[i].type == E820_RAM) { /* * We don't know which RAM region contains kernel data, @@ -1369,7 +1266,7 @@ static int __init request_standard_resources(void) request_resource(&iomem_resource, &video_ram_resource); /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < STANDARD_IO_RESOURCES; i++) + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) request_resource(&ioport_resource, &standard_io_resources[i]); return 0; } @@ -1424,8 +1321,6 @@ static void __init register_memory(void) pci_mem_start, gapstart, gapsize); } -static char * __init machine_specific_memory_setup(void); - #ifdef CONFIG_MCA static void set_mca_bus(int x) { @@ -1435,111 +1330,6 @@ static void set_mca_bus(int x) static void set_mca_bus(int x) { } #endif -#ifdef CONFIG_SOFTWARE_SUSPEND -static void __init mark_nosave_page_range(unsigned long start, unsigned long end) -{ - struct page *page; - while (start <= end) { - page = pfn_to_page(start); - SetPageNosave(page); - start++; - } -} - -static void __init e820_nosave_reserved_pages(void) -{ - int i; - unsigned long r_start = 0, r_end = 0; - - /* Assume e820 map is sorted */ - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long start, end; - - start = PFN_DOWN(ei->addr); - end = PFN_UP(ei->addr + ei->size); - if (start >= end) - continue; - if (ei->type == E820_RESERVED) - continue; - r_end = start; - /* - * Highmem 'Reserved' pages are marked as reserved, swsusp - * will not save/restore them, so we ignore these pages here. - */ - if (r_end > max_low_pfn) - r_end = max_low_pfn; - if (r_end > r_start) - mark_nosave_page_range(r_start, r_end-1); - if (r_end >= max_low_pfn) - break; - r_start = end; - } -} - -static void __init e820_save_acpi_pages(void) -{ - int i; - - /* Assume e820 map is sorted */ - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long start, end; - - start = ei->addr; - end = ei->addr + ei->size; - if (start >= end) - continue; - if (ei->type != E820_ACPI && ei->type != E820_NVS) - continue; - /* - * If the region is below max_low_pfn, it will be - * saved/restored by swsusp follow 'RAM' type. - */ - if (start < (max_low_pfn << PAGE_SHIFT)) - start = max_low_pfn << PAGE_SHIFT; - /* - * Highmem pages (ACPI NVS/Data) are reserved, but swsusp - * highmem save/restore will not save/restore them. We marked - * them as arch saveable pages here - */ - if (end > start) - swsusp_add_arch_pages(start, end); - } -} - -extern char __start_rodata, __end_rodata; -/* - * BIOS reserved region/hole - no save/restore - * ACPI NVS - save/restore - * ACPI Data - this is a little tricky, the mem could be used by OS after OS - * reads tables from the region, but anyway save/restore the memory hasn't any - * side effect and Linux runtime module load/unload might use it. - * kernel rodata - no save/restore (kernel rodata isn't changed) - */ -static int __init mark_nosave_pages(void) -{ - unsigned long pfn_start, pfn_end; - - /* FIXME: provide a version for efi BIOS */ - if (efi_enabled) - return 0; - /* BIOS reserved regions & holes */ - e820_nosave_reserved_pages(); - - /* kernel rodata */ - pfn_start = PFN_UP(virt_to_phys(&__start_rodata)); - pfn_end = PFN_DOWN(virt_to_phys(&__end_rodata)); - mark_nosave_page_range(pfn_start, pfn_end-1); - - /* record ACPI Data/NVS as saveable */ - e820_save_acpi_pages(); - - return 0; -} -core_initcall(mark_nosave_pages); -#endif - /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -1609,17 +1399,15 @@ void __init setup_arch(char **cmdline_p) data_resource.start = virt_to_phys(_etext); data_resource.end = virt_to_phys(_edata)-1; - parse_cmdline_early(cmdline_p); + parse_early_param(); -#ifdef CONFIG_EARLY_PRINTK - { - char *s = strstr(*cmdline_p, "earlyprintk="); - if (s) { - setup_early_printk(strchr(s, '=') + 1); - printk("early console enabled\n"); - } + if (user_defined_memmap) { + printk(KERN_INFO "user-defined physical RAM map:\n"); + print_memory_map("user"); } -#endif + + strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; max_low_pfn = setup_memory(); @@ -1648,7 +1436,7 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); #ifdef CONFIG_X86_GENERICARCH - generic_apic_probe(*cmdline_p); + generic_apic_probe(); #endif if (efi_enabled) efi_map_memmap(); @@ -1660,9 +1448,11 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif +#ifdef CONFIG_PCI #ifdef CONFIG_X86_IO_APIC check_acpi_pci(); /* Checks more than just ACPI actually */ #endif +#endif #ifdef CONFIG_ACPI acpi_boot_init(); @@ -1689,6 +1479,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; #endif #endif + tsc_init(); } static __init int add_pcspkr(void) @@ -1708,7 +1499,6 @@ static __init int add_pcspkr(void) } device_initcall(add_pcspkr); -#include "setup_arch_post.h" /* * Local Variables: * mode:c diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 5c352c3a9e7f..43002cfb40c4 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -351,7 +351,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; } - restorer = &__kernel_sigreturn; + restorer = (void *)VDSO_SYM(&__kernel_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -447,7 +447,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up to return from userspace. */ - restorer = &__kernel_rt_sigreturn; + restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index d134e9643a58..465188e2d701 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -114,7 +114,17 @@ DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_m static inline int __prepare_ICR (unsigned int shortcut, int vector) { - return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL; + unsigned int icr = shortcut | APIC_DEST_LOGICAL; + + switch (vector) { + default: + icr |= APIC_DM_FIXED | vector; + break; + case NMI_VECTOR: + icr |= APIC_DM_NMI; + break; + } + return icr; } static inline int __prepare_ICR2 (unsigned int mask) @@ -624,3 +634,69 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs) } } +/* + * this function sends a 'generic call function' IPI to one other CPU + * in the system. + * + * cpu is a standard Linux logical CPU number. + */ +static void +__smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + struct call_data_struct data; + int cpus = 1; + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + call_data = &data; + wmb(); + /* Send a message to all other CPUs and wait for them to respond */ + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) + cpu_relax(); + + if (!wait) + return; + + while (atomic_read(&data.finished) != cpus) + cpu_relax(); +} + +/* + * smp_call_function_single - Run a function on another CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: Currently unused. + * @wait: If true, wait until function has completed on other CPUs. + * + * Retrurns 0 on success, else a negative status code. + * + * Does not return until the remote CPU is nearly ready to execute <func> + * or is or has executed. + */ + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int nonatomic, int wait) +{ + /* prevent preemption and reschedule on another processor */ + int me = get_cpu(); + if (cpu == me) { + WARN_ON(1); + put_cpu(); + return -EBUSY; + } + spin_lock_bh(&call_lock); + __smp_call_function_single(cpu, func, info, nonatomic, wait); + spin_unlock_bh(&call_lock); + put_cpu(); + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 825b2b4ca721..82b26d5ce476 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -34,7 +34,6 @@ * Rusty Russell : Hacked into shape for new "hotplug" boot process. */ #include <linux/module.h> -#include <linux/config.h> #include <linux/init.h> #include <linux/kernel.h> @@ -52,6 +51,7 @@ #include <asm/tlbflush.h> #include <asm/desc.h> #include <asm/arch_hooks.h> +#include <asm/nmi.h> #include <mach_apic.h> #include <mach_wakecpu.h> @@ -66,12 +66,6 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); #endif -/* Package ID of each logical CPU */ -int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; - -/* Core ID of each logical CPU */ -int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; - /* Last level cache ID of each logical CPU */ int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; @@ -108,6 +102,8 @@ u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0xff }; EXPORT_SYMBOL(x86_cpu_to_apicid); +u8 apicid_2_node[MAX_APICID]; + /* * Trampoline 80x86 program as an array. */ @@ -183,6 +179,9 @@ static void __devinit smp_store_cpu_info(int id) */ if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { + if (num_possible_cpus() == 1) + goto valid_k7; + /* Athlon 660/661 is valid. */ if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1))) goto valid_k7; @@ -218,14 +217,20 @@ valid_k7: * then we print a warning if not, and always resync. */ -static atomic_t tsc_start_flag = ATOMIC_INIT(0); -static atomic_t tsc_count_start = ATOMIC_INIT(0); -static atomic_t tsc_count_stop = ATOMIC_INIT(0); -static unsigned long long tsc_values[NR_CPUS]; +static struct { + atomic_t start_flag; + atomic_t count_start; + atomic_t count_stop; + unsigned long long values[NR_CPUS]; +} tsc __initdata = { + .start_flag = ATOMIC_INIT(0), + .count_start = ATOMIC_INIT(0), + .count_stop = ATOMIC_INIT(0), +}; #define NR_LOOPS 5 -static void __init synchronize_tsc_bp (void) +static void __init synchronize_tsc_bp(void) { int i; unsigned long long t0; @@ -239,7 +244,7 @@ static void __init synchronize_tsc_bp (void) /* convert from kcyc/sec to cyc/usec */ one_usec = cpu_khz / 1000; - atomic_set(&tsc_start_flag, 1); + atomic_set(&tsc.start_flag, 1); wmb(); /* @@ -256,16 +261,16 @@ static void __init synchronize_tsc_bp (void) /* * all APs synchronize but they loop on '== num_cpus' */ - while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) - mb(); - atomic_set(&tsc_count_stop, 0); + while (atomic_read(&tsc.count_start) != num_booting_cpus()-1) + cpu_relax(); + atomic_set(&tsc.count_stop, 0); wmb(); /* * this lets the APs save their current TSC: */ - atomic_inc(&tsc_count_start); + atomic_inc(&tsc.count_start); - rdtscll(tsc_values[smp_processor_id()]); + rdtscll(tsc.values[smp_processor_id()]); /* * We clear the TSC in the last loop: */ @@ -275,56 +280,54 @@ static void __init synchronize_tsc_bp (void) /* * Wait for all APs to leave the synchronization point: */ - while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) - mb(); - atomic_set(&tsc_count_start, 0); + while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1) + cpu_relax(); + atomic_set(&tsc.count_start, 0); wmb(); - atomic_inc(&tsc_count_stop); + atomic_inc(&tsc.count_stop); } sum = 0; for (i = 0; i < NR_CPUS; i++) { if (cpu_isset(i, cpu_callout_map)) { - t0 = tsc_values[i]; + t0 = tsc.values[i]; sum += t0; } } avg = sum; do_div(avg, num_booting_cpus()); - sum = 0; for (i = 0; i < NR_CPUS; i++) { if (!cpu_isset(i, cpu_callout_map)) continue; - delta = tsc_values[i] - avg; + delta = tsc.values[i] - avg; if (delta < 0) delta = -delta; /* * We report bigger than 2 microseconds clock differences. */ if (delta > 2*one_usec) { - long realdelta; + long long realdelta; + if (!buggy) { buggy = 1; printk("\n"); } realdelta = delta; do_div(realdelta, one_usec); - if (tsc_values[i] < avg) + if (tsc.values[i] < avg) realdelta = -realdelta; - if (realdelta > 0) - printk(KERN_INFO "CPU#%d had %ld usecs TSC " + if (realdelta) + printk(KERN_INFO "CPU#%d had %Ld usecs TSC " "skew, fixed it up.\n", i, realdelta); } - - sum += delta; } if (!buggy) printk("passed.\n"); } -static void __init synchronize_tsc_ap (void) +static void __init synchronize_tsc_ap(void) { int i; @@ -333,19 +336,21 @@ static void __init synchronize_tsc_ap (void) * this gets called, so we first wait for the BP to * finish SMP initialization: */ - while (!atomic_read(&tsc_start_flag)) mb(); + while (!atomic_read(&tsc.start_flag)) + cpu_relax(); for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc_count_start); - while (atomic_read(&tsc_count_start) != num_booting_cpus()) - mb(); + atomic_inc(&tsc.count_start); + while (atomic_read(&tsc.count_start) != num_booting_cpus()) + cpu_relax(); - rdtscll(tsc_values[smp_processor_id()]); + rdtscll(tsc.values[smp_processor_id()]); if (i == NR_LOOPS-1) write_tsc(0, 0); - atomic_inc(&tsc_count_stop); - while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + atomic_inc(&tsc.count_stop); + while (atomic_read(&tsc.count_stop) != num_booting_cpus()) + cpu_relax(); } } #undef NR_LOOPS @@ -451,10 +456,12 @@ cpumask_t cpu_coregroup_map(int cpu) struct cpuinfo_x86 *c = cpu_data + cpu; /* * For perf, we return last level cache shared map. - * TBD: when power saving sched policy is added, we will return - * cpu_core_map when power saving policy is enabled + * And for power savings, we return cpu_core_map */ - return c->llc_shared_map; + if (sched_mc_power_savings || sched_smt_power_savings) + return cpu_core_map[cpu]; + else + return c->llc_shared_map; } /* representing cpus for which sibling maps can be computed */ @@ -470,8 +477,8 @@ set_cpu_sibling_map(int cpu) if (smp_num_siblings > 1) { for_each_cpu_mask(i, cpu_sibling_setup_map) { - if (phys_proc_id[cpu] == phys_proc_id[i] && - cpu_core_id[cpu] == cpu_core_id[i]) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id && + c[cpu].cpu_core_id == c[i].cpu_core_id) { cpu_set(i, cpu_sibling_map[cpu]); cpu_set(cpu, cpu_sibling_map[i]); cpu_set(i, cpu_core_map[cpu]); @@ -498,7 +505,7 @@ set_cpu_sibling_map(int cpu) cpu_set(i, c[cpu].llc_shared_map); cpu_set(cpu, c[i].llc_shared_map); } - if (phys_proc_id[cpu] == phys_proc_id[i]) { + if (c[cpu].phys_proc_id == c[i].phys_proc_id) { cpu_set(i, cpu_core_map[cpu]); cpu_set(cpu, cpu_core_map[i]); /* @@ -640,9 +647,13 @@ static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); int apicid = logical_smp_processor_id(); + int node = apicid_to_node(hard_smp_processor_id()); + + if (!node_online(node)) + node = first_online_node; cpu_2_logical_apicid[cpu] = apicid; - map_cpu_to_node(cpu, apicid_to_node(apicid)); + map_cpu_to_node(cpu, node); } static void unmap_cpu_to_logical_apicid(int cpu) @@ -945,6 +956,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu) irq_ctx_init(cpu); + x86_cpu_to_apicid[cpu] = apicid; /* * This grunge runs the startup process for * the targeted processor. @@ -1053,6 +1065,7 @@ static int __cpuinit __smp_prepare_cpu(int cpu) struct warm_boot_cpu_info info; struct work_struct task; int apicid, ret; + struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); apicid = x86_cpu_to_apicid[cpu]; if (apicid == BAD_APICID) { @@ -1060,6 +1073,18 @@ static int __cpuinit __smp_prepare_cpu(int cpu) goto exit; } + /* + * the CPU isn't initialized at boot time, allocate gdt table here. + * cpu_init will initialize it + */ + if (!cpu_gdt_descr->address) { + cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL); + if (!cpu_gdt_descr->address) + printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); + ret = -ENOMEM; + goto exit; + } + info.complete = &done; info.apicid = apicid; info.cpu = cpu; @@ -1337,8 +1362,8 @@ remove_siblinginfo(int cpu) cpu_clear(cpu, cpu_sibling_map[sibling]); cpus_clear(cpu_sibling_map[cpu]); cpus_clear(cpu_core_map[cpu]); - phys_proc_id[cpu] = BAD_APICID; - cpu_core_id[cpu] = BAD_APICID; + c[cpu].phys_proc_id = 0; + c[cpu].cpu_core_id = 0; cpu_clear(cpu, cpu_sibling_setup_map); } @@ -1357,7 +1382,8 @@ int __cpu_disable(void) */ if (cpu == 0) return -EBUSY; - + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); clear_local_APIC(); /* Allow any queued timer interrupts to get serviced */ local_irq_enable(); @@ -1433,7 +1459,7 @@ int __devinit __cpu_up(unsigned int cpu) /* Unleash the CPU! */ cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) - mb(); + cpu_relax(); return 0; } @@ -1471,3 +1497,16 @@ void __init smp_intr_init(void) /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); } + +/* + * If the BIOS enumerates physical processors before logical, + * maxcpus=N at enumeration-time can be used to disable HT. + */ +static int __init parse_maxcpus(char *arg) +{ + extern unsigned int maxcpus; + + maxcpus = simple_strtoul(arg, NULL, 0); + return 0; +} +early_param("maxcpus", parse_maxcpus); diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c index 989c85255dbe..f7e735c077c3 100644 --- a/arch/i386/kernel/srat.c +++ b/arch/i386/kernel/srat.c @@ -23,7 +23,6 @@ * * Send feedback to Pat Gaughen <gone@us.ibm.com> */ -#include <linux/config.h> #include <linux/mm.h> #include <linux/bootmem.h> #include <linux/mmzone.h> @@ -31,6 +30,7 @@ #include <linux/nodemask.h> #include <asm/srat.h> #include <asm/topology.h> +#include <asm/smp.h> /* * proximity macros and definitions @@ -43,7 +43,7 @@ #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ -#define MAX_CHUNKS_PER_NODE 4 +#define MAX_CHUNKS_PER_NODE 3 #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) struct node_memory_chunk_s { unsigned long start_pfn; @@ -55,8 +55,7 @@ struct node_memory_chunk_s { static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; static int num_memory_chunks; /* total number of memory chunks */ -static int zholes_size_init; -static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES]; +static u8 __initdata apicid_to_pxm[MAX_APICID]; extern void * boot_ioremap(unsigned long, unsigned long); @@ -72,6 +71,8 @@ static void __init parse_cpu_affinity_structure(char *p) /* mark this node as "seen" in node bitmap */ BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain); + apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain; + printk("CPU 0x%02X in proximity domain 0x%02X\n", cpu_affinity->apic_id, cpu_affinity->proximity_domain); } @@ -136,50 +137,6 @@ static void __init parse_memory_affinity_structure (char *sratp) "enabled and removable" : "enabled" ) ); } -#if MAX_NR_ZONES != 4 -#error "MAX_NR_ZONES != 4, chunk_to_zone requires review" -#endif -/* Take a chunk of pages from page frame cstart to cend and count the number - * of pages in each zone, returned via zones[]. - */ -static __init void chunk_to_zones(unsigned long cstart, unsigned long cend, - unsigned long *zones) -{ - unsigned long max_dma; - extern unsigned long max_low_pfn; - - int z; - unsigned long rend; - - /* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide - * similarly scoped information and should be handled in a consistant - * manner. - */ - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - - /* Split the hole into the zones in which it falls. Repeatedly - * take the segment in which the remaining hole starts, round it - * to the end of that zone. - */ - memset(zones, 0, MAX_NR_ZONES * sizeof(long)); - while (cstart < cend) { - if (cstart < max_dma) { - z = ZONE_DMA; - rend = (cend < max_dma)? cend : max_dma; - - } else if (cstart < max_low_pfn) { - z = ZONE_NORMAL; - rend = (cend < max_low_pfn)? cend : max_low_pfn; - - } else { - z = ZONE_HIGHMEM; - rend = cend; - } - zones[z] += rend - cstart; - cstart = rend; - } -} - /* * The SRAT table always lists ascending addresses, so can always * assume that the first "start" address that you see is the real @@ -224,7 +181,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */ memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); - memset(zholes_size, 0, sizeof(zholes_size)); num_memory_chunks = 0; while (p < end) { @@ -283,11 +239,15 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) printk("Number of logical nodes in system = %d\n", num_online_nodes()); printk("Number of memory chunks in system = %d\n", num_memory_chunks); + for (i = 0; i < MAX_APICID; i++) + apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); + for (j = 0; j < num_memory_chunks; j++){ struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, chunk->nid, chunk->start_pfn, chunk->end_pfn); node_read_chunk(chunk->nid, chunk); + add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn); } for_each_online_node(nid) { @@ -396,57 +356,7 @@ int __init get_memcfg_from_srat(void) return acpi20_parse_srat((struct acpi_table_srat *)header); } out_err: + remove_all_active_ranges(); printk("failed to get NUMA memory information from SRAT table\n"); return 0; } - -/* For each node run the memory list to determine whether there are - * any memory holes. For each hole determine which ZONE they fall - * into. - * - * NOTE#1: this requires knowledge of the zone boundries and so - * _cannot_ be performed before those are calculated in setup_memory. - * - * NOTE#2: we rely on the fact that the memory chunks are ordered by - * start pfn number during setup. - */ -static void __init get_zholes_init(void) -{ - int nid; - int c; - int first; - unsigned long end = 0; - - for_each_online_node(nid) { - first = 1; - for (c = 0; c < num_memory_chunks; c++){ - if (node_memory_chunk[c].nid == nid) { - if (first) { - end = node_memory_chunk[c].end_pfn; - first = 0; - - } else { - /* Record any gap between this chunk - * and the previous chunk on this node - * against the zones it spans. - */ - chunk_to_zones(end, - node_memory_chunk[c].start_pfn, - &zholes_size[nid * MAX_NR_ZONES]); - } - } - } - } -} - -unsigned long * __init get_zholes_size(int nid) -{ - if (!zholes_size_init) { - zholes_size_init++; - get_zholes_init(); - } - if (nid >= MAX_NUMNODES || !node_online(nid)) - printk("%s: nid = %d is invalid/offline. num_online_nodes = %d", - __FUNCTION__, nid, num_online_nodes()); - return &zholes_size[nid * MAX_NR_ZONES]; -} diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index dd63d4775398..7e639f78b0b9 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -317,3 +317,4 @@ ENTRY(sys_call_table) .long sys_tee /* 315 */ .long sys_vmsplice .long sys_move_pages + .long sys_getcpu diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 0bada1870bdf..713ba39d32c6 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -2,6 +2,8 @@ * linux/arch/i386/kernel/sysenter.c * * (C) Copyright 2002 Linus Torvalds + * Portions based on the vdso-randomization code from exec-shield: + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar * * This file contains the needed initializations to support sysenter. */ @@ -13,12 +15,31 @@ #include <linux/gfp.h> #include <linux/string.h> #include <linux/elf.h> +#include <linux/mm.h> +#include <linux/module.h> #include <asm/cpufeature.h> #include <asm/msr.h> #include <asm/pgtable.h> #include <asm/unistd.h> +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? + */ +unsigned int __read_mostly vdso_enabled = 1; + +EXPORT_SYMBOL_GPL(vdso_enabled); + +static int __init vdso_setup(char *s) +{ + vdso_enabled = simple_strtoul(s, NULL, 0); + + return 1; +} + +__setup("vdso=", vdso_setup); + extern asmlinkage void sysenter_entry(void); void enable_sep_cpu(void) @@ -45,23 +66,120 @@ void enable_sep_cpu(void) */ extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; +static void *syscall_page; int __init sysenter_setup(void) { - void *page = (void *)get_zeroed_page(GFP_ATOMIC); + syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); - __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC); +#ifdef CONFIG_COMPAT_VDSO + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY); + printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); +#else + /* + * In the non-compat case the ELF coredumping code needs the fixmap: + */ + __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_KERNEL_RO); +#endif if (!boot_cpu_has(X86_FEATURE_SEP)) { - memcpy(page, + memcpy(syscall_page, &vsyscall_int80_start, &vsyscall_int80_end - &vsyscall_int80_start); return 0; } - memcpy(page, + memcpy(syscall_page, &vsyscall_sysenter_start, &vsyscall_sysenter_end - &vsyscall_sysenter_start); return 0; } + +static struct page *syscall_nopage(struct vm_area_struct *vma, + unsigned long adr, int *type) +{ + struct page *p = virt_to_page(adr - vma->vm_start + syscall_page); + get_page(p); + return p; +} + +/* Prevent VMA merging */ +static void syscall_vma_close(struct vm_area_struct *vma) +{ +} + +static struct vm_operations_struct syscall_vm_ops = { + .close = syscall_vma_close, + .nopage = syscall_nopage, +}; + +/* Defined in vsyscall-sysenter.S */ +extern void SYSENTER_RETURN; + +/* Setup a VMA at program startup for the vsyscall page */ +int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr; + int ret; + + down_write(&mm->mmap_sem); + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + ret = -ENOMEM; + goto up_fail; + } + + vma->vm_start = addr; + vma->vm_end = addr + PAGE_SIZE; + /* MAYWRITE to allow gdb to COW and set breakpoints */ + vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE; + vma->vm_flags |= mm->def_flags; + vma->vm_page_prot = protection_map[vma->vm_flags & 7]; + vma->vm_ops = &syscall_vm_ops; + vma->vm_mm = mm; + + ret = insert_vm_struct(mm, vma); + if (unlikely(ret)) { + kmem_cache_free(vm_area_cachep, vma); + goto up_fail; + } + + current->mm->context.vdso = (void *)addr; + current_thread_info()->sysenter_return = + (void *)VDSO_SYM(&SYSENTER_RETURN); + mm->total_vm++; +up_fail: + up_write(&mm->mmap_sem); + return ret; +} + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) + return "[vdso]"; + return NULL; +} + +struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +{ + return NULL; +} + +int in_gate_area(struct task_struct *task, unsigned long addr) +{ + return 0; +} + +int in_gate_area_no_task(unsigned long addr) +{ + return 0; +} diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 9d3074759856..86944acfb647 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -60,7 +60,6 @@ #include "mach_time.h" #include <linux/timex.h> -#include <linux/config.h> #include <asm/hpet.h> @@ -82,13 +81,6 @@ extern unsigned long wall_jiffies; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -#include <asm/i8253.h> - -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - -struct timer_opts *cur_timer __read_mostly = &timer_none; - /* * This is a special lock that is owned by the CPU and holds the index * register we are working with. It is required for NMI access to the @@ -118,99 +110,19 @@ void rtc_cmos_write(unsigned char val, unsigned char addr) } EXPORT_SYMBOL(rtc_cmos_write); -/* - * This version of gettimeofday has microsecond resolution - * and better than microsecond precision on fast x86 machines with TSC. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick; - - do { - unsigned long lost; - - seq = read_seqbegin(&xtime_lock); - - usec = cur_timer->get_offset(); - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } - else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= cur_timer->get_offset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int set_rtc_mmss(unsigned long nowtime) { int retval; - - WARN_ON(irqs_disabled()); + unsigned long flags; /* gets recalled with irq locally disabled */ - spin_lock_irq(&rtc_lock); + /* XXX - does irqsave resolve this? -johnstul */ + spin_lock_irqsave(&rtc_lock, flags); if (efi_enabled) retval = efi_set_rtc_mmss(nowtime); else retval = mach_set_rtc_mmss(nowtime); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return retval; } @@ -218,35 +130,50 @@ static int set_rtc_mmss(unsigned long nowtime) int timer_ack; -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - return cur_timer->monotonic_clock(); -} -EXPORT_SYMBOL(monotonic_clock); - -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - if (in_lock_functions(pc)) +#ifdef CONFIG_SMP + if (!user_mode_vm(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->ebp + 4); - +#else + unsigned long *sp; + if ((regs->xcs & 3) == 0) + sp = (unsigned long *)®s->esp; + else + sp = (unsigned long *)regs->esp; + /* Return address is either directly at stack pointer + or above a saved eflags. Eflags has bits 22-31 zero, + kernel addresses don't. */ + if (sp[0] >> 22) + return sp[0]; + if (sp[1] >> 22) + return sp[1]; +#endif + } +#endif return pc; } EXPORT_SYMBOL(profile_pc); -#endif /* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick + * This is the same as the above, except we _also_ save the current + * Time Stamp Counter value at the time of the timer interrupt, so that + * we later on can estimate the time of day more exactly. */ -static inline void do_timer_interrupt(int irq, struct pt_regs *regs) +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { + /* + * Here we are in the timer irq handler. We just have irqs locally + * disabled but we don't know if the timer_bh is running on the other + * CPU. We need to avoid to SMP race with it. NOTE: we don' t need + * the irq version of write_lock because as just said we have irq + * locally disabled. -arca + */ + write_seqlock(&xtime_lock); + #ifdef CONFIG_X86_IO_APIC if (timer_ack) { /* @@ -279,27 +206,6 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs) irq = inb_p( 0x61 ); /* read the current state */ outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ } -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - - cur_timer->mark_offset(); - - do_timer_interrupt(irq, regs); write_sequnlock(&xtime_lock); @@ -315,15 +221,16 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) unsigned long get_cmos_time(void) { unsigned long retval; + unsigned long flags; - spin_lock(&rtc_lock); + spin_lock_irqsave(&rtc_lock, flags); if (efi_enabled) retval = efi_get_time(); else retval = mach_get_cmos_time(); - spin_unlock(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return retval; } @@ -378,21 +285,19 @@ void notify_arch_cmos_timer(void) mod_timer(&sync_cmos_timer, jiffies + 1); } -static long clock_cmos_diff, sleep_start; +static long clock_cmos_diff; +static unsigned long sleep_start; -static struct timer_opts *last_timer; static int timer_suspend(struct sys_device *dev, pm_message_t state) { /* * Estimate time zone so that set_time can update the clock */ - clock_cmos_diff = -get_cmos_time(); + unsigned long ctime = get_cmos_time(); + + clock_cmos_diff = -ctime; clock_cmos_diff += get_seconds(); - sleep_start = get_cmos_time(); - last_timer = cur_timer; - cur_timer = &timer_none; - if (last_timer->suspend) - last_timer->suspend(state); + sleep_start = ctime; return 0; } @@ -400,25 +305,32 @@ static int timer_resume(struct sys_device *dev) { unsigned long flags; unsigned long sec; - unsigned long sleep_length; - + unsigned long ctime = get_cmos_time(); + long sleep_length = (ctime - sleep_start) * HZ; + struct timespec ts; + + if (sleep_length < 0) { + printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n"); + /* The time after the resume must not be earlier than the time + * before the suspend or some nasty things will happen + */ + sleep_length = 0; + ctime = sleep_start; + } #ifdef CONFIG_HPET_TIMER if (is_hpet_enabled()) hpet_reenable(); #endif setup_pit_timer(); - sec = get_cmos_time() + clock_cmos_diff; - sleep_length = (get_cmos_time() - sleep_start) * HZ; + + sec = ctime + clock_cmos_diff; + ts.tv_sec = sec; + ts.tv_nsec = 0; + do_settimeofday(&ts); write_seqlock_irqsave(&xtime_lock, flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; jiffies_64 += sleep_length; wall_jiffies += sleep_length; write_sequnlock_irqrestore(&xtime_lock, flags); - if (last_timer->resume) - last_timer->resume(); - cur_timer = last_timer; - last_timer = NULL; touch_softlockup_watchdog(); return 0; } @@ -451,24 +363,23 @@ extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + struct timespec ts; + ts.tv_sec = get_cmos_time(); + ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); + + do_settimeofday(&ts); if ((hpet_enable() >= 0) && hpet_use_timer) { printk("Using HPET for base-timer\n"); } - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - time_init_hook(); } #endif void __init time_init(void) { + struct timespec ts; #ifdef CONFIG_HPET_TIMER if (is_hpet_capable()) { /* @@ -479,13 +390,10 @@ void __init time_init(void) return; } #endif - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + ts.tv_sec = get_cmos_time(); + ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); + do_settimeofday(&ts); time_init_hook(); } diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c index a529f0cdce17..6bf14a4e995e 100644 --- a/arch/i386/kernel/time_hpet.c +++ b/arch/i386/kernel/time_hpet.c @@ -18,7 +18,6 @@ #include <asm/apic.h> #include <linux/timex.h> -#include <linux/config.h> #include <asm/hpet.h> #include <linux/hpet.h> @@ -302,23 +301,25 @@ int hpet_rtc_timer_init(void) hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; local_irq_save(flags); + cnt = hpet_readl(HPET_COUNTER); cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); hpet_writel(cnt, HPET_T1_CMP); hpet_t1_cmp = cnt; - local_irq_restore(flags); cfg = hpet_readl(HPET_T1_CFG); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; hpet_writel(cfg, HPET_T1_CFG); + local_irq_restore(flags); + return 1; } static void hpet_rtc_timer_reinit(void) { - unsigned int cfg, cnt; + unsigned int cfg, cnt, ticks_per_int, lost_ints; if (unlikely(!(PIE_on | AIE_on | UIE_on))) { cfg = hpet_readl(HPET_T1_CFG); @@ -333,10 +334,33 @@ static void hpet_rtc_timer_reinit(void) hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; /* It is more accurate to use the comparator value than current count.*/ - cnt = hpet_t1_cmp; - cnt += hpet_tick*HZ/hpet_rtc_int_freq; - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; + ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq; + hpet_t1_cmp += ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + /* + * If the interrupt handler was delayed too long, the write above tries + * to schedule the next interrupt in the past and the hardware would + * not interrupt until the counter had wrapped around. + * So we have to check that the comparator wasn't set to a past time. + */ + cnt = hpet_readl(HPET_COUNTER); + if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) { + lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1; + /* Make sure that, even with the time needed to execute + * this code, the next scheduled interrupt has been moved + * back to the future: */ + lost_ints++; + + hpet_t1_cmp += lost_ints * ticks_per_int; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + + if (PIE_on) + PIE_count += lost_ints; + + printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", + hpet_rtc_int_freq); + } } /* diff --git a/arch/i386/kernel/timers/Makefile b/arch/i386/kernel/timers/Makefile deleted file mode 100644 index 8fa12be658dd..000000000000 --- a/arch/i386/kernel/timers/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for x86 timers -# - -obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o - -obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o -obj-$(CONFIG_HPET_TIMER) += timer_hpet.o -obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c deleted file mode 100644 index 8163fe0cf1f0..000000000000 --- a/arch/i386/kernel/timers/common.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Common functions used across the timers go here - */ - -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/jiffies.h> -#include <linux/module.h> - -#include <asm/io.h> -#include <asm/timer.h> -#include <asm/hpet.h> - -#include "mach_timer.h" - -/* ------ Calibrate the TSC ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). - * Too much 64-bit arithmetic here to do this cleanly in C, and for - * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) - * output busy loop as low as possible. We avoid reading the CTC registers - * directly because of the awkward 8-bit access mechanism of the 82C54 - * device. - */ - -#define CALIBRATE_TIME (5 * 1000020/HZ) - -unsigned long calibrate_tsc(void) -{ - mach_prepare_counter(); - - { - unsigned long startlow, starthigh; - unsigned long endlow, endhigh; - unsigned long count; - - rdtsc(startlow,starthigh); - mach_countup(&count); - rdtsc(endlow,endhigh); - - - /* Error: ECTCNEVERSET */ - if (count <= 1) - goto bad_ctc; - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (endlow), "=d" (endhigh) - :"g" (startlow), "g" (starthigh), - "0" (endlow), "1" (endhigh)); - - /* Error: ECPUTOOFAST */ - if (endhigh) - goto bad_ctc; - - /* Error: ECPUTOOSLOW */ - if (endlow <= CALIBRATE_TIME) - goto bad_ctc; - - __asm__("divl %2" - :"=a" (endlow), "=d" (endhigh) - :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME)); - - return endlow; - } - - /* - * The CTC wasn't reliable: we got a hit on the very first read, - * or the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ -bad_ctc: - return 0; -} - -#ifdef CONFIG_HPET_TIMER -/* ------ Calibrate the TSC using HPET ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq. - * Second output is parameter 1 (when non NULL) - * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet(). - * calibrate_tsc() calibrates the processor TSC by comparing - * it to the HPET timer of known frequency. - * Too much 64-bit arithmetic here to do this cleanly in C - */ -#define CALIBRATE_CNT_HPET (5 * hpet_tick) -#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC) - -unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) -{ - unsigned long tsc_startlow, tsc_starthigh; - unsigned long tsc_endlow, tsc_endhigh; - unsigned long hpet_start, hpet_end; - unsigned long result, remain; - - hpet_start = hpet_readl(HPET_COUNTER); - rdtsc(tsc_startlow, tsc_starthigh); - do { - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET); - rdtsc(tsc_endlow, tsc_endhigh); - - /* 64-bit subtract - gcc just messes up with long longs */ - __asm__("subl %2,%0\n\t" - "sbbl %3,%1" - :"=a" (tsc_endlow), "=d" (tsc_endhigh) - :"g" (tsc_startlow), "g" (tsc_starthigh), - "0" (tsc_endlow), "1" (tsc_endhigh)); - - /* Error: ECPUTOOFAST */ - if (tsc_endhigh) - goto bad_calibration; - - /* Error: ECPUTOOSLOW */ - if (tsc_endlow <= CALIBRATE_TIME_HPET) - goto bad_calibration; - - ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET); - if (remain > (tsc_endlow >> 1)) - result++; /* rounding the result */ - - if (tsc_hpet_quotient_ptr) { - unsigned long tsc_hpet_quotient; - - ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0, - CALIBRATE_CNT_HPET); - if (remain > (tsc_endlow >> 1)) - tsc_hpet_quotient++; /* rounding the result */ - *tsc_hpet_quotient_ptr = tsc_hpet_quotient; - } - - return result; -bad_calibration: - /* - * the CPU was so fast/slow that the quotient wouldn't fit in - * 32 bits.. - */ - return 0; -} -#endif - - -unsigned long read_timer_tsc(void) -{ - unsigned long retval; - rdtscl(retval); - return retval; -} - - -/* calculate cpu_khz */ -void init_cpu_khz(void) -{ - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc(); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - } - } -} - diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c deleted file mode 100644 index 7e39ed8e33f8..000000000000 --- a/arch/i386/kernel/timers/timer.c +++ /dev/null @@ -1,75 +0,0 @@ -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <asm/timer.h> - -#ifdef CONFIG_HPET_TIMER -/* - * HPET memory read is slower than tsc reads, but is more dependable as it - * always runs at constant frequency and reduces complexity due to - * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use - * timer_pit when HPET is active. So, we default to timer_tsc. - */ -#endif -/* list of timers, ordered by preference, NULL terminated */ -static struct init_timer_opts* __initdata timers[] = { -#ifdef CONFIG_X86_CYCLONE_TIMER - &timer_cyclone_init, -#endif -#ifdef CONFIG_HPET_TIMER - &timer_hpet_init, -#endif -#ifdef CONFIG_X86_PM_TIMER - &timer_pmtmr_init, -#endif - &timer_tsc_init, - &timer_pit_init, - NULL, -}; - -static char clock_override[10] __initdata; - -static int __init clock_setup(char* str) -{ - if (str) - strlcpy(clock_override, str, sizeof(clock_override)); - return 1; -} -__setup("clock=", clock_setup); - - -/* The chosen timesource has been found to be bad. - * Fall back to a known good timesource (the PIT) - */ -void clock_fallback(void) -{ - cur_timer = &timer_pit; -} - -/* iterates through the list of timers, returning the first - * one that initializes successfully. - */ -struct timer_opts* __init select_timer(void) -{ - int i = 0; - - /* find most preferred working timer */ - while (timers[i]) { - if (timers[i]->init) - if (timers[i]->init(clock_override) == 0) - return timers[i]->opts; - ++i; - } - - panic("select_timer: Cannot find a suitable timer\n"); - return NULL; -} - -int read_current_timer(unsigned long *timer_val) -{ - if (cur_timer->read_timer) { - *timer_val = cur_timer->read_timer(); - return 0; - } - return -1; -} diff --git a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c deleted file mode 100644 index 13892a65c941..000000000000 --- a/arch/i386/kernel/timers/timer_cyclone.c +++ /dev/null @@ -1,259 +0,0 @@ -/* Cyclone-timer: - * This code implements timer_ops for the cyclone counter found - * on IBM x440, x360, and other Summit based systems. - * - * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com) - */ - - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -#include <asm/pgtable.h> -#include <asm/fixmap.h> -#include <asm/i8253.h> - -#include "io_ports.h" - -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -#define CYCLONE_CBAR_ADDR 0xFEB00CD0 -#define CYCLONE_PMCC_OFFSET 0x51A0 -#define CYCLONE_MPMC_OFFSET 0x51D0 -#define CYCLONE_MPCS_OFFSET 0x51A8 -#define CYCLONE_TIMER_FREQ 100000000 -#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */ -int use_cyclone = 0; - -static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */ -static u32 last_cyclone_low; -static u32 last_cyclone_high; -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* helper macro to atomically read both cyclone counter registers */ -#define read_cyclone_counter(low,high) \ - do{ \ - high = cyclone_timer[1]; low = cyclone_timer[0]; \ - } while (high != cyclone_timer[1]); - - -static void mark_offset_cyclone(void) -{ - unsigned long lost, delay; - unsigned long delta = last_cyclone_low; - int count; - unsigned long long this_offset, last_offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - - spin_lock(&i8253_lock); - read_cyclone_counter(last_cyclone_low,last_cyclone_high); - - /* read values for delay_at_last_interrupt */ - outb_p(0x00, 0x43); /* latch the count ASAP */ - - count = inb_p(0x40); /* read the latched count */ - count |= inb(0x40) << 8; - - /* - * VIA686a test code... reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - spin_unlock(&i8253_lock); - - /* lost tick compensation */ - delta = last_cyclone_low - delta; - delta /= (CYCLONE_TIMER_FREQ/1000000); - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2) - jiffies_64 += lost-1; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK; - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - - /* catch corner case where tick rollover occured - * between cyclone and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static unsigned long get_offset_cyclone(void) -{ - u32 offset; - - if(!cyclone_timer) - return delay_at_last_interrupt; - - /* Read the cyclone timer */ - offset = cyclone_timer[0]; - - /* .. relative to previous jiffy */ - offset = offset - last_cyclone_low; - - /* convert cyclone ticks to microseconds */ - /* XXX slow, can we speed this up? */ - offset = offset/(CYCLONE_TIMER_FREQ/1000000); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + offset; -} - -static unsigned long long monotonic_clock_cyclone(void) -{ - u32 now_low, now_high; - unsigned long long last_offset, this_offset, base; - unsigned long long ret; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - - /* Read the cyclone counter */ - read_cyclone_counter(now_low,now_high); - this_offset = ((unsigned long long)now_high<<32)|now_low; - - /* convert to nanoseconds */ - ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK); - return ret * (1000000000 / CYCLONE_TIMER_FREQ); -} - -static int __init init_cyclone(char* override) -{ - u32* reg; - u32 base; /* saved cyclone base address */ - u32 pageaddr; /* page that contains cyclone_timer register */ - u32 offset; /* offset from pageaddr to cyclone_timer register */ - int i; - - /* check clock override */ - if (override[0] && strncmp(override,"cyclone",7)) - return -ENODEV; - - /*make sure we're on a summit box*/ - if(!use_cyclone) return -ENODEV; - - printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n"); - - /* find base address */ - pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK; - offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n"); - return -ENODEV; - } - base = *reg; - if(!base){ - printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n"); - return -ENODEV; - } - - /* setup PMCC */ - pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* setup MPCS */ - pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!reg){ - printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n"); - return -ENODEV; - } - reg[0] = 0x00000001; - - /* map in cyclone_timer */ - pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK; - offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK); - set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr); - cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset); - if(!cyclone_timer){ - printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n"); - return -ENODEV; - } - - /*quick test to make sure its ticking*/ - for(i=0; i<3; i++){ - u32 old = cyclone_timer[0]; - int stall = 100; - while(stall--) barrier(); - if(cyclone_timer[0] == old){ - printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n"); - cyclone_timer = 0; - return -ENODEV; - } - } - - init_cpu_khz(); - - /* Everything looks good! */ - return 0; -} - - -static void delay_cyclone(unsigned long loops) -{ - unsigned long bclock, now; - if(!cyclone_timer) - return; - bclock = cyclone_timer[0]; - do { - rep_nop(); - now = cyclone_timer[0]; - } while ((now-bclock) < loops); -} -/************************************************************/ - -/* cyclone timer_opts struct */ -static struct timer_opts timer_cyclone = { - .name = "cyclone", - .mark_offset = mark_offset_cyclone, - .get_offset = get_offset_cyclone, - .monotonic_clock = monotonic_clock_cyclone, - .delay = delay_cyclone, -}; - -struct init_timer_opts __initdata timer_cyclone_init = { - .init = init_cyclone, - .opts = &timer_cyclone, -}; diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c deleted file mode 100644 index 17a6fe7166e7..000000000000 --- a/arch/i386/kernel/timers/timer_hpet.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -#include <asm/processor.h> - -#include "io_ports.h" -#include "mach_timer.h" -#include <asm/hpet.h> - -static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */ -static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */ -static unsigned long hpet_last; /* hpet counter value at last tick*/ -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static unsigned long long monotonic_clock_hpet(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -static unsigned long get_offset_hpet(void) -{ - register unsigned long eax, edx; - - eax = hpet_readl(HPET_COUNTER); - eax -= hpet_last; /* hpet delta */ - eax = min(hpet_tick, eax); - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - * - * Using a mull instead of a divl saves some cycles in critical path. - */ - ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax); - - /* our adjusted time offset in microseconds */ - return edx; -} - -static void mark_offset_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - offset = hpet_readl(HPET_COUNTER); - if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) { - int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1; - jiffies_64 += lost_ticks; - } - hpet_last = offset; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); -} - -static void delay_hpet(unsigned long loops) -{ - unsigned long hpet_start, hpet_end; - unsigned long eax; - - /* loops is the number of cpu cycles. Convert it to hpet clocks */ - ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops); - - hpet_start = hpet_readl(HPET_COUNTER); - do { - rep_nop(); - hpet_end = hpet_readl(HPET_COUNTER); - } while ((hpet_end - hpet_start) < (loops)); -} - -static struct timer_opts timer_hpet; - -static int __init init_hpet(char* override) -{ - unsigned long result, remain; - - /* check clock override */ - if (override[0] && strncmp(override,"hpet",4)) - return -ENODEV; - - if (!is_hpet_enabled()) - return -ENODEV; - - printk("Using HPET for gettimeofday\n"); - if (cpu_has_tsc) { - unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient); - if (tsc_quotient) { - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, - eax, edx); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - } - /* set this only when cpu_has_tsc */ - timer_hpet.read_timer = read_timer_tsc; - } - - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - hpet_usec_quotient = result; - - return 0; -} - -static int hpet_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); - - if (hpet_use_timer) - hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - hpet_last = hpet_readl(HPET_COUNTER); - write_sequnlock(&monotonic_lock); - return 0; -} -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_hpet __read_mostly = { - .name = "hpet", - .mark_offset = mark_offset_hpet, - .get_offset = get_offset_hpet, - .monotonic_clock = monotonic_clock_hpet, - .delay = delay_hpet, - .resume = hpet_resume, -}; - -struct init_timer_opts __initdata timer_hpet_init = { - .init = init_hpet, - .opts = &timer_hpet, -}; diff --git a/arch/i386/kernel/timers/timer_none.c b/arch/i386/kernel/timers/timer_none.c deleted file mode 100644 index 4ea2f414dbbd..000000000000 --- a/arch/i386/kernel/timers/timer_none.c +++ /dev/null @@ -1,39 +0,0 @@ -#include <linux/init.h> -#include <asm/timer.h> - -static void mark_offset_none(void) -{ - /* nothing needed */ -} - -static unsigned long get_offset_none(void) -{ - return 0; -} - -static unsigned long long monotonic_clock_none(void) -{ - return 0; -} - -static void delay_none(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - -/* none timer_opts struct */ -struct timer_opts timer_none = { - .name = "none", - .mark_offset = mark_offset_none, - .get_offset = get_offset_none, - .monotonic_clock = monotonic_clock_none, - .delay = delay_none, -}; diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c deleted file mode 100644 index b9b6bd56b9ba..000000000000 --- a/arch/i386/kernel/timers/timer_pit.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/sysdev.h> -#include <linux/timex.h> -#include <asm/delay.h> -#include <asm/mpspec.h> -#include <asm/timer.h> -#include <asm/smp.h> -#include <asm/io.h> -#include <asm/arch_hooks.h> -#include <asm/i8253.h> - -#include "do_timer.h" -#include "io_ports.h" - -static int count_p; /* counter in get_offset_pit() */ - -static int __init init_pit(char* override) -{ - /* check clock override */ - if (override[0] && strncmp(override,"pit",3)) - printk(KERN_ERR "Warning: clock= override failed. Defaulting " - "to PIT\n"); - init_cpu_khz(); - count_p = LATCH; - return 0; -} - -static void mark_offset_pit(void) -{ - /* nothing needed */ -} - -static unsigned long long monotonic_clock_pit(void) -{ - return 0; -} - -static void delay_pit(unsigned long loops) -{ - int d0; - __asm__ __volatile__( - "\tjmp 1f\n" - ".align 16\n" - "1:\tjmp 2f\n" - ".align 16\n" - "2:\tdecl %0\n\tjns 2b" - :"=&a" (d0) - :"0" (loops)); -} - - -/* This function must be called with xtime_lock held. - * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs - * - * However, the pc-audio speaker driver changes the divisor so that - * it gets interrupted rather more often - it loads 64 into the - * counter rather than 11932! This has an adverse impact on - * do_gettimeoffset() -- it stops working! What is also not - * good is that the interval that our timer function gets called - * is no longer 10.0002 ms, but 9.9767 ms. To get around this - * would require using a different timing source. Maybe someone - * could use the RTC - I know that this can interrupt at frequencies - * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix - * it so that at startup, the timer code in sched.c would select - * using either the RTC or the 8253 timer. The decision would be - * based on whether there was any other device around that needed - * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz, - * and then do some jiggery to have a version of do_timer that - * advanced the clock by 1/1024 s. Every time that reached over 1/100 - * of a second, then do all the old code. If the time was kept correct - * then do_gettimeoffset could just return 0 - there is no low order - * divider that can be accessed. - * - * Ideally, you would be able to use the RTC for the speaker driver, - * but it appears that the speaker driver really needs interrupt more - * often than every 120 us or so. - * - * Anyway, this needs more thought.... pjsg (1993-08-28) - * - * If you are really that interested, you should be reading - * comp.protocols.time.ntp! - */ - -static unsigned long get_offset_pit(void) -{ - int count; - unsigned long flags; - static unsigned long jiffies_p = 0; - - /* - * cache volatile jiffies temporarily; we have xtime_lock. - */ - unsigned long jiffies_t; - - spin_lock_irqsave(&i8253_lock, flags); - /* timer count may underflow right here */ - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - - /* - * We do this guaranteed double memory access instead of a _p - * postfix in the previous port access. Wheee, hackady hack - */ - jiffies_t = jiffies; - - count |= inb_p(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * avoiding timer inconsistencies (they are rare, but they happen)... - * there are two kinds of problems that must be avoided here: - * 1. the timer counter underflows - * 2. hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) - */ - - if( jiffies_t == jiffies_p ) { - if( count > count_p ) { - /* the nutcase */ - count = do_timer_overflow(count); - } - } else - jiffies_p = jiffies_t; - - count_p = count; - - spin_unlock_irqrestore(&i8253_lock, flags); - - count = ((LATCH-1) - count) * TICK_SIZE; - count = (count + LATCH/2) / LATCH; - - return count; -} - - -/* tsc timer_opts struct */ -struct timer_opts timer_pit = { - .name = "pit", - .mark_offset = mark_offset_pit, - .get_offset = get_offset_pit, - .monotonic_clock = monotonic_clock_pit, - .delay = delay_pit, -}; - -struct init_timer_opts __initdata timer_pit_init = { - .init = init_pit, - .opts = &timer_pit, -}; - -void setup_pit_timer(void) -{ - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c deleted file mode 100644 index 144e94a04933..000000000000 --- a/arch/i386/kernel/timers/timer_pm.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * (C) Dominik Brodowski <linux@brodo.de> 2003 - * - * Driver to use the Power Management Timer (PMTMR) available in some - * southbridges as primary timing source for the Linux kernel. - * - * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c, - * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4. - * - * This file is licensed under the GPL v2. - */ - - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <asm/types.h> -#include <asm/timer.h> -#include <asm/smp.h> -#include <asm/io.h> -#include <asm/arch_hooks.h> - -#include <linux/timex.h> -#include "mach_timer.h" - -/* Number of PMTMR ticks expected during calibration run */ -#define PMTMR_TICKS_PER_SEC 3579545 -#define PMTMR_EXPECTED_RATE \ - ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) - - -/* The I/O port the PMTMR resides at. - * The location is detected during setup_arch(), - * in arch/i386/acpi/boot.c */ -u32 pmtmr_ioport = 0; - - -/* value of the Power timer at last timer interrupt */ -static u32 offset_tick; -static u32 offset_delay; - -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - -static int pmtmr_need_workaround __read_mostly = 1; - -/*helper function to safely read acpi pm timesource*/ -static inline u32 read_pmtmr(void) -{ - if (pmtmr_need_workaround) { - u32 v1, v2, v3; - - /* It has been reported that because of various broken - * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time - * source is not latched, so you must read it multiple - * times to insure a safe value is read. - */ - do { - v1 = inl(pmtmr_ioport); - v2 = inl(pmtmr_ioport); - v3 = inl(pmtmr_ioport); - } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) - || (v3 > v1 && v3 < v2)); - - /* mask the output to 24 bits */ - return v2 & ACPI_PM_MASK; - } - - return inl(pmtmr_ioport) & ACPI_PM_MASK; -} - - -/* - * Some boards have the PMTMR running way too fast. We check - * the PMTMR rate against PIT channel 2 to catch these cases. - */ -static int verify_pmtmr_rate(void) -{ - u32 value1, value2; - unsigned long count, delta; - - mach_prepare_counter(); - value1 = read_pmtmr(); - mach_countup(&count); - value2 = read_pmtmr(); - delta = (value2 - value1) & ACPI_PM_MASK; - - /* Check that the PMTMR delta is within 5% of what we expect */ - if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 || - delta > (PMTMR_EXPECTED_RATE * 21) / 20) { - printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE); - return -1; - } - - return 0; -} - - -static int init_pmtmr(char* override) -{ - u32 value1, value2; - unsigned int i; - - if (override[0] && strncmp(override,"pmtmr",5)) - return -ENODEV; - - if (!pmtmr_ioport) - return -ENODEV; - - /* we use the TSC for delay_pmtmr, so make sure it exists */ - if (!cpu_has_tsc) - return -ENODEV; - - /* "verify" this timing source */ - value1 = read_pmtmr(); - for (i = 0; i < 10000; i++) { - value2 = read_pmtmr(); - if (value2 == value1) - continue; - if (value2 > value1) - goto pm_good; - if ((value2 < value1) && ((value2) < 0xFFF)) - goto pm_good; - printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2); - return -EINVAL; - } - printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1); - return -ENODEV; - -pm_good: - if (verify_pmtmr_rate() != 0) - return -ENODEV; - - init_cpu_khz(); - return 0; -} - -static inline u32 cyc2us(u32 cycles) -{ - /* The Power Management Timer ticks at 3.579545 ticks per microsecond. - * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%] - * - * Even with HZ = 100, delta is at maximum 35796 ticks, so it can - * easily be multiplied with 286 (=0x11E) without having to fear - * u32 overflows. - */ - cycles *= 286; - return (cycles >> 10); -} - -/* - * this gets called during each timer interrupt - * - Called while holding the writer xtime_lock - */ -static void mark_offset_pmtmr(void) -{ - u32 lost, delta, last_offset; - static int first_run = 1; - last_offset = offset_tick; - - write_seqlock(&monotonic_lock); - - offset_tick = read_pmtmr(); - - /* calculate tick interval */ - delta = (offset_tick - last_offset) & ACPI_PM_MASK; - - /* convert to usecs */ - delta = cyc2us(delta); - - /* update the monotonic base value */ - monotonic_base += delta * NSEC_PER_USEC; - write_sequnlock(&monotonic_lock); - - /* convert to ticks */ - delta += offset_delay; - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - - /* compensate for lost ticks */ - if (lost >= 2) - jiffies_64 += lost - 1; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } -} - -static int pmtmr_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - offset_tick = read_pmtmr(); - write_sequnlock(&monotonic_lock); - return 0; -} - -static unsigned long long monotonic_clock_pmtmr(void) -{ - u32 last_offset, this_offset; - unsigned long long base, ret; - unsigned seq; - - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = offset_tick; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the pmtmr */ - this_offset = read_pmtmr(); - - /* convert to nanoseconds */ - ret = (this_offset - last_offset) & ACPI_PM_MASK; - ret = base + (cyc2us(ret) * NSEC_PER_USEC); - return ret; -} - -static void delay_pmtmr(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - - -/* - * get the offset (in microseconds) from the last call to mark_offset() - * - Called holding a reader xtime_lock - */ -static unsigned long get_offset_pmtmr(void) -{ - u32 now, offset, delta = 0; - - offset = offset_tick; - now = read_pmtmr(); - delta = (now - offset)&ACPI_PM_MASK; - - return (unsigned long) offset_delay + cyc2us(delta); -} - - -/* acpi timer_opts struct */ -static struct timer_opts timer_pmtmr = { - .name = "pmtmr", - .mark_offset = mark_offset_pmtmr, - .get_offset = get_offset_pmtmr, - .monotonic_clock = monotonic_clock_pmtmr, - .delay = delay_pmtmr, - .read_timer = read_timer_tsc, - .resume = pmtmr_resume, -}; - -struct init_timer_opts __initdata timer_pmtmr_init = { - .init = init_pmtmr, - .opts = &timer_pmtmr, -}; - -#ifdef CONFIG_PCI -/* - * PIIX4 Errata: - * - * The power management timer may return improper results when read. - * Although the timer value settles properly after incrementing, - * while incrementing there is a 3 ns window every 69.8 ns where the - * timer value is indeterminate (a 4.2% chance that the data will be - * incorrect when read). As a result, the ACPI free running count up - * timer specification is violated due to erroneous reads. - */ -static int __init pmtmr_bug_check(void) -{ - static struct pci_device_id gray_list[] __initdata = { - /* these chipsets may have bug. */ - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801DB_0) }, - { }, - }; - struct pci_dev *dev; - int pmtmr_has_bug = 0; - u8 rev; - - if (cur_timer != &timer_pmtmr || !pmtmr_need_workaround) - return 0; - - dev = pci_get_device(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82371AB_3, NULL); - if (dev) { - pci_read_config_byte(dev, PCI_REVISION_ID, &rev); - /* the bug has been fixed in PIIX4M */ - if (rev < 3) { - printk(KERN_WARNING "* Found PM-Timer Bug on this " - "chipset. Due to workarounds for a bug,\n" - "* this time source is slow. Consider trying " - "other time sources (clock=)\n"); - pmtmr_has_bug = 1; - } - pci_dev_put(dev); - } - - if (pci_dev_present(gray_list)) { - printk(KERN_WARNING "* This chipset may have PM-Timer Bug. Due" - " to workarounds for a bug,\n" - "* this time source is slow. If you are sure your timer" - " does not have\n" - "* this bug, please use \"pmtmr_good\" to disable the " - "workaround\n"); - pmtmr_has_bug = 1; - } - - if (!pmtmr_has_bug) - pmtmr_need_workaround = 0; - - return 0; -} -device_initcall(pmtmr_bug_check); -#endif - -static int __init pmtr_good_setup(char *__str) -{ - pmtmr_need_workaround = 0; - return 1; -} -__setup("pmtmr_good", pmtr_good_setup); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>"); -MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86"); diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c deleted file mode 100644 index f1187ddb0d0f..000000000000 --- a/arch/i386/kernel/timers/timer_tsc.c +++ /dev/null @@ -1,617 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - * - * 2004-06-25 Jesper Juhl - * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4 - * failing to inline. - */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/cpufreq.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -/* processor.h for distable_tsc flag */ -#include <asm/processor.h> - -#include "io_ports.h" -#include "mach_timer.h" - -#include <asm/hpet.h> -#include <asm/i8253.h> - -#ifdef CONFIG_HPET_TIMER -static unsigned long hpet_usec_quotient; -static unsigned long hpet_last; -static struct timer_opts timer_tsc; -#endif - -static inline void cpufreq_delayed_get(void); - -int tsc_disable __devinitdata = 0; - -static int use_tsc; -/* Number of usecs that the last interrupt was delayed */ -static int delay_at_last_interrupt; - -static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ -static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */ -static unsigned long long monotonic_base; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* Avoid compensating for lost ticks before TSCs are synched */ -static int detect_lost_ticks; -static int __init start_lost_tick_compensation(void) -{ - detect_lost_ticks = 1; - return 0; -} -late_initcall(start_lost_tick_compensation); - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_khz * 10^3)) - * ns = cycles * (10^6 / cpu_khz) - * - * Then we use scaling math (suggested by george@mvista.com) to get: - * ns = cycles * (10^6 * SC / cpu_khz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * - * We can use khz divisor instead of mhz to keep a better percision, since - * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. - * (mathieu.desnoyers@polymtl.ca) - * - * -johnstul@us.ibm.com "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -static int count2; /* counter for mark_offset_tsc() */ - -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. - */ -static unsigned long fast_gettimeoffset_quotient; - -static unsigned long get_offset_tsc(void) -{ - register unsigned long eax, edx; - - /* Read the Time Stamp Counter */ - - rdtsc(eax,edx); - - /* .. relative to previous jiffy (32 bits is enough) */ - eax -= last_tsc_low; /* tsc_low delta */ - - /* - * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient - * = (tsc_low delta) * (usecs_per_clock) - * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) - * - * Using a mull instead of a divl saves up to 31 clock cycles - * in the critical path. - */ - - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - - /* our adjusted time offset in microseconds */ - return delay_at_last_interrupt + edx; -} - -static unsigned long long monotonic_clock_tsc(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -/* - * Scheduler clock - returns current time in nanosec units. - */ -unsigned long long sched_clock(void) -{ - unsigned long long this_offset; - - /* - * In the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. - */ -#ifndef CONFIG_NUMA - if (!use_tsc) -#endif - /* no locking but a rare wrong value is not a big deal */ - return jiffies_64 * (1000000000 / HZ); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - -static void delay_tsc(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - -#ifdef CONFIG_HPET_TIMER -static void mark_offset_tsc_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - /* read Pentium cycle counter */ - - hpet_current = hpet_readl(HPET_COUNTER); - rdtsc(last_tsc_low, last_tsc_high); - - /* lost tick compensation */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0)) - && detect_lost_ticks) { - int lost_ticks = (offset - hpet_last) / hpet_tick; - jiffies_64 += lost_ticks; - } - hpet_last = hpet_current; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - */ - delay_at_last_interrupt = hpet_current - offset; - ASM_MUL64_REG(temp, delay_at_last_interrupt, - hpet_usec_quotient, delay_at_last_interrupt); -} -#endif - - -#ifdef CONFIG_CPU_FREQ -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - -/* If the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. - */ - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -#ifndef CONFIG_SMP -static unsigned long fast_gettimeoffset_ref = 0; -static unsigned int cpu_khz_ref = 0; -#endif - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { - if (!freq->old){ - ref_freq = freq->new; - goto end; - } - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; -#ifndef CONFIG_SMP - fast_gettimeoffset_ref = fast_gettimeoffset_quotient; - cpu_khz_ref = cpu_khz; -#endif - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); -#ifndef CONFIG_SMP - if (cpu_khz) - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (use_tsc) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); - set_cyc2ns_scale(cpu_khz); - } - } -#endif - } - -end: - if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) - write_sequnlock_irq(&xtime_lock); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - - -static int __init cpufreq_tsc(void) -{ - int ret; - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - return ret; -} -core_initcall(cpufreq_tsc); - -#else /* CONFIG_CPU_FREQ */ -static inline void cpufreq_delayed_get(void) { return; } -#endif - -int recalibrate_cpu_khz(void) -{ -#ifndef CONFIG_SMP - unsigned int cpu_khz_old = cpu_khz; - - if (cpu_has_tsc) { - local_irq_disable(); - init_cpu_khz(); - local_irq_enable(); - cpu_data[0].loops_per_jiffy = - cpufreq_scale(cpu_data[0].loops_per_jiffy, - cpu_khz_old, - cpu_khz); - return 0; - } else - return -ENODEV; -#else - return -ENODEV; -#endif -} -EXPORT_SYMBOL(recalibrate_cpu_khz); - -static void mark_offset_tsc(void) -{ - unsigned long lost,delay; - unsigned long delta = last_tsc_low; - int count; - int countmp; - static int count1 = 0; - unsigned long long this_offset, last_offset; - static int lost_count = 0; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. -arca - */ - - /* read Pentium cycle counter */ - - rdtsc(last_tsc_low, last_tsc_high); - - spin_lock(&i8253_lock); - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb(PIT_CH0) << 8; - - /* - * VIA686a test code... reset the latch if count > max + 1 - * from timer_pit.c - cjb - */ - if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - spin_unlock(&i8253_lock); - - if (pit_latch_buggy) { - /* get center value of last 3 time lutch */ - if ((count2 >= count && count >= count1) - || (count1 >= count && count >= count2)) { - count2 = count1; count1 = count; - } else if ((count1 >= count2 && count2 >= count) - || (count >= count2 && count2 >= count1)) { - countmp = count;count = count2; - count2 = count1;count1 = countmp; - } else { - count2 = count1; count1 = count; count = count1; - } - } - - /* lost tick compensation */ - delta = last_tsc_low - delta; - { - register unsigned long eax, edx; - eax = delta; - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - delta = edx; - } - delta += delay_at_last_interrupt; - lost = delta/(1000000/HZ); - delay = delta%(1000000/HZ); - if (lost >= 2 && detect_lost_ticks) { - jiffies_64 += lost-1; - - /* sanity check to ensure we're not always losing ticks */ - if (lost_count++ > 100) { - printk(KERN_WARNING "Losing too many ticks!\n"); - printk(KERN_WARNING "TSC cannot be used as a timesource. \n"); - printk(KERN_WARNING "Possible reasons for this are:\n"); - printk(KERN_WARNING " You're running with Speedstep,\n"); - printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n"); - printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n"); - printk(KERN_WARNING "Falling back to a sane timesource now.\n"); - - clock_fallback(); - } - /* ... but give the TSC a fair chance */ - if (lost_count > 25) - cpufreq_delayed_get(); - } else - lost_count = 0; - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - count = ((LATCH-1) - count) * TICK_SIZE; - delay_at_last_interrupt = (count + LATCH/2) / LATCH; - - /* catch corner case where tick rollover occured - * between tsc and pit reads (as noted when - * usec delta is > 90% # of usecs/tick) - */ - if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ)) - jiffies_64++; -} - -static int __init init_tsc(char* override) -{ - - /* check clock override */ - if (override[0] && strncmp(override,"tsc",3)) { -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) { - printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n"); - } else -#endif - { - return -ENODEV; - } - } - - /* - * If we have APM enabled or the CPU clock speed is variable - * (CPU stops clock on HLT or slows clock to save power) - * then the TSC timestamps may diverge by up to 1 jiffy from - * 'real time' but nothing will break. - * The most frequent case is that the CPU is "woken" from a halt - * state by the timer interrupt itself, so we get 0 error. In the - * rare cases where a driver would "wake" the CPU and request a - * timestamp, the maximum error is < 1 jiffy. But timestamps are - * still perfectly ordered. - * Note that the TSC counter will be reset if APM suspends - * to disk; this won't break the kernel, though, 'cuz we're - * smart. See arch/i386/kernel/apm.c. - */ - /* - * Firstly we have to do a CPU check for chips with - * a potentially buggy TSC. At this point we haven't run - * the ident/bugs checks so we must run this hook as it - * may turn off the TSC flag. - * - * NOTE: this doesn't yet handle SMP 486 machines where only - * some CPU's have a TSC. Thats never worked and nobody has - * moaned if you have the only one in the world - you fix it! - */ - - count2 = LATCH; /* initialize counter for mark_offset_tsc() */ - - if (cpu_has_tsc) { - unsigned long tsc_quotient; -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) { - unsigned long result, remain; - printk("Using TSC for gettimeofday\n"); - tsc_quotient = calibrate_tsc_hpet(NULL); - timer_tsc.mark_offset = &mark_offset_tsc_hpet; - /* - * Math to calculate hpet to usec multiplier - * Look for the comments at get_offset_tsc_hpet() - */ - ASM_DIV64_REG(result, remain, hpet_tick, - 0, KERNEL_TICK_USEC); - if (remain > (hpet_tick >> 1)) - result++; /* rounding the result */ - - hpet_usec_quotient = result; - } else -#endif - { - tsc_quotient = calibrate_tsc(); - } - - if (tsc_quotient) { - fast_gettimeoffset_quotient = tsc_quotient; - use_tsc = 1; - /* - * We could be more selective here I suspect - * and just enable this for the next intel chips ? - */ - /* report CPU clock rate in Hz. - * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) = - * clock/second. Our precision is about 100 ppm. - */ - { unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (cpu_khz), "=d" (edx) - :"r" (tsc_quotient), - "0" (eax), "1" (edx)); - printk("Detected %u.%03u MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - } - set_cyc2ns_scale(cpu_khz); - return 0; - } - } - return -ENODEV; -} - -static int tsc_resume(void) -{ - write_seqlock(&monotonic_lock); - /* Assume this is the last mark offset time */ - rdtsc(last_tsc_low, last_tsc_high); -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled() && hpet_use_timer) - hpet_last = hpet_readl(HPET_COUNTER); -#endif - write_sequnlock(&monotonic_lock); - return 0; -} - -#ifndef CONFIG_X86_TSC -/* disable flag for tsc. Takes effect by clearing the TSC cpu flag - * in cpu/common.c */ -static int __init tsc_setup(char *str) -{ - tsc_disable = 1; - return 1; -} -#else -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC.\n"); - return 1; -} -#endif -__setup("notsc", tsc_setup); - - - -/************************************************************/ - -/* tsc timer_opts struct */ -static struct timer_opts timer_tsc = { - .name = "tsc", - .mark_offset = mark_offset_tsc, - .get_offset = get_offset_tsc, - .monotonic_clock = monotonic_clock_tsc, - .delay = delay_tsc, - .read_timer = read_timer_tsc, - .resume = tsc_resume, -}; - -struct init_timer_opts __initdata timer_tsc_init = { - .init = init_tsc, - .opts = &timer_tsc, -}; diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c index 296355292c7c..07d6da36a825 100644 --- a/arch/i386/kernel/topology.c +++ b/arch/i386/kernel/topology.c @@ -28,19 +28,13 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/nodemask.h> +#include <linux/mmzone.h> #include <asm/cpu.h> static struct i386_cpu cpu_devices[NR_CPUS]; -int arch_register_cpu(int num){ - struct node *parent = NULL; - -#ifdef CONFIG_NUMA - int node = cpu_to_node(num); - if (node_online(node)) - parent = &node_devices[node].node; -#endif /* CONFIG_NUMA */ - +int arch_register_cpu(int num) +{ /* * CPU0 cannot be offlined due to several * restrictions and assumptions in kernel. This basically @@ -50,57 +44,30 @@ int arch_register_cpu(int num){ if (!num) cpu_devices[num].cpu.no_control = 1; - return register_cpu(&cpu_devices[num].cpu, num, parent); + return register_cpu(&cpu_devices[num].cpu, num); } #ifdef CONFIG_HOTPLUG_CPU void arch_unregister_cpu(int num) { - struct node *parent = NULL; - -#ifdef CONFIG_NUMA - int node = cpu_to_node(num); - if (node_online(node)) - parent = &node_devices[node].node; -#endif /* CONFIG_NUMA */ - - return unregister_cpu(&cpu_devices[num].cpu, parent); + return unregister_cpu(&cpu_devices[num].cpu); } EXPORT_SYMBOL(arch_register_cpu); EXPORT_SYMBOL(arch_unregister_cpu); #endif /*CONFIG_HOTPLUG_CPU*/ - - -#ifdef CONFIG_NUMA -#include <linux/mmzone.h> -#include <asm/node.h> - -struct i386_node node_devices[MAX_NUMNODES]; - static int __init topology_init(void) { int i; +#ifdef CONFIG_NUMA for_each_online_node(i) - arch_register_node(i); - - for_each_present_cpu(i) - arch_register_cpu(i); - return 0; -} - -#else /* !CONFIG_NUMA */ - -static int __init topology_init(void) -{ - int i; + register_one_node(i); +#endif /* CONFIG_NUMA */ for_each_present_cpu(i) arch_register_cpu(i); return 0; } -#endif /* CONFIG_NUMA */ - subsys_initcall(topology_init); diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index dcc14477af1f..6820b8d643c7 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -11,7 +11,6 @@ * 'Traps.c' handles hardware traps and faults after we have saved some * state in 'asm.s'. */ -#include <linux/config.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/string.h> @@ -28,6 +27,8 @@ #include <linux/utsname.h> #include <linux/kprobes.h> #include <linux/kexec.h> +#include <linux/unwind.h> +#include <linux/uaccess.h> #ifdef CONFIG_EISA #include <linux/ioport.h> @@ -40,22 +41,24 @@ #include <asm/processor.h> #include <asm/system.h> -#include <asm/uaccess.h> #include <asm/io.h> #include <asm/atomic.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/i387.h> #include <asm/nmi.h> - +#include <asm/unwind.h> #include <asm/smp.h> #include <asm/arch_hooks.h> #include <asm/kdebug.h> +#include <asm/stacktrace.h> #include <linux/module.h> #include "mach_traps.h" +int panic_on_unrecovered_nmi; + asmlinkage int system_call(void); struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, @@ -92,6 +95,11 @@ asmlinkage void spurious_interrupt_bug(void); asmlinkage void machine_check(void); static int kstack_depth_to_print = 24; +#ifdef CONFIG_STACK_UNWIND +static int call_trace = 1; +#else +#define call_trace (-1) +#endif ATOMIC_NOTIFIER_HEAD(i386die_chain); int register_die_notifier(struct notifier_block *nb) @@ -99,13 +107,13 @@ int register_die_notifier(struct notifier_block *nb) vmalloc_sync_all(); return atomic_notifier_chain_register(&i386die_chain, nb); } -EXPORT_SYMBOL(register_die_notifier); +EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ int unregister_die_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&i386die_chain, nb); } -EXPORT_SYMBOL(unregister_die_notifier); +EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) { @@ -113,42 +121,16 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) p < (void *)tinfo + THREAD_SIZE - 3; } -/* - * Print CONFIG_STACK_BACKTRACE_COLS address/symbol entries per line. - */ -static inline int print_addr_and_symbol(unsigned long addr, char *log_lvl, - int printed) -{ - if (!printed) - printk(log_lvl); - -#if CONFIG_STACK_BACKTRACE_COLS == 1 - printk(" [<%08lx>] ", addr); -#else - printk(" <%08lx> ", addr); -#endif - print_symbol("%s", addr); - - printed = (printed + 1) % CONFIG_STACK_BACKTRACE_COLS; - if (printed) - printk(" "); - else - printk("\n"); - - return printed; -} - static inline unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long ebp, - char *log_lvl) + struct stacktrace_ops *ops, void *data) { unsigned long addr; - int printed = 0; /* nr of entries already printed on current line */ #ifdef CONFIG_FRAME_POINTER while (valid_stack_ptr(tinfo, (void *)ebp)) { addr = *(unsigned long *)(ebp + 4); - printed = print_addr_and_symbol(addr, log_lvl, printed); + ops->address(data, addr); /* * break out of recursive entries (such as * end_of_stack_stop_unwind_function): @@ -161,50 +143,160 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo, while (valid_stack_ptr(tinfo, stack)) { addr = *stack++; if (__kernel_text_address(addr)) - printed = print_addr_and_symbol(addr, log_lvl, printed); + ops->address(data, addr); } #endif - if (printed) - printk("\n"); - return ebp; } -static void show_trace_log_lvl(struct task_struct *task, - unsigned long *stack, char *log_lvl) +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + +static asmlinkage int +dump_trace_unwind(struct unwind_frame_info *info, void *data) +{ + struct ops_and_data *oad = (struct ops_and_data *)data; + int n = 0; + + while (unwind(info) == 0 && UNW_PC(info)) { + n++; + oad->ops->address(oad->data, UNW_PC(info)); + if (arch_unw_user_mode(info)) + break; + } + return n; +} + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, + struct stacktrace_ops *ops, void *data) { - unsigned long ebp; + unsigned long ebp = 0; if (!task) task = current; - if (task == current) { - /* Grab ebp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (ebp) : ); - } else { - /* ebp is the last reg pushed by switch_to */ - ebp = *(unsigned long *) task->thread.esp; + if (call_trace >= 0) { + int unw_ret = 0; + struct unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; + + if (regs) { + if (unwind_init_frame_info(&info, task, regs) == 0) + unw_ret = dump_trace_unwind(&info, &oad); + } else if (task == current) + unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); + else { + if (unwind_init_blocked(&info, task) == 0) + unw_ret = dump_trace_unwind(&info, &oad); + } + if (unw_ret > 0) { + if (call_trace == 1 && !arch_unw_user_mode(&info)) { + ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", + UNW_PC(&info)); + if (UNW_SP(&info) >= PAGE_OFFSET) { + ops->warning(data, "Leftover inexact backtrace:\n"); + stack = (void *)UNW_SP(&info); + if (!stack) + return; + ebp = UNW_FP(&info); + } else + ops->warning(data, "Full inexact backtrace again:\n"); + } else if (call_trace >= 1) + return; + else + ops->warning(data, "Full inexact backtrace again:\n"); + } else + ops->warning(data, "Inexact backtrace:\n"); + } + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task && task != current) + stack = (unsigned long *)task->thread.esp; } +#ifdef CONFIG_FRAME_POINTER + if (!ebp) { + if (task == current) { + /* Grab ebp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (ebp) : ); + } else { + /* ebp is the last reg pushed by switch_to */ + ebp = *(unsigned long *) task->thread.esp; + } + } +#endif + while (1) { struct thread_info *context; context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = print_context_stack(context, stack, ebp, log_lvl); + ebp = print_context_stack(context, stack, ebp, ops, data); + /* Should be after the line below, but somewhere + in early boot context comes out corrupted and we + can't reference it -AK */ + if (ops->stack(data, "IRQ") < 0) + break; stack = (unsigned long*)context->previous_esp; if (!stack) break; - printk("%s =======================\n", log_lvl); } } +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr) +{ + printk("%s [<%08lx>] ", (char *)data, addr); + print_symbol("%s\n", addr); +} + +static struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; -void show_trace(struct task_struct *task, unsigned long * stack) +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long * stack, char *log_lvl) { - show_trace_log_lvl(task, stack, ""); + dump_trace(task, regs, stack, &print_trace_ops, log_lvl); + printk("%s =======================\n", log_lvl); } -static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp, - char *log_lvl) +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long * stack) +{ + show_trace_log_lvl(task, regs, stack, ""); +} + +static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *esp, char *log_lvl) { unsigned long *stack; int i; @@ -225,13 +317,13 @@ static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp, printk("%08lx ", *stack++); } printk("\n%sCall Trace:\n", log_lvl); - show_trace_log_lvl(task, esp, log_lvl); + show_trace_log_lvl(task, regs, esp, log_lvl); } void show_stack(struct task_struct *task, unsigned long *esp) { printk(" "); - show_stack_log_lvl(task, esp, ""); + show_stack_log_lvl(task, NULL, esp, ""); } /* @@ -241,7 +333,7 @@ void dump_stack(void) { unsigned long stack; - show_trace(current, &stack); + show_trace(current, NULL, &stack); } EXPORT_SYMBOL(dump_stack); @@ -261,8 +353,9 @@ void show_registers(struct pt_regs *regs) ss = regs->xss & 0xffff; } print_modules(); - printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n" - "EFLAGS: %08lx (%s %.*s) \n", + printk(KERN_EMERG "CPU: %d\n" + KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n" + KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n", smp_processor_id(), 0xffff & regs->xcs, regs->eip, print_tainted(), regs->eflags, system_utsname.release, (int)strcspn(system_utsname.version, " "), @@ -283,16 +376,21 @@ void show_registers(struct pt_regs *regs) */ if (in_kernel) { u8 __user *eip; + int code_bytes = 64; + unsigned char c; printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG); + show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG); printk(KERN_EMERG "Code: "); eip = (u8 __user *)regs->eip - 43; - for (i = 0; i < 64; i++, eip++) { - unsigned char c; - + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { + /* try starting at EIP */ + eip = (u8 __user *)regs->eip; + code_bytes = 32; + } + for (i = 0; i < code_bytes; i++, eip++) { if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { printk(" Bad EIP value."); break; @@ -308,35 +406,36 @@ void show_registers(struct pt_regs *regs) static void handle_BUG(struct pt_regs *regs) { + unsigned long eip = regs->eip; unsigned short ud2; - unsigned short line; - char *file; - char c; - unsigned long eip; - - eip = regs->eip; if (eip < PAGE_OFFSET) - goto no_bug; - if (__get_user(ud2, (unsigned short __user *)eip)) - goto no_bug; + return; + if (probe_kernel_address((unsigned short __user *)eip, ud2)) + return; if (ud2 != 0x0b0f) - goto no_bug; - if (__get_user(line, (unsigned short __user *)(eip + 2))) - goto bug; - if (__get_user(file, (char * __user *)(eip + 4)) || - (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) - file = "<bad filename>"; + return; printk(KERN_EMERG "------------[ cut here ]------------\n"); - printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); -no_bug: - return; +#ifdef CONFIG_DEBUG_BUGVERBOSE + do { + unsigned short line; + char *file; + char c; + + if (probe_kernel_address((unsigned short __user *)(eip + 2), + line)) + break; + if (__get_user(file, (char * __user *)(eip + 4)) || + (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) + file = "<bad filename>"; - /* Here we know it was a BUG but file-n-line is unavailable */ -bug: - printk(KERN_EMERG "Kernel BUG\n"); + printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); + return; + } while (0); +#endif + printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n"); } /* This is gone through when something in the kernel @@ -426,11 +525,9 @@ void die(const char * str, struct pt_regs * regs, long err) if (in_interrupt()) panic("Fatal exception in interrupt"); - if (panic_on_oops) { - printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n"); - ssleep(5); + if (panic_on_oops) panic("Fatal exception"); - } + oops_exit(); do_exit(SIGSEGV); } @@ -601,18 +698,24 @@ gp_in_kernel: } } -static void mem_parity_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs * regs) { - printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying " - "to continue\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " + "CPU %d.\n", reason, smp_processor_id()); printk(KERN_EMERG "You probably have a hardware problem with your RAM " "chips\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. */ clear_mem_error(reason); } -static void io_check_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +io_check_error(unsigned char reason, struct pt_regs * regs) { unsigned long i; @@ -628,7 +731,8 @@ static void io_check_error(unsigned char reason, struct pt_regs * regs) outb(reason, 0x61); } -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +static __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs * regs) { #ifdef CONFIG_MCA /* Might actually be able to figure out what the guilty party @@ -638,15 +742,18 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) return; } #endif - printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " + "CPU %d.\n", reason, smp_processor_id()); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } static DEFINE_SPINLOCK(nmi_print_lock); -void die_nmi (struct pt_regs *regs, const char *msg) +void __kprobes die_nmi(struct pt_regs *regs, const char *msg) { if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP) @@ -678,7 +785,7 @@ void die_nmi (struct pt_regs *regs, const char *msg) do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static __kprobes void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -695,12 +802,12 @@ static void default_do_nmi(struct pt_regs * regs) * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog) { - nmi_watchdog_tick(regs); + if (nmi_watchdog_tick(regs, reason)) return; - } + if (!do_nmi_callback(regs, smp_processor_id())) #endif - unknown_nmi_error(reason, regs); + unknown_nmi_error(reason, regs); + return; } if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) @@ -716,14 +823,7 @@ static void default_do_nmi(struct pt_regs * regs) reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -733,25 +833,11 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) ++nmi_count(cpu); - if (!rcu_dereference(nmi_callback)(regs, cpu)) - default_do_nmi(regs); + default_do_nmi(regs); nmi_exit(); } -void set_nmi_callback(nmi_callback_t callback) -{ - vmalloc_sync_all(); - rcu_assign_pointer(nmi_callback, callback); -} -EXPORT_SYMBOL_GPL(set_nmi_callback); - -void unset_nmi_callback(void) -{ - nmi_callback = dummy_nmi_callback; -} -EXPORT_SYMBOL_GPL(unset_nmi_callback); - #ifdef CONFIG_KPROBES fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) { @@ -1091,20 +1177,6 @@ void __init trap_init_f00f_bug(void) } #endif -#define _set_gate(gate_addr,type,dpl,addr,seg) \ -do { \ - int __d0, __d1; \ - __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ - "movw %4,%%dx\n\t" \ - "movl %%eax,%0\n\t" \ - "movl %%edx,%1" \ - :"=m" (*((long *) (gate_addr))), \ - "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ - :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ - "3" ((char *) (addr)),"2" ((seg) << 16)); \ -} while (0) - - /* * This needs to use 'idt_table' rather than 'idt', and * thus use the _nonmapped_ version of the IDT, as the @@ -1113,7 +1185,7 @@ do { \ */ void set_intr_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); + _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS); } /* @@ -1121,22 +1193,22 @@ void set_intr_gate(unsigned int n, void *addr) */ static inline void set_system_intr_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS); + _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS); } static void __init set_trap_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); + _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS); } static void __init set_system_gate(unsigned int n, void *addr) { - _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); + _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS); } static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { - _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3)); } @@ -1215,3 +1287,19 @@ static int __init kstack_setup(char *s) return 1; } __setup("kstack=", kstack_setup); + +#ifdef CONFIG_STACK_UNWIND +static int __init call_trace_setup(char *s) +{ + if (strcmp(s, "old") == 0) + call_trace = -1; + else if (strcmp(s, "both") == 0) + call_trace = 0; + else if (strcmp(s, "newfallback") == 0) + call_trace = 1; + else if (strcmp(s, "new") == 2) + call_trace = 2; + return 1; +} +__setup("call_trace=", call_trace_setup); +#endif diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c new file mode 100644 index 000000000000..b8fa0a8b2e47 --- /dev/null +++ b/arch/i386/kernel/tsc.c @@ -0,0 +1,478 @@ +/* + * This code largely moved from arch/i386/kernel/timer/timer_tsc.c + * which was originally moved from arch/i386/kernel/time.c. + * See comments there for proper credits. + */ + +#include <linux/clocksource.h> +#include <linux/workqueue.h> +#include <linux/cpufreq.h> +#include <linux/jiffies.h> +#include <linux/init.h> +#include <linux/dmi.h> + +#include <asm/delay.h> +#include <asm/tsc.h> +#include <asm/delay.h> +#include <asm/io.h> + +#include "mach_timer.h" + +/* + * On some systems the TSC frequency does not + * change with the cpu frequency. So we need + * an extra value to store the TSC freq + */ +unsigned int tsc_khz; + +int tsc_disable __cpuinitdata = 0; + +#ifdef CONFIG_X86_TSC +static int __init tsc_setup(char *str) +{ + printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " + "cannot disable TSC.\n"); + return 1; +} +#else +/* + * disable flag for tsc. Takes effect by clearing the TSC cpu flag + * in cpu/common.c + */ +static int __init tsc_setup(char *str) +{ + tsc_disable = 1; + + return 1; +} +#endif + +__setup("notsc", tsc_setup); + +/* + * code to mark and check if the TSC is unstable + * due to cpufreq or due to unsynced TSCs + */ +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} + +void mark_tsc_unstable(void) +{ + tsc_unstable = 1; +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +/* Accellerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better percision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +static unsigned long cyc2ns_scale __read_mostly; + +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline void set_cyc2ns_scale(unsigned long cpu_khz) +{ + cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ + unsigned long long this_offset; + + /* + * in the NUMA case we dont use the TSC as they are not + * synchronized across all CPUs. + */ +#ifndef CONFIG_NUMA + if (!cpu_khz || check_tsc_unstable()) +#endif + /* no locking but a rare wrong value is not a big deal */ + return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); + + /* read the Time Stamp Counter: */ + rdtscll(this_offset); + + /* return the value in ns */ + return cycles_2_ns(this_offset); +} + +static unsigned long calculate_cpu_khz(void) +{ + unsigned long long start, end; + unsigned long count; + u64 delta64; + int i; + unsigned long flags; + + local_irq_save(flags); + + /* run 3 times to ensure the cache is warm */ + for (i = 0; i < 3; i++) { + mach_prepare_counter(); + rdtscll(start); + mach_countup(&count); + rdtscll(end); + } + /* + * Error: ECTCNEVERSET + * The CTC wasn't reliable: we got a hit on the very first read, + * or the CPU was so fast/slow that the quotient wouldn't fit in + * 32 bits.. + */ + if (count <= 1) + goto err; + + delta64 = end - start; + + /* cpu freq too fast: */ + if (delta64 > (1ULL<<32)) + goto err; + + /* cpu freq too slow: */ + if (delta64 <= CALIBRATE_TIME_MSEC) + goto err; + + delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */ + do_div(delta64,CALIBRATE_TIME_MSEC); + + local_irq_restore(flags); + return (unsigned long)delta64; +err: + local_irq_restore(flags); + return 0; +} + +int recalibrate_cpu_khz(void) +{ +#ifndef CONFIG_SMP + unsigned long cpu_khz_old = cpu_khz; + + if (cpu_has_tsc) { + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + cpu_data[0].loops_per_jiffy = + cpufreq_scale(cpu_data[0].loops_per_jiffy, + cpu_khz_old, cpu_khz); + return 0; + } else + return -ENODEV; +#else + return -ENODEV; +#endif +} + +EXPORT_SYMBOL(recalibrate_cpu_khz); + +void __init tsc_init(void) +{ + if (!cpu_has_tsc || tsc_disable) + return; + + cpu_khz = calculate_cpu_khz(); + tsc_khz = cpu_khz; + + if (!cpu_khz) + return; + + printk("Detected %lu.%03lu MHz processor.\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); + + set_cyc2ns_scale(cpu_khz); + use_tsc_delay(); +} + +#ifdef CONFIG_CPU_FREQ + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(void *v) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) + cpufreq_get(cpu); + + cpufreq_delayed_issched = 0; +} + +/* + * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries + * to verify the CPU frequency the timing core thinks the CPU is running + * at is still correct. + */ +static inline void cpufreq_delayed_get(void) +{ + if (cpufreq_init && !cpufreq_delayed_issched) { + cpufreq_delayed_issched = 1; + printk(KERN_DEBUG "Checking if CPU frequency changed.\n"); + schedule_work(&cpufreq_delayed_get_work); + } +} + +/* + * if the CPU frequency is scaled, TSC-based delays will need a different + * loops_per_jiffy value to function properly. + */ +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; +static unsigned long cpu_khz_ref = 0; + +static int +time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = data; + + if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) + write_seqlock_irq(&xtime_lock); + + if (!ref_freq) { + if (!freq->old){ + ref_freq = freq->new; + goto end; + } + ref_freq = freq->old; + loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; + cpu_khz_ref = cpu_khz; + } + + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + cpu_data[freq->cpu].loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, + ref_freq, freq->new); + + if (cpu_khz) { + + if (num_online_cpus() == 1) + cpu_khz = cpufreq_scale(cpu_khz_ref, + ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { + tsc_khz = cpu_khz; + set_cyc2ns_scale(cpu_khz); + /* + * TSC based sched_clock turns + * to junk w/ cpufreq + */ + mark_tsc_unstable(); + } + } + } +end: + if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE) + write_sequnlock_irq(&xtime_lock); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + int ret; + + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + if (!ret) + cpufreq_init = 1; + + return ret; +} + +core_initcall(cpufreq_tsc); + +#endif + +/* clock source code */ + +static unsigned long current_tsc_khz = 0; +static int tsc_update_callback(void); + +static cycle_t read_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = CLOCKSOURCE_MASK(64), + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + /* check to see if we should switch to the safe clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + clocksource_reselect(); + change = 1; + } + + /* only update if tsc_khz has changed: */ + if (current_tsc_khz != tsc_khz) { + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + change = 1; + } + + return change; +} + +static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", + d->ident); + mark_tsc_unstable(); + return 0; +} + +/* List of systems that have known TSC problems */ +static struct dmi_system_id __initdata bad_tsc_dmi_table[] = { + { + .callback = dmi_mark_tsc_unstable, + .ident = "IBM Thinkpad 380XD", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2635FA0"), + }, + }, + {} +}; + +#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */ +static struct timer_list verify_tsc_freq_timer; + +/* XXX - Probably should add locking */ +static void verify_tsc_freq(unsigned long unused) +{ + static u64 last_tsc; + static unsigned long last_jiffies; + + u64 now_tsc, interval_tsc; + unsigned long now_jiffies, interval_jiffies; + + + if (check_tsc_unstable()) + return; + + rdtscll(now_tsc); + now_jiffies = jiffies; + + if (!last_jiffies) { + goto out; + } + + interval_jiffies = now_jiffies - last_jiffies; + interval_tsc = now_tsc - last_tsc; + interval_tsc *= HZ; + do_div(interval_tsc, cpu_khz*1000); + + if (interval_tsc < (interval_jiffies * 3 / 4)) { + printk("TSC appears to be running slowly. " + "Marking it as unstable\n"); + mark_tsc_unstable(); + return; + } + +out: + last_tsc = now_tsc; + last_jiffies = now_jiffies; + /* set us up to go off on the next interval: */ + mod_timer(&verify_tsc_freq_timer, + jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL)); +} + +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs. + */ +static __init int unsynchronized_tsc(void) +{ + /* + * Intel systems are normally all synchronized. + * Exceptions must mark TSC as unstable: + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + return 0; + + /* assume multi socket systems are not synchronized: */ + return num_possible_cpus() > 1; +} + +static int __init init_tsc_clocksource(void) +{ + + if (cpu_has_tsc && tsc_khz && !tsc_disable) { + /* check blacklist */ + dmi_check_system(bad_tsc_dmi_table); + + if (unsynchronized_tsc()) /* mark unstable if unsynced */ + mark_tsc_unstable(); + current_tsc_khz = tsc_khz; + clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz, + clocksource_tsc.shift); + /* lower the rating if we already know its unstable: */ + if (check_tsc_unstable()) + clocksource_tsc.rating = 50; + + init_timer(&verify_tsc_freq_timer); + verify_tsc_freq_timer.function = verify_tsc_freq; + verify_tsc_freq_timer.expires = + jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL); + add_timer(&verify_tsc_freq_timer); + + return clocksource_register(&clocksource_tsc); + } + + return 0; +} + +module_init(init_tsc_clocksource); diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index 00e0118e717c..8355d8d87d18 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -31,7 +31,6 @@ */ #include <linux/capability.h> -#include <linux/config.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/sched.h> diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 8831303a473f..1e7ac1c44ddc 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) ENTRY(phys_startup_32) jiffies = jiffies_64; + +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(4); /* R__ */ +} SECTIONS { . = __KERNEL_START; @@ -26,7 +32,7 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } = 0x9090 + } :text = 0x9090 _etext = .; /* End of text section */ @@ -37,11 +43,18 @@ SECTIONS RODATA + . = ALIGN(4); + __tracedata_start = .; + .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { + *(.tracedata) + } + __tracedata_end = .; + /* writeable */ .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ *(.data) CONSTRUCTORS - } + } :data . = ALIGN(4096); __nosave_begin = .; @@ -64,6 +77,15 @@ SECTIONS .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } _edata = .; /* End of data section */ +#ifdef CONFIG_STACK_UNWIND + . = ALIGN(4); + .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { + __start_unwind = .; + *(.eh_frame) + __end_unwind = .; + } +#endif + . = ALIGN(THREAD_SIZE); /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { *(.data.init_task) @@ -168,4 +190,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + NOTES } diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S index 3b62baa6a371..1a36d26e15eb 100644 --- a/arch/i386/kernel/vsyscall-sysenter.S +++ b/arch/i386/kernel/vsyscall-sysenter.S @@ -42,10 +42,10 @@ __kernel_vsyscall: /* 7: align return point with nop's to make disassembly easier */ .space 7,0x90 - /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */ + /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ jmp .Lenter_kernel /* 16: System call normal return point is here! */ - .globl SYSENTER_RETURN /* Symbol used by entry.S. */ + .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ SYSENTER_RETURN: pop %ebp .Lpop_ebp: diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S index 98699ca6e52d..f66cd11adb72 100644 --- a/arch/i386/kernel/vsyscall.lds.S +++ b/arch/i386/kernel/vsyscall.lds.S @@ -7,9 +7,10 @@ SECTIONS { - . = VSYSCALL_BASE + SIZEOF_HEADERS; + . = VDSO_PRELINK + SIZEOF_HEADERS; .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } @@ -20,7 +21,7 @@ SECTIONS For the layouts to match, we need to skip more than enough space for the dynamic symbol table et al. If this amount is insufficient, ld -shared will barf. Just increase it here. */ - . = VSYSCALL_BASE + 0x400; + . = VDSO_PRELINK + 0x400; .text : { *(.text) } :text =0x90909090 .note : { *(.note.*) } :text :note |