summaryrefslogtreecommitdiffstats
path: root/arch/i386/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--arch/i386/kernel/Makefile15
-rw-r--r--arch/i386/kernel/acpi/Makefile2
-rw-r--r--arch/i386/kernel/acpi/boot.c184
-rw-r--r--arch/i386/kernel/acpi/earlyquirk.c6
-rw-r--r--arch/i386/kernel/acpi/wakeup.S5
-rw-r--r--arch/i386/kernel/alternative.c134
-rw-r--r--arch/i386/kernel/apic.c48
-rw-r--r--arch/i386/kernel/apm.c69
-rw-r--r--arch/i386/kernel/asm-offsets.c7
-rw-r--r--arch/i386/kernel/bootflag.c1
-rw-r--r--arch/i386/kernel/cpu/amd.c29
-rw-r--r--arch/i386/kernel/cpu/centaur.c24
-rw-r--r--arch/i386/kernel/cpu/common.c33
-rw-r--r--arch/i386/kernel/cpu/cpu.h2
-rw-r--r--arch/i386/kernel/cpu/cpufreq/Kconfig3
-rw-r--r--arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c62
-rw-r--r--arch/i386/kernel/cpu/cpufreq/longhaul.c361
-rw-r--r--arch/i386/kernel/cpu/cpufreq/longhaul.h48
-rw-r--r--arch/i386/kernel/cpu/cpufreq/p4-clockmod.c1
-rw-r--r--arch/i386/kernel/cpu/cpufreq/powernow-k7.c1
-rw-r--r--arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c55
-rw-r--r--arch/i386/kernel/cpu/cyrix.c44
-rw-r--r--arch/i386/kernel/cpu/intel.c10
-rw-r--r--arch/i386/kernel/cpu/intel_cacheinfo.c125
-rw-r--r--arch/i386/kernel/cpu/mcheck/Makefile2
-rw-r--r--arch/i386/kernel/cpu/mcheck/k7.c1
-rw-r--r--arch/i386/kernel/cpu/mcheck/mce.c1
-rw-r--r--arch/i386/kernel/cpu/mcheck/mce.h2
-rw-r--r--arch/i386/kernel/cpu/mcheck/non-fatal.c1
-rw-r--r--arch/i386/kernel/cpu/mcheck/p4.c27
-rw-r--r--arch/i386/kernel/cpu/mcheck/therm_throt.c180
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c4
-rw-r--r--arch/i386/kernel/cpu/nexgen.c9
-rw-r--r--arch/i386/kernel/cpu/proc.c12
-rw-r--r--arch/i386/kernel/cpu/rise.c4
-rw-r--r--arch/i386/kernel/cpu/transmeta.c7
-rw-r--r--arch/i386/kernel/cpu/umc.c7
-rw-r--r--arch/i386/kernel/cpuid.c9
-rw-r--r--arch/i386/kernel/crash.c36
-rw-r--r--arch/i386/kernel/doublefault.c3
-rw-r--r--arch/i386/kernel/efi.c11
-rw-r--r--arch/i386/kernel/efi_stub.S2
-rw-r--r--arch/i386/kernel/entry.S424
-rw-r--r--arch/i386/kernel/head.S82
-rw-r--r--arch/i386/kernel/hpet.c67
-rw-r--r--arch/i386/kernel/i386_ksyms.c1
-rw-r--r--arch/i386/kernel/i387.c1
-rw-r--r--arch/i386/kernel/i8253.c118
-rw-r--r--arch/i386/kernel/i8259.c15
-rw-r--r--arch/i386/kernel/io_apic.c196
-rw-r--r--arch/i386/kernel/ioport.c1
-rw-r--r--arch/i386/kernel/irq.c34
-rw-r--r--arch/i386/kernel/kprobes.c97
-rw-r--r--arch/i386/kernel/machine_kexec.c155
-rw-r--r--arch/i386/kernel/mca.c9
-rw-r--r--arch/i386/kernel/microcode.c775
-rw-r--r--arch/i386/kernel/mpparse.c71
-rw-r--r--arch/i386/kernel/msr.c12
-rw-r--r--arch/i386/kernel/nmi.c948
-rw-r--r--arch/i386/kernel/numaq.c11
-rw-r--r--arch/i386/kernel/process.c73
-rw-r--r--arch/i386/kernel/ptrace.c15
-rw-r--r--arch/i386/kernel/quirks.c1
-rw-r--r--arch/i386/kernel/reboot.c13
-rw-r--r--arch/i386/kernel/relocate_kernel.S162
-rw-r--r--arch/i386/kernel/scx200.c67
-rw-r--r--arch/i386/kernel/semaphore.c135
-rw-r--r--arch/i386/kernel/setup.c550
-rw-r--r--arch/i386/kernel/signal.c4
-rw-r--r--arch/i386/kernel/smp.c78
-rw-r--r--arch/i386/kernel/smpboot.c139
-rw-r--r--arch/i386/kernel/srat.c110
-rw-r--r--arch/i386/kernel/syscall_table.S1
-rw-r--r--arch/i386/kernel/sysenter.c126
-rw-r--r--arch/i386/kernel/time.c234
-rw-r--r--arch/i386/kernel/time_hpet.c38
-rw-r--r--arch/i386/kernel/timers/Makefile9
-rw-r--r--arch/i386/kernel/timers/common.c172
-rw-r--r--arch/i386/kernel/timers/timer.c75
-rw-r--r--arch/i386/kernel/timers/timer_cyclone.c259
-rw-r--r--arch/i386/kernel/timers/timer_hpet.c217
-rw-r--r--arch/i386/kernel/timers/timer_none.c39
-rw-r--r--arch/i386/kernel/timers/timer_pit.c177
-rw-r--r--arch/i386/kernel/timers/timer_pm.c342
-rw-r--r--arch/i386/kernel/timers/timer_tsc.c617
-rw-r--r--arch/i386/kernel/topology.c49
-rw-r--r--arch/i386/kernel/traps.c374
-rw-r--r--arch/i386/kernel/tsc.c478
-rw-r--r--arch/i386/kernel/vm86.c1
-rw-r--r--arch/i386/kernel/vmlinux.lds.S28
-rw-r--r--arch/i386/kernel/vsyscall-sysenter.S4
-rw-r--r--arch/i386/kernel/vsyscall.lds.S5
92 files changed, 4843 insertions, 4353 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 96fb8a020af2..1a884b6e6e5c 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -4,13 +4,13 @@
extra-y := head.o init_task.o vmlinux.lds
-obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \
+obj-y := process.o signal.o entry.o traps.o irq.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o bootflag.o \
- quirks.o i8237.o topology.o alternative.o
+ quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
+obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += cpu/
-obj-y += timers/
obj-y += acpi/
obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
obj-$(CONFIG_MCA) += mca.o
@@ -37,6 +37,8 @@ obj-$(CONFIG_EFI) += efi.o efi_stub.o
obj-$(CONFIG_DOUBLEFAULT) += doublefault.o
obj-$(CONFIG_VM86) += vm86.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_HPET_TIMER) += hpet.o
+obj-$(CONFIG_K8_NB) += k8.o
EXTRA_AFLAGS := -traditional
@@ -56,7 +58,8 @@ quiet_cmd_syscall = SYSCALL $@
export CPPFLAGS_vsyscall.lds += -P -C -U$(ARCH)
-vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1
+vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \
+ $(call ld-option, -Wl$(comma)--hash-style=sysv)
SYSCFLAGS_vsyscall-sysenter.so = $(vsyscall-flags)
SYSCFLAGS_vsyscall-int80.so = $(vsyscall-flags)
@@ -76,3 +79,7 @@ SYSCFLAGS_vsyscall-syms.o = -r
$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
$(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
$(call if_changed,syscall)
+
+k8-y += ../../x86_64/kernel/k8.o
+stacktrace-y += ../../x86_64/kernel/stacktrace.o
+
diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile
index 7e9ac99354f4..7f7be01f44e6 100644
--- a/arch/i386/kernel/acpi/Makefile
+++ b/arch/i386/kernel/acpi/Makefile
@@ -1,5 +1,7 @@
obj-$(CONFIG_ACPI) += boot.o
+ifneq ($(CONFIG_PCI),)
obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o
+endif
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index 97ca17189af5..1aaea6ab8c46 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -24,12 +24,14 @@
*/
#include <linux/init.h>
-#include <linux/config.h>
#include <linux/acpi.h>
#include <linux/efi.h>
+#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/dmi.h>
#include <linux/irq.h>
+#include <linux/bootmem.h>
+#include <linux/ioport.h>
#include <asm/pgtable.h>
#include <asm/io_apic.h>
@@ -37,11 +39,17 @@
#include <asm/io.h>
#include <asm/mpspec.h>
-#ifdef CONFIG_X86_64
+static int __initdata acpi_force = 0;
-extern void __init clustered_apic_check(void);
+#ifdef CONFIG_ACPI
+int acpi_disabled = 0;
+#else
+int acpi_disabled = 1;
+#endif
+EXPORT_SYMBOL(acpi_disabled);
+
+#ifdef CONFIG_X86_64
-extern int gsi_irq_sharing(int gsi);
#include <asm/proto.h>
static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
@@ -60,7 +68,7 @@ static inline int gsi_irq_sharing(int gsi) { return gsi; }
#define BAD_MADT_ENTRY(entry, end) ( \
(!entry) || (unsigned long)entry + sizeof(*entry) > end || \
- ((acpi_table_entry_header *)entry)->length != sizeof(*entry))
+ ((acpi_table_entry_header *)entry)->length < sizeof(*entry))
#define PREFIX "ACPI: "
@@ -507,16 +515,76 @@ EXPORT_SYMBOL(acpi_register_gsi);
#ifdef CONFIG_ACPI_HOTPLUG_CPU
int acpi_map_lsapic(acpi_handle handle, int *pcpu)
{
- /* TBD */
- return -EINVAL;
+ struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+ union acpi_object *obj;
+ struct acpi_table_lapic *lapic;
+ cpumask_t tmp_map, new_map;
+ u8 physid;
+ int cpu;
+
+ if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer)))
+ return -EINVAL;
+
+ if (!buffer.length || !buffer.pointer)
+ return -EINVAL;
+
+ obj = buffer.pointer;
+ if (obj->type != ACPI_TYPE_BUFFER ||
+ obj->buffer.length < sizeof(*lapic)) {
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ lapic = (struct acpi_table_lapic *)obj->buffer.pointer;
+
+ if ((lapic->header.type != ACPI_MADT_LAPIC) ||
+ (!lapic->flags.enabled)) {
+ kfree(buffer.pointer);
+ return -EINVAL;
+ }
+
+ physid = lapic->id;
+
+ kfree(buffer.pointer);
+ buffer.length = ACPI_ALLOCATE_BUFFER;
+ buffer.pointer = NULL;
+
+ tmp_map = cpu_present_map;
+ mp_register_lapic(physid, lapic->flags.enabled);
+
+ /*
+ * If mp_register_lapic successfully generates a new logical cpu
+ * number, then the following will get us exactly what was mapped
+ */
+ cpus_andnot(new_map, cpu_present_map, tmp_map);
+ if (cpus_empty(new_map)) {
+ printk ("Unable to map lapic to logical cpu number\n");
+ return -EINVAL;
+ }
+
+ cpu = first_cpu(new_map);
+
+ *pcpu = cpu;
+ return 0;
}
EXPORT_SYMBOL(acpi_map_lsapic);
int acpi_unmap_lsapic(int cpu)
{
- /* TBD */
- return -EINVAL;
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (x86_acpiid_to_apicid[i] == x86_cpu_to_apicid[cpu]) {
+ x86_acpiid_to_apicid[i] = -1;
+ break;
+ }
+ }
+ x86_cpu_to_apicid[cpu] = -1;
+ cpu_clear(cpu, cpu_present_map);
+ num_processors--;
+
+ return (0);
}
EXPORT_SYMBOL(acpi_unmap_lsapic);
@@ -580,6 +648,8 @@ static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
{
struct acpi_table_hpet *hpet_tbl;
+ struct resource *hpet_res;
+ resource_size_t res_start;
if (!phys || !size)
return -EINVAL;
@@ -595,12 +665,26 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
"memory.\n");
return -1;
}
+
+#define HPET_RESOURCE_NAME_SIZE 9
+ hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
+ if (hpet_res) {
+ memset(hpet_res, 0, sizeof(*hpet_res));
+ hpet_res->name = (void *)&hpet_res[1];
+ hpet_res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE,
+ "HPET %u", hpet_tbl->number);
+ hpet_res->end = (1 * 1024) - 1;
+ }
+
#ifdef CONFIG_X86_64
vxtime.hpet_address = hpet_tbl->addr.addrl |
((long)hpet_tbl->addr.addrh << 32);
printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
hpet_tbl->id, vxtime.hpet_address);
+
+ res_start = vxtime.hpet_address;
#else /* X86 */
{
extern unsigned long hpet_address;
@@ -608,9 +692,17 @@ static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
hpet_address = hpet_tbl->addr.addrl;
printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
hpet_tbl->id, hpet_address);
+
+ res_start = hpet_address;
}
#endif /* X86 */
+ if (hpet_res) {
+ hpet_res->start = res_start;
+ hpet_res->end += res_start;
+ insert_resource(&iomem_resource, hpet_res);
+ }
+
return 0;
}
#else
@@ -861,8 +953,6 @@ static void __init acpi_process_madt(void)
return;
}
-extern int acpi_force;
-
#ifdef __i386__
static int __init disable_acpi_irq(struct dmi_system_id *d)
@@ -1164,3 +1254,75 @@ int __init acpi_boot_init(void)
return 0;
}
+
+static int __init parse_acpi(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ /* "acpi=off" disables both ACPI table parsing and interpreter */
+ if (strcmp(arg, "off") == 0) {
+ disable_acpi();
+ }
+ /* acpi=force to over-ride black-list */
+ else if (strcmp(arg, "force") == 0) {
+ acpi_force = 1;
+ acpi_ht = 1;
+ acpi_disabled = 0;
+ }
+ /* acpi=strict disables out-of-spec workarounds */
+ else if (strcmp(arg, "strict") == 0) {
+ acpi_strict = 1;
+ }
+ /* Limit ACPI just to boot-time to enable HT */
+ else if (strcmp(arg, "ht") == 0) {
+ if (!acpi_force)
+ disable_acpi();
+ acpi_ht = 1;
+ }
+ /* "acpi=noirq" disables ACPI interrupt routing */
+ else if (strcmp(arg, "noirq") == 0) {
+ acpi_noirq_set();
+ } else {
+ /* Core will printk when we return error. */
+ return -EINVAL;
+ }
+ return 0;
+}
+early_param("acpi", parse_acpi);
+
+/* FIXME: Using pci= for an ACPI parameter is a travesty. */
+static int __init parse_pci(char *arg)
+{
+ if (arg && strcmp(arg, "noacpi") == 0)
+ acpi_disable_pci();
+ return 0;
+}
+early_param("pci", parse_pci);
+
+#ifdef CONFIG_X86_IO_APIC
+static int __init parse_acpi_skip_timer_override(char *arg)
+{
+ acpi_skip_timer_override = 1;
+ return 0;
+}
+early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
+#endif /* CONFIG_X86_IO_APIC */
+
+static int __init setup_acpi_sci(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "edge"))
+ acpi_sci_flags.trigger = 1;
+ else if (!strcmp(s, "level"))
+ acpi_sci_flags.trigger = 3;
+ else if (!strcmp(s, "high"))
+ acpi_sci_flags.polarity = 1;
+ else if (!strcmp(s, "low"))
+ acpi_sci_flags.polarity = 3;
+ else
+ return -EINVAL;
+ return 0;
+}
+early_param("acpi_sci", setup_acpi_sci);
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index 1649a175a206..fe799b11ac0a 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -48,7 +48,11 @@ void __init check_acpi_pci(void)
int num, slot, func;
/* Assume the machine supports type 1. If not it will
- always read ffffffff and should not have any side effect. */
+ always read ffffffff and should not have any side effect.
+ Actually a few buggy systems can machine check. Allow the user
+ to disable it by command line option at least -AK */
+ if (!early_pci_allowed())
+ return;
/* Poor man's PCI discovery */
for (num = 0; num < 32; num++) {
diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S
index 9f408eee4e6f..b781b38131c0 100644
--- a/arch/i386/kernel/acpi/wakeup.S
+++ b/arch/i386/kernel/acpi/wakeup.S
@@ -292,7 +292,10 @@ ENTRY(do_suspend_lowlevel)
pushl $3
call acpi_enter_sleep_state
addl $4, %esp
- ret
+
+# In case of S3 failure, we'll emerge here. Jump
+# to ret_point to recover
+ jmp ret_point
.p2align 4,,7
ret_point:
call restore_registers
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 5cbd6f99fb2a..28ab80649764 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -4,27 +4,41 @@
#include <asm/alternative.h>
#include <asm/sections.h>
-#define DEBUG 0
-#if DEBUG
-# define DPRINTK(fmt, args...) printk(fmt, args)
-#else
-# define DPRINTK(fmt, args...)
-#endif
+static int no_replacement = 0;
+static int smp_alt_once = 0;
+static int debug_alternative = 0;
+
+static int __init noreplacement_setup(char *s)
+{
+ no_replacement = 1;
+ return 1;
+}
+static int __init bootonly(char *str)
+{
+ smp_alt_once = 1;
+ return 1;
+}
+static int __init debug_alt(char *str)
+{
+ debug_alternative = 1;
+ return 1;
+}
+
+__setup("noreplacement", noreplacement_setup);
+__setup("smp-alt-boot", bootonly);
+__setup("debug-alternative", debug_alt);
+
+#define DPRINTK(fmt, args...) if (debug_alternative) \
+ printk(KERN_DEBUG fmt, args)
+#ifdef GENERIC_NOP1
/* Use inline assembly to define this because the nops are defined
as inline assembly strings in the include files and we cannot
get them easily into strings. */
asm("\t.data\nintelnops: "
GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
GENERIC_NOP7 GENERIC_NOP8);
-asm("\t.data\nk8nops: "
- K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
- K8_NOP7 K8_NOP8);
-asm("\t.data\nk7nops: "
- K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
- K7_NOP7 K7_NOP8);
-
-extern unsigned char intelnops[], k8nops[], k7nops[];
+extern unsigned char intelnops[];
static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
NULL,
intelnops,
@@ -36,6 +50,13 @@ static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
intelnops + 1 + 2 + 3 + 4 + 5 + 6,
intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
+#endif
+
+#ifdef K8_NOP1
+asm("\t.data\nk8nops: "
+ K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
+ K8_NOP7 K8_NOP8);
+extern unsigned char k8nops[];
static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
NULL,
k8nops,
@@ -47,6 +68,13 @@ static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
k8nops + 1 + 2 + 3 + 4 + 5 + 6,
k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
+#endif
+
+#ifdef K7_NOP1
+asm("\t.data\nk7nops: "
+ K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
+ K7_NOP7 K7_NOP8);
+extern unsigned char k7nops[];
static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
NULL,
k7nops,
@@ -58,6 +86,18 @@ static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
k7nops + 1 + 2 + 3 + 4 + 5 + 6,
k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
+#endif
+
+#ifdef CONFIG_X86_64
+
+extern char __vsyscall_0;
+static inline unsigned char** find_nop_table(void)
+{
+ return k8_nops;
+}
+
+#else /* CONFIG_X86_64 */
+
static struct nop {
int cpuid;
unsigned char **noptable;
@@ -67,14 +107,6 @@ static struct nop {
{ -1, NULL }
};
-
-extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
-
-extern u8 __smp_alt_begin[], __smp_alt_end[];
-
-
static unsigned char** find_nop_table(void)
{
unsigned char **noptable = intel_nops;
@@ -89,6 +121,14 @@ static unsigned char** find_nop_table(void)
return noptable;
}
+#endif /* CONFIG_X86_64 */
+
+extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
+extern u8 *__smp_locks[], *__smp_locks_end[];
+
+extern u8 __smp_alt_begin[], __smp_alt_end[];
+
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
self modifying code. This implies that assymetric systems where
@@ -99,6 +139,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
{
unsigned char **noptable = find_nop_table();
struct alt_instr *a;
+ u8 *instr;
int diff, i, k;
DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
@@ -106,7 +147,16 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
BUG_ON(a->replacementlen > a->instrlen);
if (!boot_cpu_has(a->cpuid))
continue;
- memcpy(a->instr, a->replacement, a->replacementlen);
+ instr = a->instr;
+#ifdef CONFIG_X86_64
+ /* vsyscall code is not mapped yet. resolve it manually. */
+ if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
+ instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
+ DPRINTK("%s: vsyscall fixup: %p => %p\n",
+ __FUNCTION__, a->instr, instr);
+ }
+#endif
+ memcpy(instr, a->replacement, a->replacementlen);
diff = a->instrlen - a->replacementlen;
/* Pad the rest with nops */
for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
@@ -118,6 +168,8 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
}
}
+#ifdef CONFIG_SMP
+
static void alternatives_smp_save(struct alt_instr *start, struct alt_instr *end)
{
struct alt_instr *a;
@@ -186,14 +238,6 @@ struct smp_alt_module {
static LIST_HEAD(smp_alt_modules);
static DEFINE_SPINLOCK(smp_alt);
-static int smp_alt_once = 0;
-static int __init bootonly(char *str)
-{
- smp_alt_once = 1;
- return 1;
-}
-__setup("smp-alt-boot", bootonly);
-
void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
void *text, void *text_end)
@@ -201,6 +245,9 @@ void alternatives_smp_module_add(struct module *mod, char *name,
struct smp_alt_module *smp;
unsigned long flags;
+ if (no_replacement)
+ return;
+
if (smp_alt_once) {
if (boot_cpu_has(X86_FEATURE_UP))
alternatives_smp_unlock(locks, locks_end,
@@ -235,7 +282,7 @@ void alternatives_smp_module_del(struct module *mod)
struct smp_alt_module *item;
unsigned long flags;
- if (smp_alt_once)
+ if (no_replacement || smp_alt_once)
return;
spin_lock_irqsave(&smp_alt, flags);
@@ -256,7 +303,17 @@ void alternatives_smp_switch(int smp)
struct smp_alt_module *mod;
unsigned long flags;
- if (smp_alt_once)
+#ifdef CONFIG_LOCKDEP
+ /*
+ * A not yet fixed binutils section handling bug prevents
+ * alternatives-replacement from working reliably, so turn
+ * it off:
+ */
+ printk("lockdep: not fixing up alternatives.\n");
+ return;
+#endif
+
+ if (no_replacement || smp_alt_once)
return;
BUG_ON(!smp && (num_online_cpus() > 1));
@@ -283,8 +340,17 @@ void alternatives_smp_switch(int smp)
spin_unlock_irqrestore(&smp_alt, flags);
}
+#endif
+
void __init alternative_instructions(void)
{
+ if (no_replacement) {
+ printk(KERN_INFO "(SMP-)alternatives turned off\n");
+ free_init_pages("SMP alternatives",
+ (unsigned long)__smp_alt_begin,
+ (unsigned long)__smp_alt_end);
+ return;
+ }
apply_alternatives(__alt_instructions, __alt_instructions_end);
/* switch to patch-once-at-boottime-only mode and free the
@@ -297,6 +363,7 @@ void __init alternative_instructions(void)
smp_alt_once = 1;
#endif
+#ifdef CONFIG_SMP
if (smp_alt_once) {
if (1 == num_possible_cpus()) {
printk(KERN_INFO "SMP alternatives: switching to UP code\n");
@@ -318,4 +385,5 @@ void __init alternative_instructions(void)
_text, _etext);
alternatives_smp_switch(0);
}
+#endif
}
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 5ab59c12335b..90faae5c5d30 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -14,7 +14,6 @@
* Mikael Pettersson : PM converted to driver model.
*/
-#include <linux/config.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -36,6 +35,7 @@
#include <asm/arch_hooks.h>
#include <asm/hpet.h>
#include <asm/i8253.h>
+#include <asm/nmi.h>
#include <mach_apic.h>
#include <mach_apicdef.h>
@@ -52,7 +52,18 @@ static cpumask_t timer_bcast_ipi;
/*
* Knob to control our willingness to enable the local APIC.
*/
-int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
+static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
+
+static inline void lapic_disable(void)
+{
+ enable_local_apic = -1;
+ clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
+}
+
+static inline void lapic_enable(void)
+{
+ enable_local_apic = 1;
+}
/*
* Debug level
@@ -156,7 +167,7 @@ void clear_local_APIC(void)
maxlvt = get_maxlvt();
/*
- * Masking an LVT entry on a P6 can trigger a local APIC error
+ * Masking an LVT entry can trigger a local APIC error
* if the vector is zero. Mask LVTERR first to prevent this.
*/
if (maxlvt >= 3) {
@@ -586,8 +597,7 @@ void __devinit setup_local_APIC(void)
printk("No ESR for 82489DX.\n");
}
- if (nmi_watchdog == NMI_LOCAL_APIC)
- setup_apic_nmi_watchdog();
+ setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
}
@@ -1117,7 +1127,18 @@ void disable_APIC_timer(void)
unsigned long v;
v = apic_read(APIC_LVTT);
- apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+ /*
+ * When an illegal vector value (0-15) is written to an LVT
+ * entry and delivery mode is Fixed, the APIC may signal an
+ * illegal vector error, with out regard to whether the mask
+ * bit is set or whether an interrupt is actually seen on input.
+ *
+ * Boot sequence might call this function when the LVTT has
+ * '0' vector value. So make sure vector field is set to
+ * valid value.
+ */
+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write_around(APIC_LVTT, v);
}
}
@@ -1362,3 +1383,18 @@ int __init APIC_init_uniprocessor (void)
return 0;
}
+
+static int __init parse_lapic(char *arg)
+{
+ lapic_enable();
+ return 0;
+}
+early_param("lapic", parse_lapic);
+
+static int __init parse_nolapic(char *arg)
+{
+ lapic_disable();
+ return 0;
+}
+early_param("nolapic", parse_nolapic);
+
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 9e819eb68229..b42f2d914af3 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -201,7 +201,6 @@
* http://www.microsoft.com/hwdev/busbios/amp_12.htm]
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/poll.h>
@@ -226,6 +225,7 @@
#include <linux/smp_lock.h>
#include <linux/dmi.h>
#include <linux/suspend.h>
+#include <linux/kthread.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -403,8 +403,6 @@ static int realmode_power_off = 1;
#else
static int realmode_power_off;
#endif
-static int exit_kapmd __read_mostly;
-static int kapmd_running __read_mostly;
#ifdef CONFIG_APM_ALLOW_INTS
static int allow_ints = 1;
#else
@@ -420,6 +418,8 @@ static const struct desc_struct bad_bios_desc = { 0, 0x00409200 };
static const char driver_version[] = "1.16ac"; /* no spaces */
+static struct task_struct *kapmd_task;
+
/*
* APM event names taken from the APM 1.2 specification. These are
* the message codes that the BIOS uses to tell us about events
@@ -764,9 +764,9 @@ static int apm_do_idle(void)
int idled = 0;
int polling;
- polling = test_thread_flag(TIF_POLLING_NRFLAG);
+ polling = !!(current_thread_info()->status & TS_POLLING);
if (polling) {
- clear_thread_flag(TIF_POLLING_NRFLAG);
+ current_thread_info()->status &= ~TS_POLLING;
smp_mb__after_clear_bit();
}
if (!need_resched()) {
@@ -774,7 +774,7 @@ static int apm_do_idle(void)
ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
}
if (polling)
- set_thread_flag(TIF_POLLING_NRFLAG);
+ current_thread_info()->status |= TS_POLLING;
if (!idled)
return 0;
@@ -1155,9 +1155,11 @@ out:
static void set_time(void)
{
+ struct timespec ts;
if (got_clock_diff) { /* Must know time zone in order to set clock */
- xtime.tv_sec = get_cmos_time() + clock_cmos_diff;
- xtime.tv_nsec = 0;
+ ts.tv_sec = get_cmos_time() + clock_cmos_diff;
+ ts.tv_nsec = 0;
+ do_settimeofday(&ts);
}
}
@@ -1233,13 +1235,8 @@ static int suspend(int vetoable)
restore_processor_state();
local_irq_disable();
- write_seqlock(&xtime_lock);
- spin_lock(&i8253_lock);
- reinit_timer();
set_time();
-
- spin_unlock(&i8253_lock);
- write_sequnlock(&xtime_lock);
+ reinit_timer();
if (err == APM_NO_ERROR)
err = APM_SUCCESS;
@@ -1366,9 +1363,7 @@ static void check_events(void)
ignore_bounce = 1;
if ((event != APM_NORMAL_RESUME)
|| (ignore_normal_resume == 0)) {
- write_seqlock_irq(&xtime_lock);
set_time();
- write_sequnlock_irq(&xtime_lock);
device_resume();
pm_send_all(PM_RESUME, (void *)0);
queue_event(event, NULL);
@@ -1384,9 +1379,7 @@ static void check_events(void)
break;
case APM_UPDATE_TIME:
- write_seqlock_irq(&xtime_lock);
set_time();
- write_sequnlock_irq(&xtime_lock);
break;
case APM_CRITICAL_SUSPEND:
@@ -1431,7 +1424,7 @@ static void apm_mainloop(void)
set_current_state(TASK_INTERRUPTIBLE);
for (;;) {
schedule_timeout(APM_CHECK_TIMEOUT);
- if (exit_kapmd)
+ if (kthread_should_stop())
break;
/*
* Ok, check all events, check for idle (and mark us sleeping
@@ -1714,12 +1707,6 @@ static int apm(void *unused)
char * power_stat;
char * bat_stat;
- kapmd_running = 1;
-
- daemonize("kapmd");
-
- current->flags |= PF_NOFREEZE;
-
#ifdef CONFIG_SMP
/* 2002/08/01 - WT
* This is to avoid random crashes at boot time during initialization
@@ -1829,7 +1816,6 @@ static int apm(void *unused)
console_blank_hook = NULL;
#endif
}
- kapmd_running = 0;
return 0;
}
@@ -2228,7 +2214,7 @@ static int __init apm_init(void)
{
struct proc_dir_entry *apm_proc;
struct desc_struct *gdt;
- int ret;
+ int err;
dmi_check_system(apm_dmi_table);
@@ -2337,11 +2323,17 @@ static int __init apm_init(void)
if (apm_proc)
apm_proc->owner = THIS_MODULE;
- ret = kernel_thread(apm, NULL, CLONE_KERNEL | SIGCHLD);
- if (ret < 0) {
- printk(KERN_ERR "apm: disabled - Unable to start kernel thread.\n");
- return -ENOMEM;
+ kapmd_task = kthread_create(apm, NULL, "kapmd");
+ if (IS_ERR(kapmd_task)) {
+ printk(KERN_ERR "apm: disabled - Unable to start kernel "
+ "thread.\n");
+ err = PTR_ERR(kapmd_task);
+ kapmd_task = NULL;
+ remove_proc_entry("apm", NULL);
+ return err;
}
+ kapmd_task->flags |= PF_NOFREEZE;
+ wake_up_process(kapmd_task);
if (num_online_cpus() > 1 && !smp ) {
printk(KERN_NOTICE
@@ -2349,7 +2341,13 @@ static int __init apm_init(void)
return 0;
}
- misc_register(&apm_device);
+ /*
+ * Note we don't actually care if the misc_device cannot be registered.
+ * this driver can do its job without it, even if userspace can't
+ * control it. just log the error
+ */
+ if (misc_register(&apm_device))
+ printk(KERN_WARNING "apm: Could not register misc device.\n");
if (HZ != 100)
idle_period = (idle_period * HZ) / 100;
@@ -2385,9 +2383,10 @@ static void __exit apm_exit(void)
remove_proc_entry("apm", NULL);
if (power_off)
pm_power_off = NULL;
- exit_kapmd = 1;
- while (kapmd_running)
- schedule();
+ if (kapmd_task) {
+ kthread_stop(kapmd_task);
+ kapmd_task = NULL;
+ }
#ifdef CONFIG_PM_LEGACY
pm_active = 0;
#endif
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 36d66e2077d0..c80271f8f084 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -4,6 +4,7 @@
* to extract and format the required data.
*/
+#include <linux/crypto.h>
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/personality.h>
@@ -13,6 +14,7 @@
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/thread_info.h>
+#include <asm/elf.h>
#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -53,6 +55,7 @@ void foo(void)
OFFSET(TI_preempt_count, thread_info, preempt_count);
OFFSET(TI_addr_limit, thread_info, addr_limit);
OFFSET(TI_restart_block, thread_info, restart_block);
+ OFFSET(TI_sysenter_return, thread_info, sysenter_return);
BLANK();
OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
@@ -68,5 +71,7 @@ void foo(void)
sizeof(struct tss_struct));
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
- DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
+ DEFINE(VDSO_PRELINK, VDSO_PRELINK);
+
+ OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
}
diff --git a/arch/i386/kernel/bootflag.c b/arch/i386/kernel/bootflag.c
index 4c30ed01f4e1..0b9860530a6b 100644
--- a/arch/i386/kernel/bootflag.c
+++ b/arch/i386/kernel/bootflag.c
@@ -3,7 +3,6 @@
*/
-#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 786d1a57048b..e4758095d87a 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -22,7 +22,7 @@
extern void vide(void);
__asm__(".align 4\nvide: ret");
-static void __init init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
{
u32 l, h;
int mbytes = num_physpages >> (20-PAGE_SHIFT);
@@ -224,25 +224,29 @@ static void __init init_amd(struct cpuinfo_x86 *c)
#ifdef CONFIG_X86_HT
/*
- * On a AMD dual core setup the lower bits of the APIC id
- * distingush the cores. Assumes number of cores is a power
- * of two.
+ * On a AMD multi core setup the lower bits of the APIC id
+ * distingush the cores.
*/
if (c->x86_max_cores > 1) {
int cpu = smp_processor_id();
- unsigned bits = 0;
- while ((1 << bits) < c->x86_max_cores)
- bits++;
- cpu_core_id[cpu] = phys_proc_id[cpu] & ((1<<bits)-1);
- phys_proc_id[cpu] >>= bits;
+ unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
+
+ if (bits == 0) {
+ while ((1 << bits) < c->x86_max_cores)
+ bits++;
+ }
+ c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
+ c->phys_proc_id >>= bits;
printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
- cpu, c->x86_max_cores, cpu_core_id[cpu]);
+ cpu, c->x86_max_cores, c->cpu_core_id);
}
#endif
+ if (cpuid_eax(0x80000000) >= 0x80000006)
+ num_cache_leaves = 3;
}
-static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
if ((c->x86 == 6)) {
@@ -255,7 +259,7 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
return size;
}
-static struct cpu_dev amd_cpu_dev __initdata = {
+static struct cpu_dev amd_cpu_dev __cpuinitdata = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
.c_models = {
@@ -271,7 +275,6 @@ static struct cpu_dev amd_cpu_dev __initdata = {
},
},
.c_init = init_amd,
- .c_identify = generic_identify,
.c_size_cache = amd_size_cache,
};
diff --git a/arch/i386/kernel/cpu/centaur.c b/arch/i386/kernel/cpu/centaur.c
index bd75629dd262..8c25047975c0 100644
--- a/arch/i386/kernel/cpu/centaur.c
+++ b/arch/i386/kernel/cpu/centaur.c
@@ -9,7 +9,7 @@
#ifdef CONFIG_X86_OOSTORE
-static u32 __init power2(u32 x)
+static u32 __cpuinit power2(u32 x)
{
u32 s=1;
while(s<=x)
@@ -22,7 +22,7 @@ static u32 __init power2(u32 x)
* Set up an actual MCR
*/
-static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key)
+static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
{
u32 lo, hi;
@@ -40,7 +40,7 @@ static void __init centaur_mcr_insert(int reg, u32 base, u32 size, int key)
* Shortcut: We know you can't put 4Gig of RAM on a winchip
*/
-static u32 __init ramtop(void) /* 16388 */
+static u32 __cpuinit ramtop(void) /* 16388 */
{
int i;
u32 top = 0;
@@ -91,7 +91,7 @@ static u32 __init ramtop(void) /* 16388 */
* Compute a set of MCR's to give maximum coverage
*/
-static int __init centaur_mcr_compute(int nr, int key)
+static int __cpuinit centaur_mcr_compute(int nr, int key)
{
u32 mem = ramtop();
u32 root = power2(mem);
@@ -166,7 +166,7 @@ static int __init centaur_mcr_compute(int nr, int key)
return ct;
}
-static void __init centaur_create_optimal_mcr(void)
+static void __cpuinit centaur_create_optimal_mcr(void)
{
int i;
/*
@@ -189,7 +189,7 @@ static void __init centaur_create_optimal_mcr(void)
wrmsr(MSR_IDT_MCR0+i, 0, 0);
}
-static void __init winchip2_create_optimal_mcr(void)
+static void __cpuinit winchip2_create_optimal_mcr(void)
{
u32 lo, hi;
int i;
@@ -227,7 +227,7 @@ static void __init winchip2_create_optimal_mcr(void)
* Handle the MCR key on the Winchip 2.
*/
-static void __init winchip2_unprotect_mcr(void)
+static void __cpuinit winchip2_unprotect_mcr(void)
{
u32 lo, hi;
u32 key;
@@ -239,7 +239,7 @@ static void __init winchip2_unprotect_mcr(void)
wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
}
-static void __init winchip2_protect_mcr(void)
+static void __cpuinit winchip2_protect_mcr(void)
{
u32 lo, hi;
@@ -257,7 +257,7 @@ static void __init winchip2_protect_mcr(void)
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
-static void __init init_c3(struct cpuinfo_x86 *c)
+static void __cpuinit init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -303,7 +303,7 @@ static void __init init_c3(struct cpuinfo_x86 *c)
display_cacheinfo(c);
}
-static void __init init_centaur(struct cpuinfo_x86 *c)
+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
{
enum {
ECX8=1<<1,
@@ -442,7 +442,7 @@ static void __init init_centaur(struct cpuinfo_x86 *c)
}
}
-static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size)
{
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
@@ -457,7 +457,7 @@ static unsigned int centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size
return size;
}
-static struct cpu_dev centaur_cpu_dev __initdata = {
+static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
.c_vendor = "Centaur",
.c_ident = { "CentaurHauls" },
.c_init = init_centaur,
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 44f2c5f2dda1..2799baaadf45 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -36,7 +36,7 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
extern int disable_pse;
-static void default_init(struct cpuinfo_x86 * c)
+static void __cpuinit default_init(struct cpuinfo_x86 * c)
{
/* Not much we can do here... */
/* Check if at least it has cpuid */
@@ -49,7 +49,7 @@ static void default_init(struct cpuinfo_x86 * c)
}
}
-static struct cpu_dev default_cpu = {
+static struct cpu_dev __cpuinitdata default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
};
@@ -265,7 +265,7 @@ static void __init early_cpu_detect(void)
}
}
-void __cpuinit generic_identify(struct cpuinfo_x86 * c)
+static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
{
u32 tfms, xlvl;
int ebx;
@@ -294,7 +294,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xF) << 4;
c->x86_mask = tfms & 15;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
#else
c->apicid = (ebx >> 24) & 0xFF;
@@ -319,7 +319,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
early_intel_workaround(c);
#ifdef CONFIG_X86_HT
- phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
+ c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
#endif
}
@@ -477,11 +477,9 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
int index_msb, core_bits;
- int cpu = smp_processor_id();
cpuid(1, &eax, &ebx, &ecx, &edx);
-
if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
return;
@@ -492,16 +490,17 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
} else if (smp_num_siblings > 1 ) {
if (smp_num_siblings > NR_CPUS) {
- printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings);
+ printk(KERN_WARNING "CPU: Unsupported number of the "
+ "siblings %d", smp_num_siblings);
smp_num_siblings = 1;
return;
}
index_msb = get_count_order(smp_num_siblings);
- phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+ c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
- phys_proc_id[cpu]);
+ c->phys_proc_id);
smp_num_siblings = smp_num_siblings / c->x86_max_cores;
@@ -509,12 +508,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
core_bits = get_count_order(c->x86_max_cores);
- cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
+ c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
((1 << core_bits) - 1);
if (c->x86_max_cores > 1)
printk(KERN_INFO "CPU: Processor Core ID: %d\n",
- cpu_core_id[cpu]);
+ c->cpu_core_id);
}
}
#endif
@@ -613,6 +612,12 @@ void __cpuinit cpu_init(void)
set_in_cr4(X86_CR4_TSD);
}
+ /* The CPU hotplug case */
+ if (cpu_gdt_descr->address) {
+ gdt = (struct desc_struct *)cpu_gdt_descr->address;
+ memset(gdt, 0, PAGE_SIZE);
+ goto old_gdt;
+ }
/*
* This is a horrible hack to allocate the GDT. The problem
* is that cpu_init() is called really early for the boot CPU
@@ -631,7 +636,7 @@ void __cpuinit cpu_init(void)
local_irq_enable();
}
}
-
+old_gdt:
/*
* Initialize the per-CPU GDT with the boot GDT,
* and set up the GDT descriptor:
@@ -670,7 +675,7 @@ void __cpuinit cpu_init(void)
#endif
/* Clear %fs and %gs. */
- asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
+ asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
/* Clear all 6 debug registers: */
set_debugreg(0, 0);
diff --git a/arch/i386/kernel/cpu/cpu.h b/arch/i386/kernel/cpu/cpu.h
index 5a1d4f163e84..2f6432cef6ff 100644
--- a/arch/i386/kernel/cpu/cpu.h
+++ b/arch/i386/kernel/cpu/cpu.h
@@ -24,7 +24,5 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
extern int get_model_name(struct cpuinfo_x86 *c);
extern void display_cacheinfo(struct cpuinfo_x86 *c);
-extern void generic_identify(struct cpuinfo_x86 * c);
-
extern void early_intel_workaround(struct cpuinfo_x86 *c);
diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig
index e44a4c6a4fe5..ccc1edff5c97 100644
--- a/arch/i386/kernel/cpu/cpufreq/Kconfig
+++ b/arch/i386/kernel/cpu/cpufreq/Kconfig
@@ -96,6 +96,7 @@ config X86_POWERNOW_K8_ACPI
config X86_GX_SUSPMOD
tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
+ depends on PCI
help
This add the CPUFreq driver for NatSemi Geode processors which
support suspend modulation.
@@ -202,7 +203,7 @@ config X86_LONGRUN
config X86_LONGHAUL
tristate "VIA Cyrix III Longhaul"
select CPU_FREQ_TABLE
- depends on BROKEN
+ depends on ACPI_PROCESSOR
help
This adds the CPUFreq driver for VIA Samuel/CyrixIII,
VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
index 05668e3598c0..ea19d091fd41 100644
--- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -24,7 +24,6 @@
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
@@ -33,6 +32,7 @@
#include <linux/seq_file.h>
#include <linux/compiler.h>
#include <linux/sched.h> /* current */
+#include <linux/dmi.h>
#include <asm/io.h>
#include <asm/delay.h>
#include <asm/uaccess.h>
@@ -371,11 +371,11 @@ static int acpi_cpufreq_early_init_acpi(void)
dprintk("acpi_cpufreq_early_init\n");
- for_each_cpu(i) {
+ for_each_possible_cpu(i) {
data = kzalloc(sizeof(struct acpi_processor_performance),
GFP_KERNEL);
if (!data) {
- for_each_cpu(j) {
+ for_each_possible_cpu(j) {
kfree(acpi_perf_data[j]);
acpi_perf_data[j] = NULL;
}
@@ -385,10 +385,36 @@ static int acpi_cpufreq_early_init_acpi(void)
}
/* Do initialization in ACPI core */
- acpi_processor_preregister_performance(acpi_perf_data);
+ return acpi_processor_preregister_performance(acpi_perf_data);
+}
+
+/*
+ * Some BIOSes do SW_ANY coordination internally, either set it up in hw
+ * or do it in BIOS firmware and won't inform about it to OS. If not
+ * detected, this has a side effect of making CPU run at a different speed
+ * than OS intended it to run at. Detect it and handle it cleanly.
+ */
+static int bios_with_sw_any_bug;
+
+static int __init sw_any_bug_found(struct dmi_system_id *d)
+{
+ bios_with_sw_any_bug = 1;
return 0;
}
+static struct dmi_system_id __initdata sw_any_bug_dmi_table[] = {
+ {
+ .callback = sw_any_bug_found,
+ .ident = "Supermicro Server X6DLP",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
+ DMI_MATCH(DMI_BIOS_VERSION, "080010"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
+ },
+ },
+ { }
+};
+
static int
acpi_cpufreq_cpu_init (
struct cpufreq_policy *policy)
@@ -418,8 +444,23 @@ acpi_cpufreq_cpu_init (
goto err_free;
perf = data->acpi_data;
- policy->cpus = perf->shared_cpu_map;
policy->shared_type = perf->shared_type;
+ /*
+ * Will let policy->cpus know about dependency only when software
+ * coordination is required.
+ */
+ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
+ policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
+ policy->cpus = perf->shared_cpu_map;
+ }
+
+#ifdef CONFIG_SMP
+ dmi_check_system(sw_any_bug_dmi_table);
+ if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) {
+ policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+ policy->cpus = cpu_core_map[cpu];
+ }
+#endif
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -563,16 +604,11 @@ static struct cpufreq_driver acpi_cpufreq_driver = {
static int __init
acpi_cpufreq_init (void)
{
- int result = 0;
-
dprintk("acpi_cpufreq_init\n");
- result = acpi_cpufreq_early_init_acpi();
+ acpi_cpufreq_early_init_acpi();
- if (!result)
- result = cpufreq_register_driver(&acpi_cpufreq_driver);
-
- return (result);
+ return cpufreq_register_driver(&acpi_cpufreq_driver);
}
@@ -584,7 +620,7 @@ acpi_cpufreq_exit (void)
cpufreq_unregister_driver(&acpi_cpufreq_driver);
- for_each_cpu(i) {
+ for_each_possible_cpu(i) {
kfree(acpi_perf_data[i]);
acpi_perf_data[i] = NULL;
}
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c
index 146f607e9c44..f5cc9f5c9bab 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c
@@ -27,13 +27,16 @@
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
+#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/string.h>
-#include <linux/pci.h>
#include <asm/msr.h>
#include <asm/timex.h>
#include <asm/io.h>
+#include <asm/acpi.h>
+#include <linux/acpi.h>
+#include <acpi/processor.h>
#include "longhaul.h"
@@ -50,16 +53,26 @@
#define CPU_NEHEMIAH 5
static int cpu_model;
-static unsigned int numscales=16, numvscales;
+static unsigned int numscales=16;
static unsigned int fsb;
-static int minvid, maxvid;
+
+static struct mV_pos *vrm_mV_table;
+static unsigned char *mV_vrm_table;
+struct f_msr {
+ unsigned char vrm;
+};
+static struct f_msr f_msr_table[32];
+
+static unsigned int highest_speed, lowest_speed; /* kHz */
static unsigned int minmult, maxmult;
static int can_scale_voltage;
-static int vrmrev;
+static struct acpi_processor *pr = NULL;
+static struct acpi_processor_cx *cx = NULL;
+static int port22_en;
/* Module parameters */
-static int dont_scale_voltage;
-
+static int scale_voltage;
+static int ignore_latency;
#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
@@ -67,7 +80,6 @@ static int dont_scale_voltage;
/* Clock ratios multiplied by 10 */
static int clock_ratio[32];
static int eblcr_table[32];
-static int voltage_table[32];
static unsigned int highest_speed, lowest_speed; /* kHz */
static int longhaul_version;
static struct cpufreq_frequency_table *longhaul_table;
@@ -118,84 +130,67 @@ static int longhaul_get_cpu_mult(void)
return eblcr_table[invalue];
}
+/* For processor with BCR2 MSR */
-static void do_powersaver(union msr_longhaul *longhaul,
- unsigned int clock_ratio_index)
+static void do_longhaul1(unsigned int clock_ratio_index)
{
- struct pci_dev *dev;
- unsigned long flags;
- unsigned int tmp_mask;
- int version;
- int i;
- u16 pci_cmd;
- u16 cmd_state[64];
-
- switch (cpu_model) {
- case CPU_EZRA_T:
- version = 3;
- break;
- case CPU_NEHEMIAH:
- version = 0xf;
- break;
- default:
- return;
- }
-
- rdmsrl(MSR_VIA_LONGHAUL, longhaul->val);
- longhaul->bits.SoftBusRatio = clock_ratio_index & 0xf;
- longhaul->bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
- longhaul->bits.EnableSoftBusRatio = 1;
- longhaul->bits.RevisionKey = 0;
-
- preempt_disable();
- local_irq_save(flags);
-
- /*
- * get current pci bus master state for all devices
- * and clear bus master bit
- */
- dev = NULL;
- i = 0;
- do {
- dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
- if (dev != NULL) {
- pci_read_config_word(dev, PCI_COMMAND, &pci_cmd);
- cmd_state[i++] = pci_cmd;
- pci_cmd &= ~PCI_COMMAND_MASTER;
- pci_write_config_word(dev, PCI_COMMAND, pci_cmd);
- }
- } while (dev != NULL);
+ union msr_bcr2 bcr2;
- tmp_mask=inb(0x21); /* works on C3. save mask. */
- outb(0xFE,0x21); /* TMR0 only */
- outb(0xFF,0x80); /* delay */
+ rdmsrl(MSR_VIA_BCR2, bcr2.val);
+ /* Enable software clock multiplier */
+ bcr2.bits.ESOFTBF = 1;
+ bcr2.bits.CLOCKMUL = clock_ratio_index;
+ /* Sync to timer tick */
safe_halt();
- wrmsrl(MSR_VIA_LONGHAUL, longhaul->val);
+ /* Change frequency on next halt or sleep */
+ wrmsrl(MSR_VIA_BCR2, bcr2.val);
+ /* Invoke transition */
+ ACPI_FLUSH_CPU_CACHE();
halt();
+ /* Disable software clock multiplier */
local_irq_disable();
+ rdmsrl(MSR_VIA_BCR2, bcr2.val);
+ bcr2.bits.ESOFTBF = 0;
+ wrmsrl(MSR_VIA_BCR2, bcr2.val);
+}
- outb(tmp_mask,0x21); /* restore mask */
+/* For processor with Longhaul MSR */
- /* restore pci bus master state for all devices */
- dev = NULL;
- i = 0;
- do {
- dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
- if (dev != NULL) {
- pci_cmd = cmd_state[i++];
- pci_write_config_byte(dev, PCI_COMMAND, pci_cmd);
- }
- } while (dev != NULL);
- local_irq_restore(flags);
- preempt_enable();
+static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
+{
+ union msr_longhaul longhaul;
+ u32 t;
+
+ rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
+ longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
+ longhaul.bits.SoftBusRatio = clock_ratio_index & 0xf;
+ longhaul.bits.SoftBusRatio4 = (clock_ratio_index & 0x10) >> 4;
+ longhaul.bits.EnableSoftBusRatio = 1;
+
+ if (can_scale_voltage) {
+ longhaul.bits.SoftVID = f_msr_table[clock_ratio_index].vrm;
+ longhaul.bits.EnableSoftVID = 1;
+ }
- /* disable bus ratio bit */
- rdmsrl(MSR_VIA_LONGHAUL, longhaul->val);
- longhaul->bits.EnableSoftBusRatio = 0;
- longhaul->bits.RevisionKey = version;
- wrmsrl(MSR_VIA_LONGHAUL, longhaul->val);
+ /* Sync to timer tick */
+ safe_halt();
+ /* Change frequency on next halt or sleep */
+ wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
+ ACPI_FLUSH_CPU_CACHE();
+ /* Invoke C3 */
+ inb(cx_address);
+ /* Dummy op - must do something useless after P_LVL3 read */
+ t = inl(acpi_fadt.xpm_tmr_blk.address);
+
+ /* Disable bus ratio bit */
+ local_irq_disable();
+ longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
+ longhaul.bits.EnableSoftBusRatio = 0;
+ longhaul.bits.EnableSoftBSEL = 0;
+ longhaul.bits.EnableSoftVID = 0;
+ wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
}
/**
@@ -209,9 +204,9 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
{
int speed, mult;
struct cpufreq_freqs freqs;
- union msr_longhaul longhaul;
- union msr_bcr2 bcr2;
static unsigned int old_ratio=-1;
+ unsigned long flags;
+ unsigned int pic1_mask, pic2_mask;
if (old_ratio == clock_ratio_index)
return;
@@ -234,6 +229,23 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
dprintk ("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
fsb, mult/10, mult%10, print_speed(speed/1000));
+ preempt_disable();
+ local_irq_save(flags);
+
+ pic2_mask = inb(0xA1);
+ pic1_mask = inb(0x21); /* works on C3. save mask. */
+ outb(0xFF,0xA1); /* Overkill */
+ outb(0xFE,0x21); /* TMR0 only */
+
+ if (pr->flags.bm_control) {
+ /* Disable bus master arbitration */
+ acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1,
+ ACPI_MTX_DO_NOT_LOCK);
+ } else if (port22_en) {
+ /* Disable AGP and PCI arbiters */
+ outb(3, 0x22);
+ }
+
switch (longhaul_version) {
/*
@@ -245,20 +257,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
*/
case TYPE_LONGHAUL_V1:
case TYPE_LONGHAUL_V2:
- rdmsrl (MSR_VIA_BCR2, bcr2.val);
- /* Enable software clock multiplier */
- bcr2.bits.ESOFTBF = 1;
- bcr2.bits.CLOCKMUL = clock_ratio_index;
- local_irq_disable();
- wrmsrl (MSR_VIA_BCR2, bcr2.val);
- safe_halt();
-
- /* Disable software clock multiplier */
- rdmsrl (MSR_VIA_BCR2, bcr2.val);
- bcr2.bits.ESOFTBF = 0;
- local_irq_disable();
- wrmsrl (MSR_VIA_BCR2, bcr2.val);
- local_irq_enable();
+ do_longhaul1(clock_ratio_index);
break;
/*
@@ -273,10 +272,28 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
* to work in practice.
*/
case TYPE_POWERSAVER:
- do_powersaver(&longhaul, clock_ratio_index);
+ /* Don't allow wakeup */
+ acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0,
+ ACPI_MTX_DO_NOT_LOCK);
+ do_powersaver(cx->address, clock_ratio_index);
break;
}
+ if (pr->flags.bm_control) {
+ /* Enable bus master arbitration */
+ acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0,
+ ACPI_MTX_DO_NOT_LOCK);
+ } else if (port22_en) {
+ /* Enable arbiters */
+ outb(0, 0x22);
+ }
+
+ outb(pic2_mask,0xA1); /* restore mask */
+ outb(pic1_mask,0x21);
+
+ local_irq_restore(flags);
+ preempt_enable();
+
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
@@ -324,9 +341,11 @@ static int guess_fsb(void)
static int __init longhaul_get_ranges(void)
{
unsigned long invalue;
- unsigned int multipliers[32]= {
- 50,30,40,100,55,35,45,95,90,70,80,60,120,75,85,65,
- -1,110,120,-1,135,115,125,105,130,150,160,140,-1,155,-1,145 };
+ unsigned int ezra_t_multipliers[32]= {
+ 90, 30, 40, 100, 55, 35, 45, 95,
+ 50, 70, 80, 60, 120, 75, 85, 65,
+ -1, 110, 120, -1, 135, 115, 125, 105,
+ 130, 150, 160, 140, -1, 155, -1, 145 };
unsigned int j, k = 0;
union msr_longhaul longhaul;
unsigned long lo, hi;
@@ -355,13 +374,13 @@ static int __init longhaul_get_ranges(void)
invalue = longhaul.bits.MaxMHzBR;
if (longhaul.bits.MaxMHzBR4)
invalue += 16;
- maxmult=multipliers[invalue];
+ maxmult=ezra_t_multipliers[invalue];
invalue = longhaul.bits.MinMHzBR;
if (longhaul.bits.MinMHzBR4 == 1)
minmult = 30;
else
- minmult = multipliers[invalue];
+ minmult = ezra_t_multipliers[invalue];
fsb = eblcr_fsb_table_v2[longhaul.bits.MaxMHzFSB];
break;
}
@@ -446,53 +465,57 @@ static int __init longhaul_get_ranges(void)
static void __init longhaul_setup_voltagescaling(void)
{
union msr_longhaul longhaul;
+ struct mV_pos minvid, maxvid;
+ unsigned int j, speed, pos, kHz_step, numvscales;
- rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
-
- if (!(longhaul.bits.RevisionID & 1))
+ rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
+ if (!(longhaul.bits.RevisionID & 1)) {
+ printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
return;
+ }
+
+ if (!longhaul.bits.VRMRev) {
+ printk (KERN_INFO PFX "VRM 8.5\n");
+ vrm_mV_table = &vrm85_mV[0];
+ mV_vrm_table = &mV_vrm85[0];
+ } else {
+ printk (KERN_INFO PFX "Mobile VRM\n");
+ vrm_mV_table = &mobilevrm_mV[0];
+ mV_vrm_table = &mV_mobilevrm[0];
+ }
- minvid = longhaul.bits.MinimumVID;
- maxvid = longhaul.bits.MaximumVID;
- vrmrev = longhaul.bits.VRMRev;
+ minvid = vrm_mV_table[longhaul.bits.MinimumVID];
+ maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
+ numvscales = maxvid.pos - minvid.pos + 1;
+ kHz_step = (highest_speed - lowest_speed) / numvscales;
- if (minvid == 0 || maxvid == 0) {
+ if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
printk (KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
"Voltage scaling disabled.\n",
- minvid/1000, minvid%1000, maxvid/1000, maxvid%1000);
+ minvid.mV/1000, minvid.mV%1000, maxvid.mV/1000, maxvid.mV%1000);
return;
}
- if (minvid == maxvid) {
+ if (minvid.mV == maxvid.mV) {
printk (KERN_INFO PFX "Claims to support voltage scaling but min & max are "
"both %d.%03d. Voltage scaling disabled\n",
- maxvid/1000, maxvid%1000);
+ maxvid.mV/1000, maxvid.mV%1000);
return;
}
- if (vrmrev==0) {
- dprintk ("VRM 8.5\n");
- memcpy (voltage_table, vrm85scales, sizeof(voltage_table));
- numvscales = (voltage_table[maxvid]-voltage_table[minvid])/25;
- } else {
- dprintk ("Mobile VRM\n");
- memcpy (voltage_table, mobilevrmscales, sizeof(voltage_table));
- numvscales = (voltage_table[maxvid]-voltage_table[minvid])/5;
+ printk(KERN_INFO PFX "Max VID=%d.%03d Min VID=%d.%03d, %d possible voltage scales\n",
+ maxvid.mV/1000, maxvid.mV%1000,
+ minvid.mV/1000, minvid.mV%1000,
+ numvscales);
+
+ j = 0;
+ while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
+ speed = longhaul_table[j].frequency;
+ pos = (speed - lowest_speed) / kHz_step + minvid.pos;
+ f_msr_table[longhaul_table[j].index].vrm = mV_vrm_table[pos];
+ j++;
}
- /* Current voltage isn't readable at first, so we need to
- set it to a known value. The spec says to use maxvid */
- longhaul.bits.RevisionKey = longhaul.bits.RevisionID; /* FIXME: This is bad. */
- longhaul.bits.EnableSoftVID = 1;
- longhaul.bits.SoftVID = maxvid;
- wrmsrl (MSR_VIA_LONGHAUL, longhaul.val);
-
- minvid = voltage_table[minvid];
- maxvid = voltage_table[maxvid];
-
- dprintk ("Min VID=%d.%03d Max VID=%d.%03d, %d possible voltage scales\n",
- maxvid/1000, maxvid%1000, minvid/1000, minvid%1000, numvscales);
-
can_scale_voltage = 1;
}
@@ -527,6 +550,38 @@ static unsigned int longhaul_get(unsigned int cpu)
return calc_speed(longhaul_get_cpu_mult());
}
+static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
+ u32 nesting_level,
+ void *context, void **return_value)
+{
+ struct acpi_device *d;
+
+ if ( acpi_bus_get_device(obj_handle, &d) ) {
+ return 0;
+ }
+ *return_value = (void *)acpi_driver_data(d);
+ return 1;
+}
+
+/* VIA don't support PM2 reg, but have something similar */
+static int enable_arbiter_disable(void)
+{
+ struct pci_dev *dev;
+ u8 pci_cmd;
+
+ /* Find PLE133 host bridge */
+ dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, NULL);
+ if (dev != NULL) {
+ /* Enable access to port 0x22 */
+ pci_read_config_byte(dev, 0x78, &pci_cmd);
+ if ( !(pci_cmd & 1<<7) ) {
+ pci_cmd |= 1<<7;
+ pci_write_config_byte(dev, 0x78, pci_cmd);
+ }
+ return 1;
+ }
+ return 0;
+}
static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
{
@@ -534,6 +589,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
char *cpuname=NULL;
int ret;
+ /* Check what we have on this motherboard */
switch (c->x86_model) {
case 6:
cpu_model = CPU_SAMUEL;
@@ -615,12 +671,36 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
break;
};
+ /* Find ACPI data for processor */
+ acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX,
+ &longhaul_walk_callback, NULL, (void *)&pr);
+ if (pr == NULL)
+ goto err_acpi;
+
+ if (longhaul_version == TYPE_POWERSAVER) {
+ /* Check ACPI support for C3 state */
+ cx = &pr->power.states[ACPI_STATE_C3];
+ if (cx->address == 0 ||
+ (cx->latency > 1000 && ignore_latency == 0) )
+ goto err_acpi;
+
+ } else {
+ /* Check ACPI support for bus master arbiter disable */
+ if (!pr->flags.bm_control) {
+ if (!enable_arbiter_disable()) {
+ printk(KERN_ERR PFX "No ACPI support. No VT8601 host bridge. Aborting.\n");
+ return -ENODEV;
+ } else
+ port22_en = 1;
+ }
+ }
+
ret = longhaul_get_ranges();
if (ret != 0)
return ret;
if ((longhaul_version==TYPE_LONGHAUL_V2 || longhaul_version==TYPE_POWERSAVER) &&
- (dont_scale_voltage==0))
+ (scale_voltage != 0))
longhaul_setup_voltagescaling();
policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
@@ -634,6 +714,10 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
return 0;
+
+err_acpi:
+ printk(KERN_ERR PFX "No ACPI support for CPU frequency changes.\n");
+ return -ENODEV;
}
static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
@@ -666,6 +750,18 @@ static int __init longhaul_init(void)
if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
return -ENODEV;
+#ifdef CONFIG_SMP
+ if (num_online_cpus() > 1) {
+ return -ENODEV;
+ printk(KERN_ERR PFX "More than 1 CPU detected, longhaul disabled.\n");
+ }
+#endif
+#ifdef CONFIG_X86_IO_APIC
+ if (cpu_has_apic) {
+ printk(KERN_ERR PFX "APIC detected. Longhaul is currently broken in this configuration.\n");
+ return -ENODEV;
+ }
+#endif
switch (c->x86_model) {
case 6 ... 9:
return cpufreq_register_driver(&longhaul_driver);
@@ -692,13 +788,14 @@ static void __exit longhaul_exit(void)
kfree(longhaul_table);
}
-module_param (dont_scale_voltage, int, 0644);
-MODULE_PARM_DESC(dont_scale_voltage, "Don't scale voltage of processor");
+module_param (scale_voltage, int, 0644);
+MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
+module_param(ignore_latency, int, 0644);
+MODULE_PARM_DESC(ignore_latency, "Skip ACPI C3 latency test");
MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
MODULE_LICENSE ("GPL");
-module_init(longhaul_init);
+late_initcall(longhaul_init);
module_exit(longhaul_exit);
-
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.h b/arch/i386/kernel/cpu/cpufreq/longhaul.h
index d3a95d77ee85..bc4682aad69b 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.h
@@ -450,17 +450,45 @@ static int __initdata nehemiah_c_eblcr[32] = {
* Voltage scales. Div/Mod by 1000 to get actual voltage.
* Which scale to use depends on the VRM type in use.
*/
-static int __initdata vrm85scales[32] = {
- 1250, 1200, 1150, 1100, 1050, 1800, 1750, 1700,
- 1650, 1600, 1550, 1500, 1450, 1400, 1350, 1300,
- 1275, 1225, 1175, 1125, 1075, 1825, 1775, 1725,
- 1675, 1625, 1575, 1525, 1475, 1425, 1375, 1325,
+
+struct mV_pos {
+ unsigned short mV;
+ unsigned short pos;
+};
+
+static struct mV_pos __initdata vrm85_mV[32] = {
+ {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
+ {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
+ {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
+ {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
+ {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
+ {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
+ {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
+ {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
+};
+
+static unsigned char __initdata mV_vrm85[32] = {
+ 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
+ 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
+ 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
+ 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
+};
+
+static struct mV_pos __initdata mobilevrm_mV[32] = {
+ {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
+ {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
+ {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
+ {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
+ {975, 15}, {950, 14}, {925, 13}, {900, 12},
+ {875, 11}, {850, 10}, {825, 9}, {800, 8},
+ {775, 7}, {750, 6}, {725, 5}, {700, 4},
+ {675, 3}, {650, 2}, {625, 1}, {600, 0}
};
-static int __initdata mobilevrmscales[32] = {
- 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
- 1600, 1550, 1500, 1450, 1500, 1350, 1300, -1,
- 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
- 1075, 1050, 1025, 1000, 975, 950, 925, -1,
+static unsigned char __initdata mV_mobilevrm[32] = {
+ 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
+ 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
+ 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
+ 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
};
diff --git a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
index ab6504efd801..304d2eaa4a1b 100644
--- a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
@@ -20,7 +20,6 @@
*
*/
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c
index 694d4793bf6a..54382760983a 100644
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c
@@ -12,7 +12,6 @@
* - We disable half multipliers if ACPI is used on A0 stepping CPUs.
*/
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
index 31c3a5baaa7f..7a9325349e94 100644
--- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -17,13 +17,13 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
-#include <linux/config.h>
#include <linux/sched.h> /* current */
#include <linux/delay.h>
#include <linux/compiler.h>
#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
#include <linux/acpi.h>
+#include <linux/dmi.h>
#include <acpi/processor.h>
#endif
@@ -361,11 +361,11 @@ static int centrino_cpu_early_init_acpi(void)
unsigned int i, j;
struct acpi_processor_performance *data;
- for_each_cpu(i) {
+ for_each_possible_cpu(i) {
data = kzalloc(sizeof(struct acpi_processor_performance),
GFP_KERNEL);
if (!data) {
- for_each_cpu(j) {
+ for_each_possible_cpu(j) {
kfree(acpi_perf_data[j]);
acpi_perf_data[j] = NULL;
}
@@ -378,6 +378,35 @@ static int centrino_cpu_early_init_acpi(void)
return 0;
}
+
+/*
+ * Some BIOSes do SW_ANY coordination internally, either set it up in hw
+ * or do it in BIOS firmware and won't inform about it to OS. If not
+ * detected, this has a side effect of making CPU run at a different speed
+ * than OS intended it to run at. Detect it and handle it cleanly.
+ */
+static int bios_with_sw_any_bug;
+static int __init sw_any_bug_found(struct dmi_system_id *d)
+{
+ bios_with_sw_any_bug = 1;
+ return 0;
+}
+
+
+static struct dmi_system_id sw_any_bug_dmi_table[] = {
+ {
+ .callback = sw_any_bug_found,
+ .ident = "Supermicro Server X6DLP",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
+ DMI_MATCH(DMI_BIOS_VERSION, "080010"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
+ },
+ },
+ { }
+};
+
+
/*
* centrino_cpu_init_acpi - register with ACPI P-States library
*
@@ -399,8 +428,24 @@ static int centrino_cpu_init_acpi(struct cpufreq_policy *policy)
dprintk(PFX "obtaining ACPI data failed\n");
return -EIO;
}
- policy->cpus = p->shared_cpu_map;
+
policy->shared_type = p->shared_type;
+ /*
+ * Will let policy->cpus know about dependency only when software
+ * coordination is required.
+ */
+ if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
+ policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
+ policy->cpus = p->shared_cpu_map;
+ }
+
+#ifdef CONFIG_SMP
+ dmi_check_system(sw_any_bug_dmi_table);
+ if (bios_with_sw_any_bug && cpus_weight(policy->cpus) == 1) {
+ policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+ policy->cpus = cpu_core_map[cpu];
+ }
+#endif
/* verify the acpi_data */
if (p->state_count <= 1) {
@@ -805,7 +850,7 @@ static void __exit centrino_exit(void)
cpufreq_unregister_driver(&centrino_driver);
#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_ACPI
- for_each_cpu(j) {
+ for_each_possible_cpu(j) {
kfree(acpi_perf_data[j]);
acpi_perf_data[j] = NULL;
}
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index fc32c8028e24..c0c3b59de32c 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -12,7 +12,7 @@
/*
* Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
*/
-static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned char ccr2, ccr3;
unsigned long flags;
@@ -52,25 +52,25 @@ static void __init do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
* Actually since bugs.h doesn't even reference this perhaps someone should
* fix the documentation ???
*/
-static unsigned char Cx86_dir0_msb __initdata = 0;
+static unsigned char Cx86_dir0_msb __cpuinitdata = 0;
-static char Cx86_model[][9] __initdata = {
+static char Cx86_model[][9] __cpuinitdata = {
"Cx486", "Cx486", "5x86 ", "6x86", "MediaGX ", "6x86MX ",
"M II ", "Unknown"
};
-static char Cx486_name[][5] __initdata = {
+static char Cx486_name[][5] __cpuinitdata = {
"SLC", "DLC", "SLC2", "DLC2", "SRx", "DRx",
"SRx2", "DRx2"
};
-static char Cx486S_name[][4] __initdata = {
+static char Cx486S_name[][4] __cpuinitdata = {
"S", "S2", "Se", "S2e"
};
-static char Cx486D_name[][4] __initdata = {
+static char Cx486D_name[][4] __cpuinitdata = {
"DX", "DX2", "?", "?", "?", "DX4"
};
-static char Cx86_cb[] __initdata = "?.5x Core/Bus Clock";
-static char cyrix_model_mult1[] __initdata = "12??43";
-static char cyrix_model_mult2[] __initdata = "12233445";
+static char Cx86_cb[] __cpuinitdata = "?.5x Core/Bus Clock";
+static char cyrix_model_mult1[] __cpuinitdata = "12??43";
+static char cyrix_model_mult2[] __cpuinitdata = "12233445";
/*
* Reset the slow-loop (SLOP) bit on the 686(L) which is set by some old
@@ -82,7 +82,7 @@ static char cyrix_model_mult2[] __initdata = "12233445";
extern void calibrate_delay(void) __init;
-static void __init check_cx686_slop(struct cpuinfo_x86 *c)
+static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
{
unsigned long flags;
@@ -107,7 +107,7 @@ static void __init check_cx686_slop(struct cpuinfo_x86 *c)
}
-static void __init set_cx86_reorder(void)
+static void __cpuinit set_cx86_reorder(void)
{
u8 ccr3;
@@ -122,7 +122,7 @@ static void __init set_cx86_reorder(void)
setCx86(CX86_CCR3, ccr3);
}
-static void __init set_cx86_memwb(void)
+static void __cpuinit set_cx86_memwb(void)
{
u32 cr0;
@@ -137,7 +137,7 @@ static void __init set_cx86_memwb(void)
setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
}
-static void __init set_cx86_inc(void)
+static void __cpuinit set_cx86_inc(void)
{
unsigned char ccr3;
@@ -158,7 +158,7 @@ static void __init set_cx86_inc(void)
* Configure later MediaGX and/or Geode processor.
*/
-static void __init geode_configure(void)
+static void __cpuinit geode_configure(void)
{
unsigned long flags;
u8 ccr3, ccr4;
@@ -184,14 +184,14 @@ static void __init geode_configure(void)
#ifdef CONFIG_PCI
-static struct pci_device_id __initdata cyrix_55x0[] = {
+static struct pci_device_id __cpuinitdata cyrix_55x0[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) },
{ PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) },
{ },
};
#endif
-static void __init init_cyrix(struct cpuinfo_x86 *c)
+static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
{
unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
char *buf = c->x86_model_id;
@@ -346,7 +346,7 @@ static void __init init_cyrix(struct cpuinfo_x86 *c)
/*
* Handle National Semiconductor branded processors
*/
-static void __init init_nsc(struct cpuinfo_x86 *c)
+static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
{
/* There may be GX1 processors in the wild that are branded
* NSC and not Cyrix.
@@ -354,7 +354,7 @@ static void __init init_nsc(struct cpuinfo_x86 *c)
* This function only handles the GX processor, and kicks every
* thing else to the Cyrix init function above - that should
* cover any processors that might have been branded differently
- * after NSC aquired Cyrix.
+ * after NSC acquired Cyrix.
*
* If this breaks your GX1 horribly, please e-mail
* info-linux@ldcmail.amd.com to tell us.
@@ -394,7 +394,7 @@ static inline int test_cyrix_52div(void)
return (unsigned char) (test >> 8) == 0x02;
}
-static void cyrix_identify(struct cpuinfo_x86 * c)
+static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
{
/* Detect Cyrix with disabled CPUID */
if ( c->x86 == 4 && test_cyrix_52div() ) {
@@ -427,10 +427,9 @@ static void cyrix_identify(struct cpuinfo_x86 * c)
local_irq_restore(flags);
}
}
- generic_identify(c);
}
-static struct cpu_dev cyrix_cpu_dev __initdata = {
+static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
.c_vendor = "Cyrix",
.c_ident = { "CyrixInstead" },
.c_init = init_cyrix,
@@ -453,11 +452,10 @@ static int __init cyrix_exit_cpu(void)
late_initcall(cyrix_exit_cpu);
-static struct cpu_dev nsc_cpu_dev __initdata = {
+static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
.c_vendor = "NSC",
.c_ident = { "Geode by NSC" },
.c_init = init_nsc,
- .c_identify = generic_identify,
};
int __init nsc_init_cpu(void)
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 5386b29bb5a5..94a95aa5227e 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -1,4 +1,3 @@
-#include <linux/config.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -122,6 +121,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
select_idle_routine(c);
l2 = init_intel_cacheinfo(c);
+ if (c->cpuid_level > 9 ) {
+ unsigned eax = cpuid_eax(10);
+ /* Check for version and the number of counters */
+ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
+ set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability);
+ }
/* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
@@ -193,7 +198,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
}
-static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
{
/* Intel PIII Tualatin. This comes in two flavours.
* One has 256kb of cache, the other 512. We have no way
@@ -258,7 +263,6 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
},
},
.c_init = init_intel,
- .c_identify = generic_identify,
.c_size_cache = intel_size_cache,
};
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index c8547a6fa7e6..5c43be47587f 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -4,6 +4,7 @@
* Changes:
* Venkatesh Pallipadi : Adding cache identification through cpuid(4)
* Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
+ * Andi Kleen : CPUID4 emulation on AMD.
*/
#include <linux/init.h>
@@ -130,25 +131,111 @@ struct _cpuid4_info {
cpumask_t shared_cpu_map;
};
-static unsigned short num_cache_leaves;
+unsigned short num_cache_leaves;
+
+/* AMD doesn't have CPUID4. Emulate it here to report the same
+ information to the user. This makes some assumptions about the machine:
+ No L3, L2 not shared, no SMT etc. that is currently true on AMD CPUs.
+
+ In theory the TLBs could be reported as fake type (they are in "dummy").
+ Maybe later */
+union l1_cache {
+ struct {
+ unsigned line_size : 8;
+ unsigned lines_per_tag : 8;
+ unsigned assoc : 8;
+ unsigned size_in_kb : 8;
+ };
+ unsigned val;
+};
+
+union l2_cache {
+ struct {
+ unsigned line_size : 8;
+ unsigned lines_per_tag : 4;
+ unsigned assoc : 4;
+ unsigned size_in_kb : 16;
+ };
+ unsigned val;
+};
+
+static const unsigned short assocs[] = {
+ [1] = 1, [2] = 2, [4] = 4, [6] = 8,
+ [8] = 16,
+ [0xf] = 0xffff // ??
+ };
+static const unsigned char levels[] = { 1, 1, 2 };
+static const unsigned char types[] = { 1, 2, 3 };
+
+static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
+ union _cpuid4_leaf_ebx *ebx,
+ union _cpuid4_leaf_ecx *ecx)
+{
+ unsigned dummy;
+ unsigned line_size, lines_per_tag, assoc, size_in_kb;
+ union l1_cache l1i, l1d;
+ union l2_cache l2;
+
+ eax->full = 0;
+ ebx->full = 0;
+ ecx->full = 0;
+
+ cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val);
+ cpuid(0x80000006, &dummy, &dummy, &l2.val, &dummy);
+
+ if (leaf > 2 || !l1d.val || !l1i.val || !l2.val)
+ return;
+
+ eax->split.is_self_initializing = 1;
+ eax->split.type = types[leaf];
+ eax->split.level = levels[leaf];
+ eax->split.num_threads_sharing = 0;
+ eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
+
+ if (leaf <= 1) {
+ union l1_cache *l1 = leaf == 0 ? &l1d : &l1i;
+ assoc = l1->assoc;
+ line_size = l1->line_size;
+ lines_per_tag = l1->lines_per_tag;
+ size_in_kb = l1->size_in_kb;
+ } else {
+ assoc = l2.assoc;
+ line_size = l2.line_size;
+ lines_per_tag = l2.lines_per_tag;
+ /* cpu_data has errata corrections for K7 applied */
+ size_in_kb = current_cpu_data.x86_cache_size;
+ }
+
+ if (assoc == 0xf)
+ eax->split.is_fully_associative = 1;
+ ebx->split.coherency_line_size = line_size - 1;
+ ebx->split.ways_of_associativity = assocs[assoc] - 1;
+ ebx->split.physical_line_partition = lines_per_tag - 1;
+ ecx->split.number_of_sets = (size_in_kb * 1024) / line_size /
+ (ebx->split.ways_of_associativity + 1) - 1;
+}
static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
{
- unsigned int eax, ebx, ecx, edx;
- union _cpuid4_leaf_eax cache_eax;
+ union _cpuid4_leaf_eax eax;
+ union _cpuid4_leaf_ebx ebx;
+ union _cpuid4_leaf_ecx ecx;
+ unsigned edx;
- cpuid_count(4, index, &eax, &ebx, &ecx, &edx);
- cache_eax.full = eax;
- if (cache_eax.split.type == CACHE_TYPE_NULL)
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ amd_cpuid4(index, &eax, &ebx, &ecx);
+ else
+ cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
+ if (eax.split.type == CACHE_TYPE_NULL)
return -EIO; /* better error ? */
- this_leaf->eax.full = eax;
- this_leaf->ebx.full = ebx;
- this_leaf->ecx.full = ecx;
- this_leaf->size = (this_leaf->ecx.split.number_of_sets + 1) *
- (this_leaf->ebx.split.coherency_line_size + 1) *
- (this_leaf->ebx.split.physical_line_partition + 1) *
- (this_leaf->ebx.split.ways_of_associativity + 1);
+ this_leaf->eax = eax;
+ this_leaf->ebx = ebx;
+ this_leaf->ecx = ecx;
+ this_leaf->size = (ecx.split.number_of_sets + 1) *
+ (ebx.split.coherency_line_size + 1) *
+ (ebx.split.physical_line_partition + 1) *
+ (ebx.split.ways_of_associativity + 1);
return 0;
}
@@ -174,7 +261,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
#endif
@@ -296,14 +383,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
if (new_l2) {
l2 = new_l2;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
cpu_llc_id[cpu] = l2_id;
#endif
}
if (new_l3) {
l3 = new_l3;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_X86_HT
cpu_llc_id[cpu] = l3_id;
#endif
}
@@ -642,7 +729,7 @@ static void __cpuexit cache_remove_dev(struct sys_device * sys_dev)
return;
}
-static int cacheinfo_cpu_callback(struct notifier_block *nfb,
+static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
@@ -660,7 +747,7 @@ static int cacheinfo_cpu_callback(struct notifier_block *nfb,
return NOTIFY_OK;
}
-static struct notifier_block cacheinfo_cpu_notifier =
+static struct notifier_block __cpuinitdata cacheinfo_cpu_notifier =
{
.notifier_call = cacheinfo_cpu_callback,
};
@@ -672,7 +759,7 @@ static int __cpuinit cache_sysfs_init(void)
if (num_cache_leaves == 0)
return 0;
- register_cpu_notifier(&cacheinfo_cpu_notifier);
+ register_hotcpu_notifier(&cacheinfo_cpu_notifier);
for_each_online_cpu(i) {
cacheinfo_cpu_callback(&cacheinfo_cpu_notifier, CPU_ONLINE,
diff --git a/arch/i386/kernel/cpu/mcheck/Makefile b/arch/i386/kernel/cpu/mcheck/Makefile
index 30808f3d6715..f1ebe1c1c17a 100644
--- a/arch/i386/kernel/cpu/mcheck/Makefile
+++ b/arch/i386/kernel/cpu/mcheck/Makefile
@@ -1,2 +1,2 @@
-obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o
+obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c
index fc5d5215e23d..b0862af595aa 100644
--- a/arch/i386/kernel/cpu/mcheck/k7.c
+++ b/arch/i386/kernel/cpu/mcheck/k7.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/config.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index afa0888f9a1e..d555bec0db99 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -6,7 +6,6 @@
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/thread_info.h>
diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h
index dc2416dfef15..84fd4cf7d0fb 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.h
+++ b/arch/i386/kernel/cpu/mcheck/mce.h
@@ -9,6 +9,6 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c);
/* Call the installed machine check handler for this CPU setup. */
extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
-extern int mce_disabled __initdata;
+extern int mce_disabled;
extern int nr_mce_banks;
diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c
index 82dffe0d4954..1f9153ae5b03 100644
--- a/arch/i386/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c
@@ -11,7 +11,6 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
-#include <linux/config.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index fd2c459a31ef..504434a46011 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -5,7 +5,6 @@
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
-#include <linux/config.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
@@ -14,6 +13,8 @@
#include <asm/msr.h>
#include <asm/apic.h>
+#include <asm/therm_throt.h>
+
#include "mce.h"
/* as supported by the P4/Xeon family */
@@ -45,25 +46,12 @@ static void unexpected_thermal_interrupt(struct pt_regs *regs)
/* P4/Xeon Thermal transition interrupt handler */
static void intel_thermal_interrupt(struct pt_regs *regs)
{
- u32 l, h;
- unsigned int cpu = smp_processor_id();
- static unsigned long next[NR_CPUS];
+ __u64 msr_val;
ack_APIC_irq();
- if (time_after(next[cpu], jiffies))
- return;
-
- next[cpu] = jiffies + HZ*5;
- rdmsr(MSR_IA32_THERM_STATUS, l, h);
- if (l & 0x1) {
- printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu);
- printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n",
- cpu);
- add_taint(TAINT_MACHINE_CHECK);
- } else {
- printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu);
- }
+ rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+ therm_throt_process(msr_val & 0x1);
}
/* Thermal interrupt handler for this CPU setup */
@@ -123,10 +111,13 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
rdmsr (MSR_IA32_MISC_ENABLE, l, h);
wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
-
+
l = apic_read (APIC_LVTTHMR);
apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
+
+ /* enable thermal throttle processing */
+ atomic_set(&therm_throt_en, 1);
return;
}
#endif /* CONFIG_X86_MCE_P4THERMAL */
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
new file mode 100644
index 000000000000..4f43047de406
--- /dev/null
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -0,0 +1,180 @@
+/*
+ * linux/arch/i386/kerne/cpu/mcheck/therm_throt.c
+ *
+ * Thermal throttle event support code (such as syslog messaging and rate
+ * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
+ * This allows consistent reporting of CPU thermal throttle events.
+ *
+ * Maintains a counter in /sys that keeps track of the number of thermal
+ * events, such that the user knows how bad the thermal problem might be
+ * (since the logging to syslog and mcelog is rate limited).
+ *
+ * Author: Dmitriy Zavin (dmitriyz@google.com)
+ *
+ * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
+ * Inspired by Ross Biro's and Al Borchers' counter code.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
+#include <asm/cpu.h>
+#include <linux/notifier.h>
+#include <asm/therm_throt.h>
+
+/* How long to wait between reporting thermal events */
+#define CHECK_INTERVAL (300 * HZ)
+
+static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, thermal_throttle_count);
+atomic_t therm_throt_en = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+#define define_therm_throt_sysdev_one_ro(_name) \
+ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+
+#define define_therm_throt_sysdev_show_func(name) \
+static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
+ char *buf) \
+{ \
+ unsigned int cpu = dev->id; \
+ ssize_t ret; \
+ \
+ preempt_disable(); /* CPU hotplug */ \
+ if (cpu_online(cpu)) \
+ ret = sprintf(buf, "%lu\n", \
+ per_cpu(thermal_throttle_##name, cpu)); \
+ else \
+ ret = 0; \
+ preempt_enable(); \
+ \
+ return ret; \
+}
+
+define_therm_throt_sysdev_show_func(count);
+define_therm_throt_sysdev_one_ro(count);
+
+static struct attribute *thermal_throttle_attrs[] = {
+ &attr_count.attr,
+ NULL
+};
+
+static struct attribute_group thermal_throttle_attr_group = {
+ .attrs = thermal_throttle_attrs,
+ .name = "thermal_throttle"
+};
+#endif /* CONFIG_SYSFS */
+
+/***
+ * therm_throt_process - Process thermal throttling event from interrupt
+ * @curr: Whether the condition is current or not (boolean), since the
+ * thermal interrupt normally gets called both when the thermal
+ * event begins and once the event has ended.
+ *
+ * This function is called by the thermal interrupt after the
+ * IRQ has been acknowledged.
+ *
+ * It will take care of rate limiting and printing messages to the syslog.
+ *
+ * Returns: 0 : Event should NOT be further logged, i.e. still in
+ * "timeout" from previous log message.
+ * 1 : Event should be logged further, and a message has been
+ * printed to the syslog.
+ */
+int therm_throt_process(int curr)
+{
+ unsigned int cpu = smp_processor_id();
+ __u64 tmp_jiffs = get_jiffies_64();
+
+ if (curr)
+ __get_cpu_var(thermal_throttle_count)++;
+
+ if (time_before64(tmp_jiffs, __get_cpu_var(next_check)))
+ return 0;
+
+ __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL;
+
+ /* if we just entered the thermal event */
+ if (curr) {
+ printk(KERN_CRIT "CPU%d: Temperature above threshold, "
+ "cpu clock throttled (total events = %lu)\n", cpu,
+ __get_cpu_var(thermal_throttle_count));
+
+ add_taint(TAINT_MACHINE_CHECK);
+ } else {
+ printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu);
+ }
+
+ return 1;
+}
+
+#ifdef CONFIG_SYSFS
+/* Add/Remove thermal_throttle interface for CPU device */
+static __cpuinit int thermal_throttle_add_dev(struct sys_device * sys_dev)
+{
+ sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static __cpuinit int thermal_throttle_remove_dev(struct sys_device * sys_dev)
+{
+ sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ return 0;
+}
+
+/* Mutex protecting device creation against CPU hotplug */
+static DEFINE_MUTEX(therm_cpu_lock);
+
+/* Get notified when a cpu comes on/off. Be hotplug friendly. */
+static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct sys_device *sys_dev;
+
+ sys_dev = get_cpu_sysdev(cpu);
+ mutex_lock(&therm_cpu_lock);
+ switch (action) {
+ case CPU_ONLINE:
+ thermal_throttle_add_dev(sys_dev);
+ break;
+ case CPU_DEAD:
+ thermal_throttle_remove_dev(sys_dev);
+ break;
+ }
+ mutex_unlock(&therm_cpu_lock);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block thermal_throttle_cpu_notifier =
+{
+ .notifier_call = thermal_throttle_cpu_callback,
+};
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static __init int thermal_throttle_init_device(void)
+{
+ unsigned int cpu = 0;
+
+ if (!atomic_read(&therm_throt_en))
+ return 0;
+
+ register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
+
+#ifdef CONFIG_HOTPLUG_CPU
+ mutex_lock(&therm_cpu_lock);
+#endif
+ /* connect live CPUs to sysfs */
+ for_each_online_cpu(cpu)
+ thermal_throttle_add_dev(get_cpu_sysdev(cpu));
+#ifdef CONFIG_HOTPLUG_CPU
+ mutex_unlock(&therm_cpu_lock);
+#endif
+
+ return 0;
+}
+
+device_initcall(thermal_throttle_init_device);
+#endif /* CONFIG_SYSFS */
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 169ac8e0db68..0b61eed8bbd8 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -243,7 +243,7 @@ static DEFINE_SPINLOCK(set_atomicity_lock);
* has been called.
*/
-static void prepare_set(void)
+static void prepare_set(void) __acquires(set_atomicity_lock)
{
unsigned long cr0;
@@ -274,7 +274,7 @@ static void prepare_set(void)
mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi);
}
-static void post_set(void)
+static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
__flush_tlb();
diff --git a/arch/i386/kernel/cpu/nexgen.c b/arch/i386/kernel/cpu/nexgen.c
index ad87fa58058d..8bf23cc80c63 100644
--- a/arch/i386/kernel/cpu/nexgen.c
+++ b/arch/i386/kernel/cpu/nexgen.c
@@ -10,7 +10,7 @@
* to have CPUID. (Thanks to Herbert Oppmann)
*/
-static int __init deep_magic_nexgen_probe(void)
+static int __cpuinit deep_magic_nexgen_probe(void)
{
int ret;
@@ -27,21 +27,20 @@ static int __init deep_magic_nexgen_probe(void)
return ret;
}
-static void __init init_nexgen(struct cpuinfo_x86 * c)
+static void __cpuinit init_nexgen(struct cpuinfo_x86 * c)
{
c->x86_cache_size = 256; /* A few had 1 MB... */
}
-static void __init nexgen_identify(struct cpuinfo_x86 * c)
+static void __cpuinit nexgen_identify(struct cpuinfo_x86 * c)
{
/* Detect NexGen with old hypercode */
if ( deep_magic_nexgen_probe() ) {
strcpy(c->x86_vendor_id, "NexGenDriven");
}
- generic_identify(c);
}
-static struct cpu_dev nexgen_cpu_dev __initdata = {
+static struct cpu_dev nexgen_cpu_dev __cpuinitdata = {
.c_vendor = "Nexgen",
.c_ident = { "NexGenDriven" },
.c_models = {
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index a19fcb262dbb..76aac088a323 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -18,7 +18,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
* applications want to get the raw CPUID data, they should access
* /dev/cpu/<cpu_nr>/cpuid instead.
*/
- static char *x86_cap_flags[] = {
+ static const char * const x86_cap_flags[] = {
/* Intel-defined */
"fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
"cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
@@ -46,8 +46,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
/* Intel-defined (#2) */
"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
- "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL,
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+ "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
+ NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
/* VIA/Cyrix/Centaur-defined */
@@ -62,7 +62,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
};
- static char *x86_power_flags[] = {
+ static const char * const x86_power_flags[] = {
"ts", /* temperature sensor */
"fid", /* frequency id control */
"vid", /* voltage id control */
@@ -109,9 +109,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
#ifdef CONFIG_X86_HT
if (c->x86_max_cores * smp_num_siblings > 1) {
- seq_printf(m, "physical id\t: %d\n", phys_proc_id[n]);
+ seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
seq_printf(m, "siblings\t: %d\n", cpus_weight(cpu_core_map[n]));
- seq_printf(m, "core id\t\t: %d\n", cpu_core_id[n]);
+ seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
}
#endif
diff --git a/arch/i386/kernel/cpu/rise.c b/arch/i386/kernel/cpu/rise.c
index d08d5a2811c8..9317f7414989 100644
--- a/arch/i386/kernel/cpu/rise.c
+++ b/arch/i386/kernel/cpu/rise.c
@@ -5,7 +5,7 @@
#include "cpu.h"
-static void __init init_rise(struct cpuinfo_x86 *c)
+static void __cpuinit init_rise(struct cpuinfo_x86 *c)
{
printk("CPU: Rise iDragon");
if (c->x86_model > 2)
@@ -28,7 +28,7 @@ static void __init init_rise(struct cpuinfo_x86 *c)
set_bit(X86_FEATURE_CX8, c->x86_capability);
}
-static struct cpu_dev rise_cpu_dev __initdata = {
+static struct cpu_dev rise_cpu_dev __cpuinitdata = {
.c_vendor = "Rise",
.c_ident = { "RiseRiseRise" },
.c_models = {
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 7214c9b577ab..4056fb7d2cdf 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -5,7 +5,7 @@
#include <asm/msr.h>
#include "cpu.h"
-static void __init init_transmeta(struct cpuinfo_x86 *c)
+static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
{
unsigned int cap_mask, uk, max, dummy;
unsigned int cms_rev1, cms_rev2;
@@ -85,10 +85,9 @@ static void __init init_transmeta(struct cpuinfo_x86 *c)
#endif
}
-static void __init transmeta_identify(struct cpuinfo_x86 * c)
+static void __cpuinit transmeta_identify(struct cpuinfo_x86 * c)
{
u32 xlvl;
- generic_identify(c);
/* Transmeta-defined flags: level 0x80860001 */
xlvl = cpuid_eax(0x80860000);
@@ -98,7 +97,7 @@ static void __init transmeta_identify(struct cpuinfo_x86 * c)
}
}
-static struct cpu_dev transmeta_cpu_dev __initdata = {
+static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
.c_vendor = "Transmeta",
.c_ident = { "GenuineTMx86", "TransmetaCPU" },
.c_init = init_transmeta,
diff --git a/arch/i386/kernel/cpu/umc.c b/arch/i386/kernel/cpu/umc.c
index 2cd988f6dc55..1bf3f87e9c5b 100644
--- a/arch/i386/kernel/cpu/umc.c
+++ b/arch/i386/kernel/cpu/umc.c
@@ -5,12 +5,8 @@
/* UMC chips appear to be only either 386 or 486, so no special init takes place.
*/
-static void __init init_umc(struct cpuinfo_x86 * c)
-{
-
-}
-static struct cpu_dev umc_cpu_dev __initdata = {
+static struct cpu_dev umc_cpu_dev __cpuinitdata = {
.c_vendor = "UMC",
.c_ident = { "UMC UMC UMC" },
.c_models = {
@@ -21,7 +17,6 @@ static struct cpu_dev umc_cpu_dev __initdata = {
}
},
},
- .c_init = init_umc,
};
int __init umc_init_cpu(void)
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 1d9a4abcdfc7..fde8bea85cee 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -24,7 +24,6 @@
*/
#include <linux/module.h>
-#include <linux/config.h>
#include <linux/types.h>
#include <linux/errno.h>
@@ -168,6 +167,7 @@ static int cpuid_class_device_create(int i)
return err;
}
+#ifdef CONFIG_HOTPLUG_CPU
static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
@@ -183,10 +183,11 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long ac
return NOTIFY_OK;
}
-static struct notifier_block cpuid_class_cpu_notifier =
+static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
{
.notifier_call = cpuid_class_cpu_callback,
};
+#endif /* !CONFIG_HOTPLUG_CPU */
static int __init cpuid_init(void)
{
@@ -209,7 +210,7 @@ static int __init cpuid_init(void)
if (err != 0)
goto out_class;
}
- register_cpu_notifier(&cpuid_class_cpu_notifier);
+ register_hotcpu_notifier(&cpuid_class_cpu_notifier);
err = 0;
goto out;
@@ -234,7 +235,7 @@ static void __exit cpuid_exit(void)
class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
class_destroy(cpuid_class);
unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
- unregister_cpu_notifier(&cpuid_class_cpu_notifier);
+ unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
}
module_init(cpuid_init);
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 2b0cfce24a61..67d297dc1003 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -22,6 +22,8 @@
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
+#include <asm/kdebug.h>
+
#include <mach_ipi.h>
@@ -90,19 +92,28 @@ static void crash_save_self(struct pt_regs *regs)
crash_save_this_cpu(regs, cpu);
}
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static atomic_t waiting_for_crash_ipi;
-static int crash_nmi_callback(struct pt_regs *regs, int cpu)
+static int crash_nmi_callback(struct notifier_block *self,
+ unsigned long val, void *data)
{
+ struct pt_regs *regs;
struct pt_regs fixed_regs;
+ int cpu;
+
+ if (val != DIE_NMI_IPI)
+ return NOTIFY_OK;
+
+ regs = ((struct die_args *)data)->regs;
+ cpu = raw_smp_processor_id();
/* Don't do anything if this handler is invoked on crashing cpu.
* Otherwise, system will completely hang. Crashing cpu can get
* an NMI if system was initially booted with nmi_watchdog parameter.
*/
if (cpu == crashing_cpu)
- return 1;
+ return NOTIFY_STOP;
local_irq_disable();
if (!user_mode_vm(regs)) {
@@ -114,28 +125,29 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
halt();
- for(;;);
+ for (;;)
+ cpu_relax();
return 1;
}
-/*
- * By using the NMI code instead of a vector we just sneak thru the
- * word generator coming out with just what we want. AND it does
- * not matter if clustered_apic_mode is set or not.
- */
static void smp_send_nmi_allbutself(void)
{
- send_IPI_allbutself(APIC_DM_NMI);
+ send_IPI_allbutself(NMI_VECTOR);
}
+static struct notifier_block crash_nmi_nb = {
+ .notifier_call = crash_nmi_callback,
+};
+
static void nmi_shootdown_cpus(void)
{
unsigned long msecs;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
/* Would it be better to replace the trap vector here? */
- set_nmi_callback(crash_nmi_callback);
+ if (register_die_notifier(&crash_nmi_nb))
+ return; /* return what? */
/* Ensure the new callback function is set before sending
* out the NMI
*/
@@ -162,7 +174,7 @@ static void nmi_shootdown_cpus(void)
void machine_crash_shutdown(struct pt_regs *regs)
{
/* This function is only called after the system
- * has paniced or is otherwise in a critical state.
+ * has panicked or is otherwise in a critical state.
* The minimum amount of code to allow a kexec'd kernel
* to run successfully needs to happen here.
*
diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c
index 5edb1d379add..b4d14c2eb345 100644
--- a/arch/i386/kernel/doublefault.c
+++ b/arch/i386/kernel/doublefault.c
@@ -44,7 +44,8 @@ static void doublefault_fn(void)
}
}
- for (;;) /* nothing */;
+ for (;;)
+ cpu_relax();
}
struct tss_struct doublefault_tss __cacheline_aligned = {
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 9202b67c4b2e..f9436989473c 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -19,7 +19,6 @@
* Skip non-WB memory and ignore empty memory ranges.
*/
-#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -66,7 +65,7 @@ static unsigned long efi_rt_eflags;
static DEFINE_SPINLOCK(efi_rt_lock);
static pgd_t efi_bak_pg_dir_pointer[2];
-static void efi_call_phys_prelog(void)
+static void efi_call_phys_prelog(void) __acquires(efi_rt_lock)
{
unsigned long cr4;
unsigned long temp;
@@ -110,7 +109,7 @@ static void efi_call_phys_prelog(void)
load_gdt(cpu_gdt_descr);
}
-static void efi_call_phys_epilog(void)
+static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
{
unsigned long cr4;
struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0);
@@ -601,8 +600,10 @@ efi_initialize_iomem_resources(struct resource *code_resource,
res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1);
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
if (request_resource(&iomem_resource, res) < 0)
- printk(KERN_ERR PFX "Failed to allocate res %s : 0x%lx-0x%lx\n",
- res->name, res->start, res->end);
+ printk(KERN_ERR PFX "Failed to allocate res %s : "
+ "0x%llx-0x%llx\n", res->name,
+ (unsigned long long)res->start,
+ (unsigned long long)res->end);
/*
* We don't know which region contains kernel data so we try
* it repeatedly and let the resource manager test it.
diff --git a/arch/i386/kernel/efi_stub.S b/arch/i386/kernel/efi_stub.S
index 08c0312d9b6c..ef00bb77d7e4 100644
--- a/arch/i386/kernel/efi_stub.S
+++ b/arch/i386/kernel/efi_stub.S
@@ -5,10 +5,8 @@
* turned off.
*/
-#include <linux/config.h>
#include <linux/linkage.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
/*
* efi_call_phys(void *, ...) is a function with variable parameters.
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index cfc683f153b9..5a63d6fdb70e 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -40,14 +40,15 @@
* "current" is in register %ebx during any slow entries.
*/
-#include <linux/config.h>
#include <linux/linkage.h>
#include <asm/thread_info.h>
+#include <asm/irqflags.h>
#include <asm/errno.h>
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/page.h>
#include <asm/desc.h>
+#include <asm/dwarf2.h>
#include "irq_vectors.h"
#define nr_syscalls ((syscall_table_size)/4)
@@ -75,41 +76,99 @@ DF_MASK = 0x00000400
NT_MASK = 0x00004000
VM_MASK = 0x00020000
+/* These are replaces for paravirtualization */
+#define DISABLE_INTERRUPTS cli
+#define ENABLE_INTERRUPTS sti
+#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+#define INTERRUPT_RETURN iret
+#define GET_CR0_INTO_EAX movl %cr0, %eax
+
#ifdef CONFIG_PREEMPT
-#define preempt_stop cli
+#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
#else
#define preempt_stop
#define resume_kernel restore_nocheck
#endif
+.macro TRACE_IRQS_IRET
+#ifdef CONFIG_TRACE_IRQFLAGS
+ testl $IF_MASK,EFLAGS(%esp) # interrupts off?
+ jz 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+#ifdef CONFIG_VM86
+#define resume_userspace_sig check_userspace
+#else
+#define resume_userspace_sig resume_userspace
+#endif
+
#define SAVE_ALL \
cld; \
pushl %es; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ /*CFI_REL_OFFSET es, 0;*/\
pushl %ds; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ /*CFI_REL_OFFSET ds, 0;*/\
pushl %eax; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET eax, 0;\
pushl %ebp; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET ebp, 0;\
pushl %edi; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET edi, 0;\
pushl %esi; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET esi, 0;\
pushl %edx; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET edx, 0;\
pushl %ecx; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET ecx, 0;\
pushl %ebx; \
+ CFI_ADJUST_CFA_OFFSET 4;\
+ CFI_REL_OFFSET ebx, 0;\
movl $(__USER_DS), %edx; \
movl %edx, %ds; \
movl %edx, %es;
#define RESTORE_INT_REGS \
popl %ebx; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE ebx;\
popl %ecx; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE ecx;\
popl %edx; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE edx;\
popl %esi; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE esi;\
popl %edi; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE edi;\
popl %ebp; \
- popl %eax
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE ebp;\
+ popl %eax; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ CFI_RESTORE eax
#define RESTORE_REGS \
RESTORE_INT_REGS; \
1: popl %ds; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ /*CFI_RESTORE ds;*/\
2: popl %es; \
+ CFI_ADJUST_CFA_OFFSET -4;\
+ /*CFI_RESTORE es;*/\
.section .fixup,"ax"; \
3: movl $0,(%esp); \
jmp 1b; \
@@ -122,13 +181,50 @@ VM_MASK = 0x00020000
.long 2b,4b; \
.previous
+#define RING0_INT_FRAME \
+ CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
+ CFI_DEF_CFA esp, 3*4;\
+ /*CFI_OFFSET cs, -2*4;*/\
+ CFI_OFFSET eip, -3*4
+
+#define RING0_EC_FRAME \
+ CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
+ CFI_DEF_CFA esp, 4*4;\
+ /*CFI_OFFSET cs, -2*4;*/\
+ CFI_OFFSET eip, -3*4
+
+#define RING0_PTREGS_FRAME \
+ CFI_STARTPROC simple;\
+ CFI_SIGNAL_FRAME;\
+ CFI_DEF_CFA esp, OLDESP-EBX;\
+ /*CFI_OFFSET cs, CS-OLDESP;*/\
+ CFI_OFFSET eip, EIP-OLDESP;\
+ /*CFI_OFFSET es, ES-OLDESP;*/\
+ /*CFI_OFFSET ds, DS-OLDESP;*/\
+ CFI_OFFSET eax, EAX-OLDESP;\
+ CFI_OFFSET ebp, EBP-OLDESP;\
+ CFI_OFFSET edi, EDI-OLDESP;\
+ CFI_OFFSET esi, ESI-OLDESP;\
+ CFI_OFFSET edx, EDX-OLDESP;\
+ CFI_OFFSET ecx, ECX-OLDESP;\
+ CFI_OFFSET ebx, EBX-OLDESP
ENTRY(ret_from_fork)
+ CFI_STARTPROC
pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
call schedule_tail
GET_THREAD_INFO(%ebp)
popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
+ pushl $0x0202 # Reset kernel eflags
+ CFI_ADJUST_CFA_OFFSET 4
+ popfl
+ CFI_ADJUST_CFA_OFFSET -4
jmp syscall_exit
+ CFI_ENDPROC
/*
* Return to user mode is not as complex as all this looks,
@@ -139,16 +235,19 @@ ENTRY(ret_from_fork)
# userspace resumption stub bypassing syscall exit tracing
ALIGN
+ RING0_PTREGS_FRAME
ret_from_exception:
preempt_stop
ret_from_intr:
GET_THREAD_INFO(%ebp)
+check_userspace:
movl EFLAGS(%esp), %eax # mix EFLAGS and CS
movb CS(%esp), %al
- testl $(VM_MASK | 3), %eax
- jz resume_kernel
+ andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
+ cmpl $USER_RPL, %eax
+ jb resume_kernel # not returning to v8086 or userspace
ENTRY(resume_userspace)
- cli # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -159,7 +258,7 @@ ENTRY(resume_userspace)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
- cli
+ DISABLE_INTERRUPTS
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
jnz restore_nocheck
need_resched:
@@ -171,20 +270,43 @@ need_resched:
call preempt_schedule_irq
jmp need_resched
#endif
+ CFI_ENDPROC
/* SYSENTER_RETURN points to after the "sysenter" instruction in
the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
# sysenter call handler stub
ENTRY(sysenter_entry)
+ CFI_STARTPROC simple
+ CFI_SIGNAL_FRAME
+ CFI_DEF_CFA esp, 0
+ CFI_REGISTER esp, ebp
movl TSS_sysenter_esp0(%esp),%esp
sysenter_past_esp:
- sti
+ /*
+ * No need to follow this irqs on/off section: the syscall
+ * disabled irqs and here we enable it straight after entry:
+ */
+ ENABLE_INTERRUPTS
pushl $(__USER_DS)
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ss, 0*/
pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esp, 0
pushfl
+ CFI_ADJUST_CFA_OFFSET 4
pushl $(__USER_CS)
- pushl $SYSENTER_RETURN
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET cs, 0*/
+ /*
+ * Push current_thread_info()->sysenter_return to the stack.
+ * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
+ * pushed above; +8 corresponds to copy_thread's esp0 setting.
+ */
+ pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eip, 0
/*
* Load the potential sixth argument from user stack.
@@ -199,6 +321,7 @@ sysenter_past_esp:
.previous
pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
@@ -209,7 +332,8 @@ sysenter_past_esp:
jae syscall_badsys
call *sys_call_table(,%eax,4)
movl %eax,EAX(%esp)
- cli
+ DISABLE_INTERRUPTS
+ TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx
jne syscall_exit_work
@@ -217,13 +341,16 @@ sysenter_past_esp:
movl EIP(%esp), %edx
movl OLDESP(%esp), %ecx
xorl %ebp,%ebp
- sti
- sysexit
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS_SYSEXIT
+ CFI_ENDPROC
# system call handler stub
ENTRY(system_call)
+ RING0_INT_FRAME # can't unwind into user space anyway
pushl %eax # save orig_eax
+ CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
testl $TF_MASK,EFLAGS(%esp)
@@ -240,9 +367,10 @@ syscall_call:
call *sys_call_table(,%eax,4)
movl %eax,EAX(%esp) # store the return value
syscall_exit:
- cli # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
+ TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx # current->work
jne syscall_exit_work
@@ -254,16 +382,21 @@ restore_all:
# See comments in process.c:copy_thread() for details.
movb OLDSS(%esp), %ah
movb CS(%esp), %al
- andl $(VM_MASK | (4 << 8) | 3), %eax
- cmpl $((4 << 8) | 3), %eax
+ andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
restore_nocheck:
+ TRACE_IRQS_IRET
+restore_nocheck_notrace:
RESTORE_REGS
addl $4, %esp
-1: iret
+ CFI_ADJUST_CFA_OFFSET -4
+1: INTERRUPT_RETURN
.section .fixup,"ax"
iret_exc:
- sti
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
@@ -273,6 +406,7 @@ iret_exc:
.long 1b,iret_exc
.previous
+ CFI_RESTORE_STATE
ldt_ss:
larl OLDSS(%esp), %eax
jnz restore_nocheck
@@ -285,29 +419,36 @@ ldt_ss:
* CPUs, which we can try to work around to make
* dosemu and wine happy. */
subl $8, %esp # reserve space for switch16 pointer
- cli
+ CFI_ADJUST_CFA_OFFSET 8
+ DISABLE_INTERRUPTS
+ TRACE_IRQS_OFF
movl %esp, %eax
/* Set up the 16bit stack frame with switch32 pointer on top,
* and a switch16 pointer on top of the current frame. */
call setup_x86_bogus_stack
+ CFI_ADJUST_CFA_OFFSET -8 # frame has moved
+ TRACE_IRQS_IRET
RESTORE_REGS
lss 20+4(%esp), %esp # switch to 16bit stack
-1: iret
+1: INTERRUPT_RETURN
.section __ex_table,"a"
.align 4
.long 1b,iret_exc
.previous
+ CFI_ENDPROC
# perform work that needs to be done immediately before resumption
ALIGN
+ RING0_PTREGS_FRAME # can't unwind into user space anyway
work_pending:
testb $_TIF_NEED_RESCHED, %cl
jz work_notifysig
work_resched:
call schedule
- cli # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
+ TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
# than syscall tracing?
@@ -323,18 +464,20 @@ work_notifysig: # deal with pending signals and
# vm86-space
xorl %edx, %edx
call do_notify_resume
- jmp resume_userspace
+ jmp resume_userspace_sig
ALIGN
work_notifysig_v86:
#ifdef CONFIG_VM86
pushl %ecx # save ti_flags for do_notify_resume
+ CFI_ADJUST_CFA_OFFSET 4
call save_v86_state # %eax contains pt_regs pointer
popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
movl %eax, %esp
xorl %edx, %edx
call do_notify_resume
- jmp resume_userspace
+ jmp resume_userspace_sig
#endif
# perform syscall exit tracing
@@ -357,25 +500,28 @@ syscall_trace_entry:
syscall_exit_work:
testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
jz work_pending
- sti # could let do_syscall_trace() call
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS # could let do_syscall_trace() call
# schedule() instead
movl %esp, %eax
movl $1, %edx
call do_syscall_trace
jmp resume_userspace
+ CFI_ENDPROC
- ALIGN
+ RING0_INT_FRAME # can't unwind into user space anyway
syscall_fault:
pushl %eax # save orig_eax
+ CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
movl $-EFAULT,EAX(%esp)
jmp resume_userspace
- ALIGN
syscall_badsys:
movl $-ENOSYS,EAX(%esp)
jmp resume_userspace
+ CFI_ENDPROC
#define FIXUP_ESPFIX_STACK \
movl %esp, %eax; \
@@ -387,16 +533,21 @@ syscall_badsys:
movl %eax, %esp;
#define UNWIND_ESPFIX_STACK \
pushl %eax; \
+ CFI_ADJUST_CFA_OFFSET 4; \
movl %ss, %eax; \
/* see if on 16bit stack */ \
cmpw $__ESPFIX_SS, %ax; \
- jne 28f; \
- movl $__KERNEL_DS, %edx; \
- movl %edx, %ds; \
- movl %edx, %es; \
+ je 28f; \
+27: popl %eax; \
+ CFI_ADJUST_CFA_OFFSET -4; \
+.section .fixup,"ax"; \
+28: movl $__KERNEL_DS, %eax; \
+ movl %eax, %ds; \
+ movl %eax, %es; \
/* switch to 32bit stack */ \
- FIXUP_ESPFIX_STACK \
-28: popl %eax;
+ FIXUP_ESPFIX_STACK; \
+ jmp 27b; \
+.previous
/*
* Build the entry stubs and pointer table with
@@ -408,9 +559,14 @@ ENTRY(interrupt)
vector=0
ENTRY(irq_entries_start)
+ RING0_INT_FRAME
.rept NR_IRQS
ALIGN
-1: pushl $vector-256
+ .if vector
+ CFI_ADJUST_CFA_OFFSET -4
+ .endif
+1: pushl $~(vector)
+ CFI_ADJUST_CFA_OFFSET 4
jmp common_interrupt
.data
.long 1b
@@ -418,68 +574,112 @@ ENTRY(irq_entries_start)
vector=vector+1
.endr
+/*
+ * the CPU automatically disables interrupts when executing an IRQ vector,
+ * so IRQ-flags tracing has to follow that:
+ */
ALIGN
common_interrupt:
SAVE_ALL
+ TRACE_IRQS_OFF
movl %esp,%eax
call do_IRQ
jmp ret_from_intr
+ CFI_ENDPROC
#define BUILD_INTERRUPT(name, nr) \
ENTRY(name) \
- pushl $nr-256; \
- SAVE_ALL \
+ RING0_INT_FRAME; \
+ pushl $~(nr); \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ SAVE_ALL; \
+ TRACE_IRQS_OFF \
movl %esp,%eax; \
call smp_/**/name; \
- jmp ret_from_intr;
+ jmp ret_from_intr; \
+ CFI_ENDPROC
/* The include is where all of the SMP etc. interrupts come from */
#include "entry_arch.h"
-ENTRY(divide_error)
- pushl $0 # no error code
- pushl $do_divide_error
+KPROBE_ENTRY(page_fault)
+ RING0_EC_FRAME
+ pushl $do_page_fault
+ CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
pushl %ds
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET ds, 0*/
pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET eax, 0
xorl %eax, %eax
pushl %ebp
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebp, 0
pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
decl %eax # eax = -1
pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
cld
pushl %es
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET es, 0*/
UNWIND_ESPFIX_STACK
popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_REGISTER es, ecx*/
movl ES(%esp), %edi # get the function address
movl ORIG_EAX(%esp), %edx # get the error code
movl %eax, ORIG_EAX(%esp)
movl %ecx, ES(%esp)
+ /*CFI_REL_OFFSET es, ES*/
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
movl %esp,%eax # pt_regs pointer
call *%edi
jmp ret_from_exception
+ CFI_ENDPROC
+KPROBE_END(page_fault)
ENTRY(coprocessor_error)
+ RING0_INT_FRAME
pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
pushl $do_coprocessor_error
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
ENTRY(simd_coprocessor_error)
+ RING0_INT_FRAME
pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
pushl $do_simd_coprocessor_error
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
ENTRY(device_not_available)
+ RING0_INT_FRAME
pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
- movl %cr0, %eax
+ GET_CR0_INTO_EAX
testl $0x4, %eax # EM (math emulation bit)
jne device_not_available_emulate
preempt_stop
@@ -487,9 +687,12 @@ ENTRY(device_not_available)
jmp ret_from_exception
device_not_available_emulate:
pushl $0 # temporary storage for ORIG_EIP
+ CFI_ADJUST_CFA_OFFSET 4
call math_emulate
addl $4, %esp
+ CFI_ADJUST_CFA_OFFSET -4
jmp ret_from_exception
+ CFI_ENDPROC
/*
* Debug traps and NMI can happen at the one SYSENTER instruction
@@ -509,22 +712,32 @@ device_not_available_emulate:
jne ok; \
label: \
movl TSS_sysenter_esp0+offset(%esp),%esp; \
+ CFI_DEF_CFA esp, 0; \
+ CFI_UNDEFINED eip; \
pushfl; \
+ CFI_ADJUST_CFA_OFFSET 4; \
pushl $__KERNEL_CS; \
- pushl $sysenter_past_esp
+ CFI_ADJUST_CFA_OFFSET 4; \
+ pushl $sysenter_past_esp; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ CFI_REL_OFFSET eip, 0
KPROBE_ENTRY(debug)
+ RING0_INT_FRAME
cmpl $sysenter_entry,(%esp)
jne debug_stack_correct
FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
debug_stack_correct:
pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
xorl %edx,%edx # error code 0
movl %esp,%eax # pt_regs pointer
call do_debug
jmp ret_from_exception
- .previous .text
+ CFI_ENDPROC
+KPROBE_END(debug)
+
/*
* NMI is doubly nasty. It can happen _while_ we're handling
* a debug fault, and the debug fault hasn't yet been able to
@@ -533,15 +746,19 @@ debug_stack_correct:
* check whether we got an NMI on the debug path where the debug
* fault happened on the sysenter path.
*/
-ENTRY(nmi)
+KPROBE_ENTRY(nmi)
+ RING0_INT_FRAME
pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
je nmi_16bit_stack
cmpl $sysenter_entry,(%esp)
je nmi_stack_fixup
pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
movl %esp,%eax
/* Do not access memory above the end of our stack page,
* it might not exist.
@@ -549,21 +766,28 @@ ENTRY(nmi)
andl $(THREAD_SIZE-1),%eax
cmpl $(THREAD_SIZE-20),%eax
popl %eax
+ CFI_ADJUST_CFA_OFFSET -4
jae nmi_stack_correct
cmpl $sysenter_entry,12(%esp)
je nmi_debug_stack_check
nmi_stack_correct:
+ /* We have a RING0_INT_FRAME here */
pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_nmi
- jmp restore_all
+ jmp restore_nocheck_notrace
+ CFI_ENDPROC
nmi_stack_fixup:
+ RING0_INT_FRAME
FIX_STACK(12,nmi_stack_correct, 1)
jmp nmi_stack_correct
+
nmi_debug_stack_check:
+ /* We have a RING0_INT_FRAME here */
cmpw $__KERNEL_CS,16(%esp)
jne nmi_stack_correct
cmpl $debug,(%esp)
@@ -574,94 +798,194 @@ nmi_debug_stack_check:
jmp nmi_stack_correct
nmi_16bit_stack:
- /* create the pointer to lss back */
+ /* We have a RING0_INT_FRAME here.
+ *
+ * create the pointer to lss back
+ */
pushl %ss
+ CFI_ADJUST_CFA_OFFSET 4
pushl %esp
+ CFI_ADJUST_CFA_OFFSET 4
movzwl %sp, %esp
addw $4, (%esp)
/* copy the iret frame of 12 bytes */
.rept 3
pushl 16(%esp)
+ CFI_ADJUST_CFA_OFFSET 4
.endr
pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
FIXUP_ESPFIX_STACK # %eax == %esp
+ CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
xorl %edx,%edx # zero error code
call do_nmi
RESTORE_REGS
lss 12+4(%esp), %esp # back to 16bit stack
-1: iret
+1: INTERRUPT_RETURN
+ CFI_ENDPROC
.section __ex_table,"a"
.align 4
.long 1b,iret_exc
.previous
+KPROBE_END(nmi)
KPROBE_ENTRY(int3)
+ RING0_INT_FRAME
pushl $-1 # mark this as an int
+ CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_int3
jmp ret_from_exception
- .previous .text
+ CFI_ENDPROC
+KPROBE_END(int3)
ENTRY(overflow)
+ RING0_INT_FRAME
pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
pushl $do_overflow
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
ENTRY(bounds)
+ RING0_INT_FRAME
pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
pushl $do_bounds
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
ENTRY(invalid_op)
+ RING0_INT_FRAME
pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
pushl $do_invalid_op
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
ENTRY(coprocessor_segment_overrun)
+ RING0_INT_FRAME
pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
pushl $do_coprocessor_segment_overrun
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
ENTRY(invalid_TSS)
+ RING0_EC_FRAME
pushl $do_invalid_TSS
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
ENTRY(segment_not_present)
+ RING0_EC_FRAME
pushl $do_segment_not_present
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
ENTRY(stack_segment)
+ RING0_EC_FRAME
pushl $do_stack_segment
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
KPROBE_ENTRY(general_protection)
+ RING0_EC_FRAME
pushl $do_general_protection
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
- .previous .text
+ CFI_ENDPROC
+KPROBE_END(general_protection)
ENTRY(alignment_check)
+ RING0_EC_FRAME
pushl $do_alignment_check
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
-KPROBE_ENTRY(page_fault)
- pushl $do_page_fault
+ENTRY(divide_error)
+ RING0_INT_FRAME
+ pushl $0 # no error code
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl $do_divide_error
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
- .previous .text
+ CFI_ENDPROC
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
+ RING0_INT_FRAME
pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
pushl machine_check_vector
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
#endif
ENTRY(spurious_interrupt_bug)
+ RING0_INT_FRAME
pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
pushl $do_spurious_interrupt_bug
+ CFI_ADJUST_CFA_OFFSET 4
jmp error_code
+ CFI_ENDPROC
+
+#ifdef CONFIG_STACK_UNWIND
+ENTRY(arch_unwind_init_running)
+ CFI_STARTPROC
+ movl 4(%esp), %edx
+ movl (%esp), %ecx
+ leal 4(%esp), %eax
+ movl %ebx, EBX(%edx)
+ xorl %ebx, %ebx
+ movl %ebx, ECX(%edx)
+ movl %ebx, EDX(%edx)
+ movl %esi, ESI(%edx)
+ movl %edi, EDI(%edx)
+ movl %ebp, EBP(%edx)
+ movl %ebx, EAX(%edx)
+ movl $__USER_DS, DS(%edx)
+ movl $__USER_DS, ES(%edx)
+ movl %ebx, ORIG_EAX(%edx)
+ movl %ecx, EIP(%edx)
+ movl 12(%esp), %ecx
+ movl $__KERNEL_CS, CS(%edx)
+ movl %ebx, EFLAGS(%edx)
+ movl %eax, OLDESP(%edx)
+ movl 8(%esp), %eax
+ movl %ecx, 8(%esp)
+ movl EBX(%edx), %ebx
+ movl $__KERNEL_DS, OLDSS(%edx)
+ jmpl *%eax
+ CFI_ENDPROC
+ENDPROC(arch_unwind_init_running)
+#endif
+
+ENTRY(kernel_thread_helper)
+ pushl $0 # fake return address for unwinder
+ CFI_STARTPROC
+ movl %edx,%eax
+ push %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ call *%ebx
+ push %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ call do_exit
+ CFI_ENDPROC
+ENDPROC(kernel_thread_helper)
.section .rodata,"a"
#include "syscall_table.S"
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 3debc2e26542..be9d883c62ce 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -8,7 +8,6 @@
*/
.text
-#include <linux/config.h>
#include <linux/threads.h>
#include <linux/linkage.h>
#include <asm/segment.h>
@@ -318,20 +317,14 @@ is386: movl $2,%ecx # set MP
movl %eax,%gs
lldt %ax
cld # gcc2 wants the direction flag cleared at all times
+ pushl %eax # fake return address
#ifdef CONFIG_SMP
movb ready, %cl
movb $1, ready
- cmpb $0,%cl
- je 1f # the first CPU calls start_kernel
- # all other CPUs call initialize_secondary
- call initialize_secondary
- jmp L6
-1:
+ cmpb $0,%cl # the first CPU calls start_kernel
+ jne initialize_secondary # all other CPUs call initialize_secondary
#endif /* CONFIG_SMP */
- call start_kernel
-L6:
- jmp L6 # main should never return here, but
- # just in case, we know what happens.
+ jmp start_kernel
/*
* We depend on ET to be correct. This checks for 287/387.
@@ -378,8 +371,65 @@ rp_sidt:
addl $8,%edi
dec %ecx
jne rp_sidt
+
+.macro set_early_handler handler,trapno
+ lea \handler,%edx
+ movl $(__KERNEL_CS << 16),%eax
+ movw %dx,%ax
+ movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
+ lea idt_table,%edi
+ movl %eax,8*\trapno(%edi)
+ movl %edx,8*\trapno+4(%edi)
+.endm
+
+ set_early_handler handler=early_divide_err,trapno=0
+ set_early_handler handler=early_illegal_opcode,trapno=6
+ set_early_handler handler=early_protection_fault,trapno=13
+ set_early_handler handler=early_page_fault,trapno=14
+
ret
+early_divide_err:
+ xor %edx,%edx
+ pushl $0 /* fake errcode */
+ jmp early_fault
+
+early_illegal_opcode:
+ movl $6,%edx
+ pushl $0 /* fake errcode */
+ jmp early_fault
+
+early_protection_fault:
+ movl $13,%edx
+ jmp early_fault
+
+early_page_fault:
+ movl $14,%edx
+ jmp early_fault
+
+early_fault:
+ cld
+#ifdef CONFIG_PRINTK
+ movl $(__KERNEL_DS),%eax
+ movl %eax,%ds
+ movl %eax,%es
+ cmpl $2,early_recursion_flag
+ je hlt_loop
+ incl early_recursion_flag
+ movl %cr2,%eax
+ pushl %eax
+ pushl %edx /* trapno */
+ pushl $fault_msg
+#ifdef CONFIG_EARLY_PRINTK
+ call early_printk
+#else
+ call printk
+#endif
+#endif
+hlt_loop:
+ hlt
+ jmp hlt_loop
+
/* This is the default interrupt "handler" :-) */
ALIGN
ignore_int:
@@ -393,6 +443,9 @@ ignore_int:
movl $(__KERNEL_DS),%eax
movl %eax,%ds
movl %eax,%es
+ cmpl $2,early_recursion_flag
+ je hlt_loop
+ incl early_recursion_flag
pushl 16(%esp)
pushl 24(%esp)
pushl 32(%esp)
@@ -438,9 +491,16 @@ ENTRY(stack_start)
ready: .byte 0
+early_recursion_flag:
+ .long 0
+
int_msg:
.asciz "Unknown interrupt or fault at EIP %p %p %p\n"
+fault_msg:
+ .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n"
+ .asciz "Stack: %p %p %p %p %p %p %p %p\n"
+
/*
* The IDT and GDT 'descriptors' are a strange 48-bit object
* only used by the lidt and lgdt instructions. They are not
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
new file mode 100644
index 000000000000..17647a530b2f
--- /dev/null
+++ b/arch/i386/kernel/hpet.c
@@ -0,0 +1,67 @@
+#include <linux/clocksource.h>
+#include <linux/errno.h>
+#include <linux/hpet.h>
+#include <linux/init.h>
+
+#include <asm/hpet.h>
+#include <asm/io.h>
+
+#define HPET_MASK CLOCKSOURCE_MASK(32)
+#define HPET_SHIFT 22
+
+/* FSEC = 10^-15 NSEC = 10^-9 */
+#define FSEC_PER_NSEC 1000000
+
+static void *hpet_ptr;
+
+static cycle_t read_hpet(void)
+{
+ return (cycle_t)readl(hpet_ptr);
+}
+
+static struct clocksource clocksource_hpet = {
+ .name = "hpet",
+ .rating = 250,
+ .read = read_hpet,
+ .mask = HPET_MASK,
+ .mult = 0, /* set below */
+ .shift = HPET_SHIFT,
+ .is_continuous = 1,
+};
+
+static int __init init_hpet_clocksource(void)
+{
+ unsigned long hpet_period;
+ void __iomem* hpet_base;
+ u64 tmp;
+
+ if (!is_hpet_enabled())
+ return -ENODEV;
+
+ /* calculate the hpet address: */
+ hpet_base =
+ (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+ hpet_ptr = hpet_base + HPET_COUNTER;
+
+ /* calculate the frequency: */
+ hpet_period = readl(hpet_base + HPET_PERIOD);
+
+ /*
+ * hpet period is in femto seconds per cycle
+ * so we need to convert this to ns/cyc units
+ * aproximated by mult/2^shift
+ *
+ * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
+ * fsec/cyc * 1ns/1000000fsec * 2^shift = mult
+ * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
+ * (fsec/cyc << shift)/1000000 = mult
+ * (hpet_period << shift)/FSEC_PER_NSEC = mult
+ */
+ tmp = (u64)hpet_period << HPET_SHIFT;
+ do_div(tmp, FSEC_PER_NSEC);
+ clocksource_hpet.mult = (u32)tmp;
+
+ return clocksource_register(&clocksource_hpet);
+}
+
+module_init(init_hpet_clocksource);
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index 036a9857936f..e3d4b73bfdb0 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -1,4 +1,3 @@
-#include <linux/config.h>
#include <linux/module.h>
#include <asm/checksum.h>
#include <asm/desc.h>
diff --git a/arch/i386/kernel/i387.c b/arch/i386/kernel/i387.c
index c4351972d9af..665847281ed2 100644
--- a/arch/i386/kernel/i387.c
+++ b/arch/i386/kernel/i387.c
@@ -8,7 +8,6 @@
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
-#include <linux/config.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <asm/processor.h>
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
new file mode 100644
index 000000000000..477b24daff53
--- /dev/null
+++ b/arch/i386/kernel/i8253.c
@@ -0,0 +1,118 @@
+/*
+ * i8253.c 8253/PIT functions
+ *
+ */
+#include <linux/clocksource.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <asm/smp.h>
+#include <asm/delay.h>
+#include <asm/i8253.h>
+#include <asm/io.h>
+
+#include "io_ports.h"
+
+DEFINE_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
+
+void setup_pit_timer(void)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+ outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
+ udelay(10);
+ outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
+ udelay(10);
+ outb(LATCH >> 8 , PIT_CH0); /* MSB */
+ spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+/*
+ * Since the PIT overflows every tick, its not very useful
+ * to just read by itself. So use jiffies to emulate a free
+ * running counter:
+ */
+static cycle_t pit_read(void)
+{
+ unsigned long flags;
+ int count;
+ u32 jifs;
+ static int old_count;
+ static u32 old_jifs;
+
+ spin_lock_irqsave(&i8253_lock, flags);
+ /*
+ * Although our caller may have the read side of xtime_lock,
+ * this is now a seqlock, and we are cheating in this routine
+ * by having side effects on state that we cannot undo if
+ * there is a collision on the seqlock and our caller has to
+ * retry. (Namely, old_jifs and old_count.) So we must treat
+ * jiffies as volatile despite the lock. We read jiffies
+ * before latching the timer count to guarantee that although
+ * the jiffies value might be older than the count (that is,
+ * the counter may underflow between the last point where
+ * jiffies was incremented and the point where we latch the
+ * count), it cannot be newer.
+ */
+ jifs = jiffies;
+ outb_p(0x00, PIT_MODE); /* latch the count ASAP */
+ count = inb_p(PIT_CH0); /* read the latched count */
+ count |= inb_p(PIT_CH0) << 8;
+
+ /* VIA686a test code... reset the latch if count > max + 1 */
+ if (count > LATCH) {
+ outb_p(0x34, PIT_MODE);
+ outb_p(LATCH & 0xff, PIT_CH0);
+ outb(LATCH >> 8, PIT_CH0);
+ count = LATCH - 1;
+ }
+
+ /*
+ * It's possible for count to appear to go the wrong way for a
+ * couple of reasons:
+ *
+ * 1. The timer counter underflows, but we haven't handled the
+ * resulting interrupt and incremented jiffies yet.
+ * 2. Hardware problem with the timer, not giving us continuous time,
+ * the counter does small "jumps" upwards on some Pentium systems,
+ * (see c't 95/10 page 335 for Neptun bug.)
+ *
+ * Previous attempts to handle these cases intelligently were
+ * buggy, so we just do the simple thing now.
+ */
+ if (count > old_count && jifs == old_jifs) {
+ count = old_count;
+ }
+ old_count = count;
+ old_jifs = jifs;
+
+ spin_unlock_irqrestore(&i8253_lock, flags);
+
+ count = (LATCH - 1) - count;
+
+ return (cycle_t)(jifs * LATCH) + count;
+}
+
+static struct clocksource clocksource_pit = {
+ .name = "pit",
+ .rating = 110,
+ .read = pit_read,
+ .mask = CLOCKSOURCE_MASK(32),
+ .mult = 0,
+ .shift = 20,
+};
+
+static int __init init_pit_clocksource(void)
+{
+ if (num_possible_cpus() > 4) /* PIT does not scale! */
+ return 0;
+
+ clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
+ return clocksource_register(&clocksource_pit);
+}
+module_init(init_pit_clocksource);
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index b7636b96e104..ea5f4e7958d8 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -1,4 +1,3 @@
-#include <linux/config.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
@@ -46,6 +45,8 @@ static void end_8259A_irq (unsigned int irq)
#define shutdown_8259A_irq disable_8259A_irq
+static int i8259A_auto_eoi;
+
static void mask_and_ack_8259A(unsigned int);
unsigned int startup_8259A_irq(unsigned int irq)
@@ -132,7 +133,7 @@ void make_8259A_irq(unsigned int irq)
{
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
- irq_desc[irq].handler = &i8259A_irq_type;
+ irq_desc[irq].chip = &i8259A_irq_type;
enable_irq(irq);
}
@@ -175,7 +176,7 @@ static void mask_and_ack_8259A(unsigned int irq)
* Lightweight spurious IRQ detection. We do not want
* to overdo spurious IRQ handling - it's usually a sign
* of hardware problems, so we only do the checks we can
- * do without slowing down good hardware unnecesserily.
+ * do without slowing down good hardware unnecessarily.
*
* Note that IRQ7 and IRQ15 (the two spurious IRQs
* usually resulting from the 8259A-1|2 PICs) occur
@@ -254,7 +255,7 @@ static void save_ELCR(char *trigger)
static int i8259A_resume(struct sys_device *dev)
{
- init_8259A(0);
+ init_8259A(i8259A_auto_eoi);
restore_ELCR(irq_trigger);
return 0;
}
@@ -302,6 +303,8 @@ void init_8259A(int auto_eoi)
{
unsigned long flags;
+ i8259A_auto_eoi = auto_eoi;
+
spin_lock_irqsave(&i8259A_lock, flags);
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
@@ -386,12 +389,12 @@ void __init init_ISA_irqs (void)
/*
* 16 old-style INTA-cycle interrupts:
*/
- irq_desc[i].handler = &i8259A_irq_type;
+ irq_desc[i].chip = &i8259A_irq_type;
} else {
/*
* 'high' PCI IRQs filled in on demand
*/
- irq_desc[i].handler = &no_irq_type;
+ irq_desc[i].chip = &no_irq_type;
}
}
}
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index a62df3e764c5..fd0df75cfbda 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -25,7 +25,6 @@
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>
-#include <linux/config.h>
#include <linux/smp_lock.h>
#include <linux/mc146818rtc.h>
#include <linux/compiler.h>
@@ -38,8 +37,10 @@
#include <asm/desc.h>
#include <asm/timer.h>
#include <asm/i8259.h>
+#include <asm/nmi.h>
#include <mach_apic.h>
+#include <mach_apicdef.h>
#include "io_ports.h"
@@ -50,6 +51,7 @@ atomic_t irq_mis_count;
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
static DEFINE_SPINLOCK(ioapic_lock);
+static DEFINE_SPINLOCK(vector_lock);
int timer_over_8254 __initdata = 1;
@@ -64,7 +66,7 @@ int sis_apic_bug = -1;
*/
int nr_ioapic_registers[MAX_IO_APICS];
-int disable_timer_pin_1 __initdata;
+static int disable_timer_pin_1 __initdata;
/*
* Rough estimation of how many shared IRQs there are, can
@@ -92,6 +94,34 @@ int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
#define vector_to_irq(vector) (vector)
#endif
+
+union entry_union {
+ struct { u32 w1, w2; };
+ struct IO_APIC_route_entry entry;
+};
+
+static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
+{
+ union entry_union eu;
+ unsigned long flags;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return eu.entry;
+}
+
+static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
+{
+ unsigned long flags;
+ union entry_union eu;
+ eu.entry = e;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
@@ -199,13 +229,9 @@ static void unmask_IO_APIC_irq (unsigned int irq)
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
- unsigned long flags;
/* Check delivery_mode to be sure we're not clearing an SMI pin */
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry = ioapic_read_entry(apic, pin);
if (entry.delivery_mode == dest_SMI)
return;
@@ -214,10 +240,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
*/
memset(&entry, 0, sizeof(entry));
entry.mask = 1;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry);
}
static void clear_IO_APIC (void)
@@ -579,7 +602,7 @@ static int balanced_irq(void *unused)
/* push everything to CPU 0 to give us a starting point. */
for (i = 0 ; i < NR_IRQS ; i++) {
- pending_irq_cpumask[i] = cpumask_of_cpu(0);
+ irq_desc[i].pending_mask = cpumask_of_cpu(0);
set_pending_irq(i, cpumask_of_cpu(0));
}
@@ -1161,10 +1184,17 @@ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
int assign_irq_vector(int irq)
{
static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
+ unsigned long flags;
+ int vector;
- BUG_ON(irq >= NR_IRQ_VECTORS);
- if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
+ BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+
+ spin_lock_irqsave(&vector_lock, flags);
+
+ if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
+ spin_unlock_irqrestore(&vector_lock, flags);
return IO_APIC_VECTOR(irq);
+ }
next:
current_vector += 8;
if (current_vector == SYSCALL_VECTOR)
@@ -1172,16 +1202,21 @@ next:
if (current_vector >= FIRST_SYSTEM_VECTOR) {
offset++;
- if (!(offset%8))
+ if (!(offset%8)) {
+ spin_unlock_irqrestore(&vector_lock, flags);
return -ENOSPC;
+ }
current_vector = FIRST_DEVICE_VECTOR + offset;
}
- vector_irq[current_vector] = irq;
+ vector = current_vector;
+ vector_irq[vector] = irq;
if (irq != AUTO_ASSIGN)
- IO_APIC_VECTOR(irq) = current_vector;
+ IO_APIC_VECTOR(irq) = vector;
- return current_vector;
+ spin_unlock_irqrestore(&vector_lock, flags);
+
+ return vector;
}
static struct hw_interrupt_type ioapic_level_type;
@@ -1191,23 +1226,18 @@ static struct hw_interrupt_type ioapic_edge_type;
#define IOAPIC_EDGE 0
#define IOAPIC_LEVEL 1
-static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
{
- if (use_pci_vector() && !platform_legacy_irq(irq)) {
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL)
- irq_desc[vector].handler = &ioapic_level_type;
- else
- irq_desc[vector].handler = &ioapic_edge_type;
- set_intr_gate(vector, interrupt[vector]);
- } else {
- if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
- trigger == IOAPIC_LEVEL)
- irq_desc[irq].handler = &ioapic_level_type;
- else
- irq_desc[irq].handler = &ioapic_edge_type;
- set_intr_gate(vector, interrupt[irq]);
- }
+ unsigned idx;
+
+ idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
+
+ if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+ trigger == IOAPIC_LEVEL)
+ irq_desc[idx].chip = &ioapic_level_type;
+ else
+ irq_desc[idx].chip = &ioapic_edge_type;
+ set_intr_gate(vector, interrupt[idx]);
}
static void __init setup_IO_APIC_irqs(void)
@@ -1275,9 +1305,8 @@ static void __init setup_IO_APIC_irqs(void)
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
+ ioapic_write_entry(apic, pin, entry);
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
set_native_irq_info(irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -1293,7 +1322,6 @@ static void __init setup_IO_APIC_irqs(void)
static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
{
struct IO_APIC_route_entry entry;
- unsigned long flags;
memset(&entry,0,sizeof(entry));
@@ -1318,15 +1346,12 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
* The timer IRQ doesn't have to know that behind the
* scene we have a 8259A-master in AEOI mode ...
*/
- irq_desc[0].handler = &ioapic_edge_type;
+ irq_desc[0].chip = &ioapic_edge_type;
/*
* Add it to the IO-APIC irq-routing table:
*/
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry);
enable_8259A_irq(0);
}
@@ -1436,10 +1461,7 @@ void __init print_IO_APIC(void)
for (i = 0; i <= reg_01.bits.entries; i++) {
struct IO_APIC_route_entry entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
- *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry = ioapic_read_entry(apic, i);
printk(KERN_DEBUG " %02x %03X %02X ",
i,
@@ -1658,10 +1680,7 @@ static void __init enable_IO_APIC(void)
/* See if any of the pins is in ExtINT mode */
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
struct IO_APIC_route_entry entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- *(((int *)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry = ioapic_read_entry(apic, pin);
/* If the interrupt line is enabled and in ExtInt mode
@@ -1718,7 +1737,6 @@ void disable_IO_APIC(void)
*/
if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
- unsigned long flags;
memset(&entry, 0, sizeof(entry));
entry.mask = 0; /* Enabled */
@@ -1735,12 +1753,7 @@ void disable_IO_APIC(void)
/*
* Add it to the IO-APIC irq-routing table:
*/
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic_i8259.apic, 0x11+2*ioapic_i8259.pin,
- *(((int *)&entry)+1));
- io_apic_write(ioapic_i8259.apic, 0x10+2*ioapic_i8259.pin,
- *(((int *)&entry)+0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
disconnect_bsp_APIC(ioapic_i8259.pin != -1);
}
@@ -2062,6 +2075,13 @@ static void set_ioapic_affinity_vector (unsigned int vector,
#endif
#endif
+static int ioapic_retrigger(unsigned int irq)
+{
+ send_IPI_self(IO_APIC_VECTOR(irq));
+
+ return 1;
+}
+
/*
* Level and edge triggered IO-APIC interrupts need different handling,
* so we use two separate IRQ descriptors. Edge triggered IRQs can be
@@ -2081,6 +2101,7 @@ static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
#ifdef CONFIG_SMP
.set_affinity = set_ioapic_affinity,
#endif
+ .retrigger = ioapic_retrigger,
};
static struct hw_interrupt_type ioapic_level_type __read_mostly = {
@@ -2094,6 +2115,7 @@ static struct hw_interrupt_type ioapic_level_type __read_mostly = {
#ifdef CONFIG_SMP
.set_affinity = set_ioapic_affinity,
#endif
+ .retrigger = ioapic_retrigger,
};
static inline void init_IO_APIC_traps(void)
@@ -2128,7 +2150,7 @@ static inline void init_IO_APIC_traps(void)
make_8259A_irq(irq);
else
/* Strange. Oh, well.. */
- irq_desc[irq].handler = &no_irq_type;
+ irq_desc[irq].chip = &no_irq_type;
}
}
}
@@ -2196,17 +2218,13 @@ static inline void unlock_ExtINT_logic(void)
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
- unsigned long flags;
pin = find_isa_irq_pin(8, mp_INT);
apic = find_isa_irq_apic(8, mp_INT);
if (pin == -1)
return;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry0 = ioapic_read_entry(apic, pin);
clear_IO_APIC_pin(apic, pin);
memset(&entry1, 0, sizeof(entry1));
@@ -2219,10 +2237,7 @@ static inline void unlock_ExtINT_logic(void)
entry1.trigger = 0;
entry1.vector = 0;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry1);
save_control = CMOS_READ(RTC_CONTROL);
save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
@@ -2241,10 +2256,7 @@ static inline void unlock_ExtINT_logic(void)
CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
clear_IO_APIC_pin(apic, pin);
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry0);
}
int timer_uses_ioapic_pin_0;
@@ -2344,7 +2356,7 @@ static inline void check_timer(void)
printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
disable_8259A_irq(0);
- irq_desc[0].handler = &lapic_irq_type;
+ irq_desc[0].chip = &lapic_irq_type;
apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
enable_8259A_irq(0);
@@ -2444,17 +2456,12 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
{
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
- unsigned long flags;
int i;
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
- spin_lock_irqsave(&ioapic_lock, flags);
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
- *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
- *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
- }
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ entry[i] = ioapic_read_entry(dev->id, i);
return 0;
}
@@ -2476,11 +2483,9 @@ static int ioapic_resume(struct sys_device *dev)
reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
- for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
- io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
- io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
- }
spin_unlock_irqrestore(&ioapic_lock, flags);
+ for (i = 0; i < nr_ioapic_registers[dev->id]; i ++)
+ ioapic_write_entry(dev->id, i, entry[i]);
return 0;
}
@@ -2677,9 +2682,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
if (!ioapic && (irq < 16))
disable_8259A_irq(irq);
+ ioapic_write_entry(ioapic, pin, entry);
spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2687,3 +2691,25 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
}
#endif /* CONFIG_ACPI */
+
+static int __init parse_disable_timer_pin_1(char *arg)
+{
+ disable_timer_pin_1 = 1;
+ return 0;
+}
+early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
+
+static int __init parse_enable_timer_pin_1(char *arg)
+{
+ disable_timer_pin_1 = -1;
+ return 0;
+}
+early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
+
+static int __init parse_noapic(char *arg)
+{
+ /* disable IO-APIC */
+ disable_ioapic_setup();
+ return 0;
+}
+early_param("noapic", parse_noapic);
diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c
index 79026f026b85..498e8bc197d5 100644
--- a/arch/i386/kernel/ioport.c
+++ b/arch/i386/kernel/ioport.c
@@ -79,6 +79,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
+ set_thread_flag(TIF_IO_BITMAP);
}
/*
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 248e922ee13a..5fe547cd8f9f 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -53,13 +53,19 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
*/
fastcall unsigned int do_IRQ(struct pt_regs *regs)
{
- /* high bits used in ret_from_ code */
- int irq = regs->orig_eax & 0xff;
+ /* high bit used in ret_from_ code */
+ int irq = ~regs->orig_eax;
#ifdef CONFIG_4KSTACKS
union irq_ctx *curctx, *irqctx;
u32 *isp;
#endif
+ if (unlikely((unsigned)irq >= NR_IRQS)) {
+ printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
+ __FUNCTION__, irq);
+ BUG();
+ }
+
irq_enter();
#ifdef CONFIG_DEBUG_STACKOVERFLOW
/* Debugging check for stack overflow: is there less than 1KB free? */
@@ -95,6 +101,14 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
irqctx->tinfo.task = curctx->tinfo.task;
irqctx->tinfo.previous_esp = current_stack_pointer;
+ /*
+ * Copy the softirq bits in preempt_count so that the
+ * softirq checks work in the hardirq context.
+ */
+ irqctx->tinfo.preempt_count =
+ (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
+ (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
+
asm volatile(
" xchgl %%ebx,%%esp \n"
" call __do_IRQ \n"
@@ -147,7 +161,7 @@ void irq_ctx_init(int cpu)
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET;
+ irqctx->tinfo.preempt_count = 0;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
softirq_ctx[cpu] = irqctx;
@@ -192,6 +206,10 @@ asmlinkage void do_softirq(void)
: "0"(isp)
: "memory", "cc", "edx", "ecx", "eax"
);
+ /*
+ * Shouldnt happen, we returned above if in_interrupt():
+ */
+ WARN_ON_ONCE(softirq_count());
}
local_irq_restore(flags);
@@ -219,7 +237,7 @@ int show_interrupts(struct seq_file *p, void *v)
if (i == 0) {
seq_printf(p, " ");
for_each_online_cpu(j)
- seq_printf(p, "CPU%d ",j);
+ seq_printf(p, "CPU%-8d",j);
seq_putc(p, '\n');
}
@@ -235,7 +253,7 @@ int show_interrupts(struct seq_file *p, void *v)
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
#endif
- seq_printf(p, " %14s", irq_desc[i].handler->typename);
+ seq_printf(p, " %14s", irq_desc[i].chip->typename);
seq_printf(p, " %s", action->name);
for (action=action->next; action; action = action->next)
@@ -277,13 +295,13 @@ void fixup_irqs(cpumask_t map)
if (irq == 2)
continue;
- cpus_and(mask, irq_affinity[irq], map);
+ cpus_and(mask, irq_desc[irq].affinity, map);
if (any_online_cpu(mask) == NR_CPUS) {
printk("Breaking affinity for irq %i\n", irq);
mask = map;
}
- if (irq_desc[irq].handler->set_affinity)
- irq_desc[irq].handler->set_affinity(irq, mask);
+ if (irq_desc[irq].chip->set_affinity)
+ irq_desc[irq].chip->set_affinity(irq, mask);
else if (irq_desc[irq].action && !(warned++))
printk("Cannot set affinity for irq %i\n", irq);
}
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 395a9a6dff88..afe6505ca0b3 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -28,7 +28,6 @@
* <prasanna@in.ibm.com> added function-return probes.
*/
-#include <linux/config.h>
#include <linux/kprobes.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
@@ -57,34 +56,85 @@ static __always_inline void set_jmp_op(void *from, void *to)
/*
* returns non-zero if opcodes can be boosted.
*/
-static __always_inline int can_boost(kprobe_opcode_t opcode)
+static __always_inline int can_boost(kprobe_opcode_t *opcodes)
{
- switch (opcode & 0xf0 ) {
+#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \
+ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
+ (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
+ (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
+ (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
+ << (row % 32))
+ /*
+ * Undefined/reserved opcodes, conditional jump, Opcode Extension
+ * Groups, and some special opcodes can not be boost.
+ */
+ static const unsigned long twobyte_is_boostable[256 / 32] = {
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ /* ------------------------------- */
+ W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */
+ W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */
+ W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */
+ W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */
+ W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */
+ W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */
+ W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */
+ W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */
+ W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */
+ W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */
+ W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */
+ W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */
+ W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */
+ W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */
+ W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */
+ W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */
+ /* ------------------------------- */
+ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
+ };
+#undef W
+ kprobe_opcode_t opcode;
+ kprobe_opcode_t *orig_opcodes = opcodes;
+retry:
+ if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
+ return 0;
+ opcode = *(opcodes++);
+
+ /* 2nd-byte opcode */
+ if (opcode == 0x0f) {
+ if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1)
+ return 0;
+ return test_bit(*opcodes, twobyte_is_boostable);
+ }
+
+ switch (opcode & 0xf0) {
+ case 0x60:
+ if (0x63 < opcode && opcode < 0x67)
+ goto retry; /* prefixes */
+ /* can't boost Address-size override and bound */
+ return (opcode != 0x62 && opcode != 0x67);
case 0x70:
return 0; /* can't boost conditional jump */
- case 0x90:
- /* can't boost call and pushf */
- return opcode != 0x9a && opcode != 0x9c;
case 0xc0:
- /* can't boost undefined opcodes and soft-interruptions */
- return (0xc1 < opcode && opcode < 0xc6) ||
- (0xc7 < opcode && opcode < 0xcc) || opcode == 0xcf;
+ /* can't boost software-interruptions */
+ return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
case 0xd0:
/* can boost AA* and XLAT */
return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
case 0xe0:
- /* can boost in/out and (may be) jmps */
- return (0xe3 < opcode && opcode != 0xe8);
+ /* can boost in/out and absolute jmps */
+ return ((opcode & 0x04) || opcode == 0xea);
case 0xf0:
+ if ((opcode & 0x0c) == 0 && opcode != 0xf1)
+ goto retry; /* lock/rep(ne) prefix */
/* clear and set flags can be boost */
return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
default:
- /* currently, can't boost 2 bytes opcodes */
- return opcode != 0x0f;
+ if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e)
+ goto retry; /* prefixes */
+ /* can't boost CS override and call */
+ return (opcode != 0x2e && opcode != 0x9a);
}
}
-
/*
* returns non-zero if opcode modifies the interrupt flag.
*/
@@ -109,7 +159,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
p->opcode = *p->addr;
- if (can_boost(p->opcode)) {
+ if (can_boost(p->addr)) {
p->ainsn.boostable = 0;
} else {
p->ainsn.boostable = -1;
@@ -206,9 +256,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
int ret = 0;
kprobe_opcode_t *addr;
struct kprobe_ctlblk *kcb;
-#ifdef CONFIG_PREEMPT
- unsigned pre_preempt_count = preempt_count();
-#endif /* CONFIG_PREEMPT */
addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t));
@@ -285,22 +332,16 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
/* handler has already set things up, so skip ss setup */
return 1;
- if (p->ainsn.boostable == 1 &&
-#ifdef CONFIG_PREEMPT
- !(pre_preempt_count) && /*
- * This enables booster when the direct
- * execution path aren't preempted.
- */
-#endif /* CONFIG_PREEMPT */
- !p->post_handler && !p->break_handler ) {
+ss_probe:
+#ifndef CONFIG_PREEMPT
+ if (p->ainsn.boostable == 1 && !p->post_handler){
/* Boost up -- we can execute copied instructions directly */
reset_current_kprobe();
regs->eip = (unsigned long)p->ainsn.insn;
preempt_enable_no_resched();
return 1;
}
-
-ss_probe:
+#endif
prepare_singlestep(p, regs);
kcb->kprobe_status = KPROBE_HIT_SS;
return 1;
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
index f73d7374a2ba..91966bafb3dc 100644
--- a/arch/i386/kernel/machine_kexec.c
+++ b/arch/i386/kernel/machine_kexec.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
+#include <linux/init.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -20,70 +21,13 @@
#include <asm/system.h>
#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-
-#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define L2_ATTR (_PAGE_PRESENT)
-
-#define LEVEL0_SIZE (1UL << 12UL)
-
-#ifndef CONFIG_X86_PAE
-#define LEVEL1_SIZE (1UL << 22UL)
-static u32 pgtable_level1[1024] PAGE_ALIGNED;
-
-static void identity_map_page(unsigned long address)
-{
- unsigned long level1_index, level2_index;
- u32 *pgtable_level2;
-
- /* Find the current page table */
- pgtable_level2 = __va(read_cr3());
-
- /* Find the indexes of the physical address to identity map */
- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
- level2_index = address / LEVEL1_SIZE;
-
- /* Identity map the page table entry */
- pgtable_level1[level1_index] = address | L0_ATTR;
- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
-
- /* Flush the tlb so the new mapping takes effect.
- * Global tlb entries are not flushed but that is not an issue.
- */
- load_cr3(pgtable_level2);
-}
-
-#else
-#define LEVEL1_SIZE (1UL << 21UL)
-#define LEVEL2_SIZE (1UL << 30UL)
-static u64 pgtable_level1[512] PAGE_ALIGNED;
-static u64 pgtable_level2[512] PAGE_ALIGNED;
-
-static void identity_map_page(unsigned long address)
-{
- unsigned long level1_index, level2_index, level3_index;
- u64 *pgtable_level3;
-
- /* Find the current page table */
- pgtable_level3 = __va(read_cr3());
-
- /* Find the indexes of the physical address to identity map */
- level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE;
- level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE;
- level3_index = address / LEVEL2_SIZE;
-
- /* Identity map the page table entry */
- pgtable_level1[level1_index] = address | L0_ATTR;
- pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
- set_64bit(&pgtable_level3[level3_index],
- __pa(pgtable_level2) | L2_ATTR);
-
- /* Flush the tlb so the new mapping takes effect.
- * Global tlb entries are not flushed but that is not an issue.
- */
- load_cr3(pgtable_level3);
-}
+static u32 kexec_pgd[1024] PAGE_ALIGNED;
+#ifdef CONFIG_X86_PAE
+static u32 kexec_pmd0[1024] PAGE_ALIGNED;
+static u32 kexec_pmd1[1024] PAGE_ALIGNED;
#endif
+static u32 kexec_pte0[1024] PAGE_ALIGNED;
+static u32 kexec_pte1[1024] PAGE_ALIGNED;
static void set_idt(void *newidt, __u16 limit)
{
@@ -127,16 +71,6 @@ static void load_segments(void)
#undef __STR
}
-typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
- unsigned long indirection_page,
- unsigned long reboot_code_buffer,
- unsigned long start_address,
- unsigned int has_pae) ATTRIB_NORET;
-
-const extern unsigned char relocate_new_kernel[];
-extern void relocate_new_kernel_end(void);
-const extern unsigned int relocate_new_kernel_size;
-
/*
* A architecture hook called to validate the
* proposed image and prepare the control pages
@@ -169,34 +103,35 @@ void machine_kexec_cleanup(struct kimage *image)
*/
NORET_TYPE void machine_kexec(struct kimage *image)
{
- unsigned long page_list;
- unsigned long reboot_code_buffer;
-
- relocate_new_kernel_t rnk;
+ unsigned long page_list[PAGES_NR];
+ void *control_page;
/* Interrupts aren't acceptable while we reboot */
local_irq_disable();
- /* Compute some offsets */
- reboot_code_buffer = page_to_pfn(image->control_code_page)
- << PAGE_SHIFT;
- page_list = image->head;
-
- /* Set up an identity mapping for the reboot_code_buffer */
- identity_map_page(reboot_code_buffer);
-
- /* copy it out */
- memcpy((void *)reboot_code_buffer, relocate_new_kernel,
- relocate_new_kernel_size);
-
- /* The segment registers are funny things, they are
- * automatically loaded from a table, in memory wherever you
- * set them to a specific selector, but this table is never
- * accessed again you set the segment to a different selector.
- *
- * The more common model is are caches where the behide
- * the scenes work is done, but is also dropped at arbitrary
- * times.
+ control_page = page_address(image->control_code_page);
+ memcpy(control_page, relocate_kernel, PAGE_SIZE);
+
+ page_list[PA_CONTROL_PAGE] = __pa(control_page);
+ page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+ page_list[PA_PGD] = __pa(kexec_pgd);
+ page_list[VA_PGD] = (unsigned long)kexec_pgd;
+#ifdef CONFIG_X86_PAE
+ page_list[PA_PMD_0] = __pa(kexec_pmd0);
+ page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
+ page_list[PA_PMD_1] = __pa(kexec_pmd1);
+ page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
+#endif
+ page_list[PA_PTE_0] = __pa(kexec_pte0);
+ page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
+ page_list[PA_PTE_1] = __pa(kexec_pte1);
+ page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+
+ /* The segment registers are funny things, they have both a
+ * visible and an invisible part. Whenever the visible part is
+ * set to a specific selector, the invisible part is loaded
+ * with from a table in memory. At no other time is the
+ * descriptor table in memory accessed.
*
* I take advantage of this here by force loading the
* segments, before I zap the gdt with an invalid value.
@@ -209,6 +144,28 @@ NORET_TYPE void machine_kexec(struct kimage *image)
set_idt(phys_to_virt(0),0);
/* now call it */
- rnk = (relocate_new_kernel_t) reboot_code_buffer;
- (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae);
+ relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
+ image->start, cpu_has_pae);
+}
+
+/* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel. By reserving this memory we guarantee
+ * that linux never sets it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
+static int __init parse_crashkernel(char *arg)
+{
+ unsigned long size, base;
+ size = memparse(arg, &arg);
+ if (*arg == '@') {
+ base = memparse(arg+1, &arg);
+ /* FIXME: Do I want a sanity check
+ * to validate the memory range?
+ */
+ crashk_res.start = base;
+ crashk_res.end = base + size - 1;
+ }
+ return 0;
}
+early_param("crashkernel", parse_crashkernel);
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
index 558bb207720f..eb57a851789d 100644
--- a/arch/i386/kernel/mca.c
+++ b/arch/i386/kernel/mca.c
@@ -42,11 +42,11 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mca.h>
+#include <linux/kprobes.h>
#include <asm/system.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/mman.h>
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/ioport.h>
@@ -415,7 +415,8 @@ subsys_initcall(mca_init);
/*--------------------------------------------------------------------*/
-static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
+static __kprobes void
+mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
{
int slot = mca_dev->slot;
@@ -445,7 +446,7 @@ static void mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
/*--------------------------------------------------------------------*/
-static int mca_handle_nmi_callback(struct device *dev, void *data)
+static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
{
struct mca_device *mca_dev = to_mca_device(dev);
unsigned char pos5;
@@ -463,7 +464,7 @@ static int mca_handle_nmi_callback(struct device *dev, void *data)
return 0;
}
-void mca_handle_nmi(void)
+void __kprobes mca_handle_nmi(void)
{
/* First try - scan the various adapters and see if a specific
* adapter was responsible for the error.
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 0a865889b2a9..9b9479768d5e 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -2,6 +2,7 @@
* Intel CPU Microcode Update Driver for Linux
*
* Copyright (C) 2000-2004 Tigran Aivazian
+ * 2006 Shaohua Li <shaohua.li@intel.com>
*
* This driver allows to upgrade microcode on Intel processors
* belonging to IA-32 family - PentiumPro, Pentium II,
@@ -82,6 +83,9 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
@@ -91,9 +95,6 @@ MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>");
MODULE_LICENSE("GPL");
-static int verbose;
-module_param(verbose, int, 0644);
-
#define MICROCODE_VERSION "1.14a"
#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
@@ -120,55 +121,40 @@ static DEFINE_SPINLOCK(microcode_update_lock);
/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
static DEFINE_MUTEX(microcode_mutex);
-static void __user *user_buffer; /* user area microcode data buffer */
-static unsigned int user_buffer_size; /* it's size */
-
-typedef enum mc_error_code {
- MC_SUCCESS = 0,
- MC_IGNORED = 1,
- MC_NOTFOUND = 2,
- MC_MARKED = 3,
- MC_ALLOCATED = 4,
-} mc_error_code_t;
-
static struct ucode_cpu_info {
+ int valid;
unsigned int sig;
- unsigned int pf, orig_pf;
+ unsigned int pf;
unsigned int rev;
- unsigned int cksum;
- mc_error_code_t err;
microcode_t *mc;
} ucode_cpu_info[NR_CPUS];
-
-static int microcode_open (struct inode *unused1, struct file *unused2)
-{
- return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-}
-static void collect_cpu_info (void *unused)
+static void collect_cpu_info(int cpu_num)
{
- int cpu_num = smp_processor_id();
struct cpuinfo_x86 *c = cpu_data + cpu_num;
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
unsigned int val[2];
- uci->sig = uci->pf = uci->rev = uci->cksum = 0;
- uci->err = MC_NOTFOUND;
+ /* We should bind the task to the CPU */
+ BUG_ON(raw_smp_processor_id() != cpu_num);
+ uci->pf = uci->rev = 0;
uci->mc = NULL;
+ uci->valid = 1;
if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
cpu_has(c, X86_FEATURE_IA64)) {
- printk(KERN_ERR "microcode: CPU%d not a capable Intel processor\n", cpu_num);
+ printk(KERN_ERR "microcode: CPU%d not a capable Intel "
+ "processor\n", cpu_num);
+ uci->valid = 0;
return;
- } else {
- uci->sig = cpuid_eax(0x00000001);
+ }
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- uci->pf = 1 << ((val[1] >> 18) & 7);
- }
- uci->orig_pf = uci->pf;
+ uci->sig = cpuid_eax(0x00000001);
+
+ if ((c->x86_model >= 5) || (c->x86 > 6)) {
+ /* get processor flags from MSR 0x17 */
+ rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+ uci->pf = 1 << ((val[1] >> 18) & 7);
}
wrmsr(MSR_IA32_UCODE_REV, 0, 0);
@@ -180,218 +166,159 @@ static void collect_cpu_info (void *unused)
uci->sig, uci->pf, uci->rev);
}
-static inline void mark_microcode_update (int cpu_num, microcode_header_t *mc_header, int sig, int pf, int cksum)
+static inline int microcode_update_match(int cpu_num,
+ microcode_header_t *mc_header, int sig, int pf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- pr_debug("Microcode Found.\n");
- pr_debug(" Header Revision 0x%x\n", mc_header->hdrver);
- pr_debug(" Loader Revision 0x%x\n", mc_header->ldrver);
- pr_debug(" Revision 0x%x \n", mc_header->rev);
- pr_debug(" Date %x/%x/%x\n",
- ((mc_header->date >> 24 ) & 0xff),
- ((mc_header->date >> 16 ) & 0xff),
- (mc_header->date & 0xFFFF));
- pr_debug(" Signature 0x%x\n", sig);
- pr_debug(" Type 0x%x Family 0x%x Model 0x%x Stepping 0x%x\n",
- ((sig >> 12) & 0x3),
- ((sig >> 8) & 0xf),
- ((sig >> 4) & 0xf),
- ((sig & 0xf)));
- pr_debug(" Processor Flags 0x%x\n", pf);
- pr_debug(" Checksum 0x%x\n", cksum);
-
- if (mc_header->rev < uci->rev) {
- if (uci->err == MC_NOTFOUND) {
- uci->err = MC_IGNORED;
- uci->cksum = mc_header->rev;
- } else if (uci->err == MC_IGNORED && uci->cksum < mc_header->rev)
- uci->cksum = mc_header->rev;
- } else if (mc_header->rev == uci->rev) {
- if (uci->err < MC_MARKED) {
- /* notify the caller of success on this cpu */
- uci->err = MC_SUCCESS;
- }
- } else if (uci->err != MC_ALLOCATED || mc_header->rev > uci->mc->hdr.rev) {
- pr_debug("microcode: CPU%d found a matching microcode update with "
- " revision 0x%x (current=0x%x)\n", cpu_num, mc_header->rev, uci->rev);
- uci->cksum = cksum;
- uci->pf = pf; /* keep the original mc pf for cksum calculation */
- uci->err = MC_MARKED; /* found the match */
- for_each_online_cpu(cpu_num) {
- if (ucode_cpu_info + cpu_num != uci
- && ucode_cpu_info[cpu_num].mc == uci->mc) {
- uci->mc = NULL;
- break;
- }
- }
- if (uci->mc != NULL) {
- vfree(uci->mc);
- uci->mc = NULL;
- }
- }
- return;
+ if (!sigmatch(sig, uci->sig, pf, uci->pf)
+ || mc_header->rev <= uci->rev)
+ return 0;
+ return 1;
}
-static int find_matching_ucodes (void)
+static int microcode_sanity_check(void *mc)
{
- int cursor = 0;
- int error = 0;
-
- while (cursor + MC_HEADER_SIZE < user_buffer_size) {
- microcode_header_t mc_header;
- void *newmc = NULL;
- int i, sum, cpu_num, allocated_flag, total_size, data_size, ext_table_size;
+ microcode_header_t *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ struct extended_signature *ext_sig;
+ unsigned long total_size, data_size, ext_table_size;
+ int sum, orig_sum, ext_sigcount = 0, i;
+
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ printk(KERN_ERR "microcode: error! "
+ "Bad data size in microcode data file\n");
+ return -EINVAL;
+ }
- if (copy_from_user(&mc_header, user_buffer + cursor, MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- error = -EFAULT;
- goto out;
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ printk(KERN_ERR "microcode: error! "
+ "Unknown microcode update format\n");
+ return -EINVAL;
+ }
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ printk(KERN_ERR "microcode: error! "
+ "Small exttable size in microcode data file\n");
+ return -EINVAL;
}
-
- total_size = get_totalsize(&mc_header);
- if ((cursor + total_size > user_buffer_size) || (total_size < DEFAULT_UCODE_TOTALSIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EINVAL;
- goto out;
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ printk(KERN_ERR "microcode: error! "
+ "Bad exttable size in microcode data file\n");
+ return -EFAULT;
}
+ ext_sigcount = ext_header->count;
+ }
- data_size = get_datasize(&mc_header);
- if ((data_size + MC_HEADER_SIZE > total_size) || (data_size < DEFAULT_UCODE_DATASIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EINVAL;
- goto out;
+ /* check extended table checksum */
+ if (ext_table_size) {
+ int ext_table_sum = 0;
+ int *ext_tablep = (int *)ext_header;
+
+ i = ext_table_size / DWSIZE;
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+ if (ext_table_sum) {
+ printk(KERN_WARNING "microcode: aborting, "
+ "bad extended signature table checksum\n");
+ return -EINVAL;
}
+ }
- if (mc_header.ldrver != 1 || mc_header.hdrver != 1) {
- printk(KERN_ERR "microcode: error! Unknown microcode update format\n");
- error = -EINVAL;
- goto out;
+ /* calculate the checksum */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+ while (i--)
+ orig_sum += ((int *)mc)[i];
+ if (orig_sum) {
+ printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ return -EINVAL;
+ }
+ if (!ext_table_size)
+ return 0;
+ /* check extended signature checksum */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (struct extended_signature *)((void *)ext_header
+ + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i);
+ sum = orig_sum
+ - (mc_header->sig + mc_header->pf + mc_header->cksum)
+ + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ printk(KERN_ERR "microcode: aborting, bad checksum\n");
+ return -EINVAL;
}
+ }
+ return 0;
+}
- for_each_online_cpu(cpu_num) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- if (sigmatch(mc_header.sig, uci->sig, mc_header.pf, uci->orig_pf))
- mark_microcode_update(cpu_num, &mc_header, mc_header.sig, mc_header.pf, mc_header.cksum);
- }
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ * return < 0 - error
+ */
+static int get_maching_microcode(void *mc, int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ microcode_header_t *mc_header = mc;
+ struct extended_sigtable *ext_header;
+ unsigned long total_size = get_totalsize(mc_header);
+ int ext_sigcount, i;
+ struct extended_signature *ext_sig;
+ void *new_mc;
+
+ if (microcode_update_match(cpu, mc_header,
+ mc_header->sig, mc_header->pf))
+ goto find;
+
+ if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+ return 0;
+
+ ext_header = (struct extended_sigtable *)(mc +
+ get_datasize(mc_header) + MC_HEADER_SIZE);
+ ext_sigcount = ext_header->count;
+ ext_sig = (struct extended_signature *)((void *)ext_header
+ + EXT_HEADER_SIZE);
+ for (i = 0; i < ext_sigcount; i++) {
+ if (microcode_update_match(cpu, mc_header,
+ ext_sig->sig, ext_sig->pf))
+ goto find;
+ ext_sig++;
+ }
+ return 0;
+find:
+ pr_debug("microcode: CPU %d found a matching microcode update with"
+ " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
+ new_mc = vmalloc(total_size);
+ if (!new_mc) {
+ printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+ return -ENOMEM;
+ }
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- struct extended_sigtable ext_header;
- struct extended_signature ext_sig;
- int ext_sigcount;
+ /* free previous update file */
+ vfree(uci->mc);
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EINVAL;
- goto out;
- }
- if (copy_from_user(&ext_header, user_buffer + cursor
- + MC_HEADER_SIZE + data_size, EXT_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- error = -EFAULT;
- goto out;
- }
- if (ext_table_size != exttable_size(&ext_header)) {
- printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
- error = -EFAULT;
- goto out;
- }
-
- ext_sigcount = ext_header.count;
-
- for (i = 0; i < ext_sigcount; i++) {
- if (copy_from_user(&ext_sig, user_buffer + cursor + MC_HEADER_SIZE + data_size + EXT_HEADER_SIZE
- + EXT_SIGNATURE_SIZE * i, EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- error = -EFAULT;
- goto out;
- }
- for_each_online_cpu(cpu_num) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- if (sigmatch(ext_sig.sig, uci->sig, ext_sig.pf, uci->orig_pf)) {
- mark_microcode_update(cpu_num, &mc_header, ext_sig.sig, ext_sig.pf, ext_sig.cksum);
- }
- }
- }
- }
- /* now check if any cpu has matched */
- allocated_flag = 0;
- sum = 0;
- for_each_online_cpu(cpu_num) {
- if (ucode_cpu_info[cpu_num].err == MC_MARKED) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- if (!allocated_flag) {
- allocated_flag = 1;
- newmc = vmalloc(total_size);
- if (!newmc) {
- printk(KERN_ERR "microcode: error! Can not allocate memory\n");
- error = -ENOMEM;
- goto out;
- }
- if (copy_from_user(newmc + MC_HEADER_SIZE,
- user_buffer + cursor + MC_HEADER_SIZE,
- total_size - MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user data\n");
- vfree(newmc);
- error = -EFAULT;
- goto out;
- }
- memcpy(newmc, &mc_header, MC_HEADER_SIZE);
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int * ext_tablep = (((void *) newmc) + MC_HEADER_SIZE + data_size);
- i = ext_table_size / DWSIZE;
- while (i--) ext_table_sum += ext_tablep[i];
- if (ext_table_sum) {
- printk(KERN_WARNING "microcode: aborting, bad extended signature table checksum\n");
- vfree(newmc);
- error = -EINVAL;
- goto out;
- }
- }
-
- /* calculate the checksum */
- i = (MC_HEADER_SIZE + data_size) / DWSIZE;
- while (i--) sum += ((int *)newmc)[i];
- sum -= (mc_header.sig + mc_header.pf + mc_header.cksum);
- }
- ucode_cpu_info[cpu_num].mc = newmc;
- ucode_cpu_info[cpu_num].err = MC_ALLOCATED; /* mc updated */
- if (sum + uci->sig + uci->pf + uci->cksum != 0) {
- printk(KERN_ERR "microcode: CPU%d aborting, bad checksum\n", cpu_num);
- error = -EINVAL;
- goto out;
- }
- }
- }
- cursor += total_size; /* goto the next update patch */
- } /* end of while */
-out:
- return error;
+ memcpy(new_mc, mc, total_size);
+ uci->mc = new_mc;
+ return 1;
}
-static void do_update_one (void * unused)
+static void apply_microcode(int cpu)
{
unsigned long flags;
unsigned int val[2];
- int cpu_num = smp_processor_id();
+ int cpu_num = raw_smp_processor_id();
struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- if (uci->mc == NULL) {
- if (verbose) {
- if (uci->err == MC_SUCCESS)
- printk(KERN_INFO "microcode: CPU%d already at revision 0x%x\n",
- cpu_num, uci->rev);
- else
- printk(KERN_INFO "microcode: No new microcode data for CPU%d\n", cpu_num);
- }
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu_num != cpu);
+
+ if (uci->mc == NULL)
return;
- }
/* serialize access to the physical write to MSR 0x79 */
spin_lock_irqsave(&microcode_update_lock, flags);
@@ -408,68 +335,107 @@ static void do_update_one (void * unused)
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
- /* notify the caller of success on this cpu */
- uci->err = MC_SUCCESS;
spin_unlock_irqrestore(&microcode_update_lock, flags);
- printk(KERN_INFO "microcode: CPU%d updated from revision "
+ if (val[1] != uci->mc->hdr.rev) {
+ printk(KERN_ERR "microcode: CPU%d updated from revision "
+ "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
+ return;
+ }
+ pr_debug("microcode: CPU%d updated from revision "
"0x%x to 0x%x, date = %08x \n",
cpu_num, uci->rev, val[1], uci->mc->hdr.date);
- return;
+ uci->rev = val[1];
}
-static int do_microcode_update (void)
-{
- int i, error;
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static void __user *user_buffer; /* user area microcode data buffer */
+static unsigned int user_buffer_size; /* it's size */
- if (on_each_cpu(collect_cpu_info, NULL, 1, 1) != 0) {
- printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
- error = -EIO;
- goto out;
+static long get_next_ucode(void **mc, long offset)
+{
+ microcode_header_t mc_header;
+ unsigned long total_size;
+
+ /* No more data */
+ if (offset >= user_buffer_size)
+ return 0;
+ if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
+ printk(KERN_ERR "microcode: error! Can not read user data\n");
+ return -EFAULT;
}
-
- if ((error = find_matching_ucodes())) {
- printk(KERN_ERR "microcode: Error in the microcode data\n");
- goto out_free;
+ total_size = get_totalsize(&mc_header);
+ if (offset + total_size > user_buffer_size) {
+ printk(KERN_ERR "microcode: error! Bad total size in microcode "
+ "data file\n");
+ return -EINVAL;
}
-
- if (on_each_cpu(do_update_one, NULL, 1, 1) != 0) {
- printk(KERN_ERR "microcode: Error! Could not run on all processors\n");
- error = -EIO;
+ *mc = vmalloc(total_size);
+ if (!*mc)
+ return -ENOMEM;
+ if (copy_from_user(*mc, user_buffer + offset, total_size)) {
+ printk(KERN_ERR "microcode: error! Can not read user data\n");
+ vfree(*mc);
+ return -EFAULT;
}
+ return offset + total_size;
+}
+
+static int do_microcode_update (void)
+{
+ long cursor = 0;
+ int error = 0;
+ void *new_mc;
+ int cpu;
+ cpumask_t old;
+
+ old = current->cpus_allowed;
-out_free:
- for_each_online_cpu(i) {
- if (ucode_cpu_info[i].mc) {
- int j;
- void *tmp = ucode_cpu_info[i].mc;
- vfree(tmp);
- for_each_online_cpu(j) {
- if (ucode_cpu_info[j].mc == tmp)
- ucode_cpu_info[j].mc = NULL;
- }
+ while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
+ error = microcode_sanity_check(new_mc);
+ if (error)
+ goto out;
+ /*
+ * It's possible the data file has multiple matching ucode,
+ * lets keep searching till the latest version
+ */
+ for_each_online_cpu(cpu) {
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (!uci->valid)
+ continue;
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ error = get_maching_microcode(new_mc, cpu);
+ if (error < 0)
+ goto out;
+ if (error == 1)
+ apply_microcode(cpu);
}
- if (ucode_cpu_info[i].err == MC_IGNORED && verbose)
- printk(KERN_WARNING "microcode: CPU%d not 'upgrading' to earlier revision"
- " 0x%x (current=0x%x)\n", i, ucode_cpu_info[i].cksum, ucode_cpu_info[i].rev);
+ vfree(new_mc);
}
out:
+ if (cursor > 0)
+ vfree(new_mc);
+ if (cursor < 0)
+ error = cursor;
+ set_cpus_allowed(current, old);
return error;
}
+static int microcode_open (struct inode *unused1, struct file *unused2)
+{
+ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+}
+
static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
{
ssize_t ret;
- if (len < DEFAULT_UCODE_TOTALSIZE) {
- printk(KERN_ERR "microcode: not enough data\n");
- return -EINVAL;
- }
-
if ((len >> PAGE_SHIFT) > num_physpages) {
printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
return -EINVAL;
}
+ lock_cpu_hotplug();
mutex_lock(&microcode_mutex);
user_buffer = (void __user *) buf;
@@ -480,6 +446,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
ret = (ssize_t)len;
mutex_unlock(&microcode_mutex);
+ unlock_cpu_hotplug();
return ret;
}
@@ -493,11 +460,10 @@ static struct file_operations microcode_fops = {
static struct miscdevice microcode_dev = {
.minor = MICROCODE_MINOR,
.name = "microcode",
- .devfs_name = "cpu/microcode",
.fops = &microcode_fops,
};
-static int __init microcode_init (void)
+static int __init microcode_dev_init (void)
{
int error;
@@ -509,6 +475,280 @@ static int __init microcode_init (void)
return error;
}
+ return 0;
+}
+
+static void __exit microcode_dev_exit (void)
+{
+ misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while(0)
+#endif
+
+static long get_next_ucode_from_buffer(void **mc, void *buf,
+ unsigned long size, long offset)
+{
+ microcode_header_t *mc_header;
+ unsigned long total_size;
+
+ /* No more data */
+ if (offset >= size)
+ return 0;
+ mc_header = (microcode_header_t *)(buf + offset);
+ total_size = get_totalsize(mc_header);
+
+ if (offset + total_size > size) {
+ printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+ return -EINVAL;
+ }
+
+ *mc = vmalloc(total_size);
+ if (!*mc) {
+ printk(KERN_ERR "microcode: error! Can not allocate memory\n");
+ return -ENOMEM;
+ }
+ memcpy(*mc, buf + offset, total_size);
+ return offset + total_size;
+}
+
+/* fake device for request_firmware */
+static struct platform_device *microcode_pdev;
+
+static int cpu_request_microcode(int cpu)
+{
+ char name[30];
+ struct cpuinfo_x86 *c = cpu_data + cpu;
+ const struct firmware *firmware;
+ void *buf;
+ unsigned long size;
+ long offset = 0;
+ int error;
+ void *mc;
+
+ /* We should bind the task to the CPU */
+ BUG_ON(cpu != raw_smp_processor_id());
+ sprintf(name,"intel-ucode/%02x-%02x-%02x",
+ c->x86, c->x86_model, c->x86_mask);
+ error = request_firmware(&firmware, name, &microcode_pdev->dev);
+ if (error) {
+ pr_debug("ucode data file %s load failed\n", name);
+ return error;
+ }
+ buf = (void *)firmware->data;
+ size = firmware->size;
+ while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
+ > 0) {
+ error = microcode_sanity_check(mc);
+ if (error)
+ break;
+ error = get_maching_microcode(mc, cpu);
+ if (error < 0)
+ break;
+ /*
+ * It's possible the data file has multiple matching ucode,
+ * lets keep searching till the latest version
+ */
+ if (error == 1) {
+ apply_microcode(cpu);
+ error = 0;
+ }
+ vfree(mc);
+ }
+ if (offset > 0)
+ vfree(mc);
+ if (offset < 0)
+ error = offset;
+ release_firmware(firmware);
+
+ return error;
+}
+
+static void microcode_init_cpu(int cpu)
+{
+ cpumask_t old;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ old = current->cpus_allowed;
+
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ mutex_lock(&microcode_mutex);
+ collect_cpu_info(cpu);
+ if (uci->valid)
+ cpu_request_microcode(cpu);
+ mutex_unlock(&microcode_mutex);
+ set_cpus_allowed(current, old);
+}
+
+static void microcode_fini_cpu(int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ mutex_lock(&microcode_mutex);
+ uci->valid = 0;
+ vfree(uci->mc);
+ uci->mc = NULL;
+ mutex_unlock(&microcode_mutex);
+}
+
+static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+ char *end;
+ unsigned long val = simple_strtoul(buf, &end, 0);
+ int err = 0;
+ int cpu = dev->id;
+
+ if (end == buf)
+ return -EINVAL;
+ if (val == 1) {
+ cpumask_t old;
+
+ old = current->cpus_allowed;
+
+ lock_cpu_hotplug();
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+
+ mutex_lock(&microcode_mutex);
+ if (uci->valid)
+ err = cpu_request_microcode(cpu);
+ mutex_unlock(&microcode_mutex);
+ unlock_cpu_hotplug();
+ set_cpus_allowed(current, old);
+ }
+ if (err)
+ return err;
+ return sz;
+}
+
+static ssize_t version_show(struct sys_device *dev, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->rev);
+}
+
+static ssize_t pf_show(struct sys_device *dev, char *buf)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+ return sprintf(buf, "0x%x\n", uci->pf);
+}
+
+static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
+static SYSDEV_ATTR(version, 0400, version_show, NULL);
+static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+
+static struct attribute *mc_default_attrs[] = {
+ &attr_reload.attr,
+ &attr_version.attr,
+ &attr_processor_flags.attr,
+ NULL
+};
+
+static struct attribute_group mc_attr_group = {
+ .attrs = mc_default_attrs,
+ .name = "microcode",
+};
+
+static int mc_sysdev_add(struct sys_device *sys_dev)
+{
+ int cpu = sys_dev->id;
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+ if (!cpu_online(cpu))
+ return 0;
+ pr_debug("Microcode:CPU %d added\n", cpu);
+ memset(uci, 0, sizeof(*uci));
+ sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+
+ microcode_init_cpu(cpu);
+ return 0;
+}
+
+static int mc_sysdev_remove(struct sys_device *sys_dev)
+{
+ int cpu = sys_dev->id;
+
+ if (!cpu_online(cpu))
+ return 0;
+ pr_debug("Microcode:CPU %d removed\n", cpu);
+ microcode_fini_cpu(cpu);
+ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ return 0;
+}
+
+static int mc_sysdev_resume(struct sys_device *dev)
+{
+ int cpu = dev->id;
+
+ if (!cpu_online(cpu))
+ return 0;
+ pr_debug("Microcode:CPU %d resumed\n", cpu);
+ /* only CPU 0 will apply ucode here */
+ apply_microcode(0);
+ return 0;
+}
+
+static struct sysdev_driver mc_sysdev_driver = {
+ .add = mc_sysdev_add,
+ .remove = mc_sysdev_remove,
+ .resume = mc_sysdev_resume,
+};
+
+#ifdef CONFIG_HOTPLUG_CPU
+static __cpuinit int
+mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+ struct sys_device *sys_dev;
+
+ sys_dev = get_cpu_sysdev(cpu);
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ mc_sysdev_add(sys_dev);
+ break;
+ case CPU_DOWN_PREPARE:
+ mc_sysdev_remove(sys_dev);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block mc_cpu_notifier = {
+ .notifier_call = mc_cpu_callback,
+};
+#endif
+
+static int __init microcode_init (void)
+{
+ int error;
+
+ error = microcode_dev_init();
+ if (error)
+ return error;
+ microcode_pdev = platform_device_register_simple("microcode", -1,
+ NULL, 0);
+ if (IS_ERR(microcode_pdev)) {
+ microcode_dev_exit();
+ return PTR_ERR(microcode_pdev);
+ }
+
+ lock_cpu_hotplug();
+ error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+ unlock_cpu_hotplug();
+ if (error) {
+ microcode_dev_exit();
+ platform_device_unregister(microcode_pdev);
+ return error;
+ }
+
+ register_hotcpu_notifier(&mc_cpu_notifier);
+
printk(KERN_INFO
"IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n");
return 0;
@@ -516,9 +756,16 @@ static int __init microcode_init (void)
static void __exit microcode_exit (void)
{
- misc_deregister(&microcode_dev);
+ microcode_dev_exit();
+
+ unregister_hotcpu_notifier(&mc_cpu_notifier);
+
+ lock_cpu_hotplug();
+ sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+ unlock_cpu_hotplug();
+
+ platform_device_unregister(microcode_pdev);
}
module_init(microcode_init)
module_exit(microcode_exit)
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 6b1392d33ed5..442aaf8c77eb 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -17,7 +17,6 @@
#include <linux/init.h>
#include <linux/acpi.h>
#include <linux/delay.h>
-#include <linux/config.h>
#include <linux/bootmem.h>
#include <linux/smp_lock.h>
#include <linux/kernel_stat.h>
@@ -31,6 +30,7 @@
#include <asm/io_apic.h>
#include <mach_apic.h>
+#include <mach_apicdef.h>
#include <mach_mpparse.h>
#include <bios_ebda.h>
@@ -69,7 +69,7 @@ unsigned int def_to_bigsmp = 0;
/* Processor that is doing the boot up */
unsigned int boot_cpu_physical_apicid = -1U;
/* Internal processor count */
-static unsigned int __devinitdata num_processors;
+unsigned int __cpuinitdata num_processors;
/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map;
@@ -229,12 +229,14 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+#if MAX_MP_BUSSES < 256
if (m->mpc_busid >= MAX_MP_BUSSES) {
printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
" is too large, max. supported is %d\n",
m->mpc_busid, str, MAX_MP_BUSSES - 1);
return;
}
+#endif
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
@@ -294,19 +296,6 @@ static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
m->mpc_irqtype, m->mpc_irqflag & 3,
(m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
- /*
- * Well it seems all SMP boards in existence
- * use ExtINT/LVT1 == LINT0 and
- * NMI/LVT2 == LINT1 - the following check
- * will show us if this assumptions is false.
- * Until then we do not have to add baggage.
- */
- if ((m->mpc_irqtype == mp_ExtINT) &&
- (m->mpc_destapiclint != 0))
- BUG();
- if ((m->mpc_irqtype == mp_NMI) &&
- (m->mpc_destapiclint != 1))
- BUG();
}
#ifdef CONFIG_X86_NUMAQ
@@ -823,8 +812,7 @@ int es7000_plat;
#ifdef CONFIG_ACPI
-void __init mp_register_lapic_address (
- u64 address)
+void __init mp_register_lapic_address(u64 address)
{
mp_lapic_addr = (unsigned long) address;
@@ -836,13 +824,10 @@ void __init mp_register_lapic_address (
Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
}
-
-void __devinit mp_register_lapic (
- u8 id,
- u8 enabled)
+void __devinit mp_register_lapic (u8 id, u8 enabled)
{
struct mpc_config_processor processor;
- int boot_cpu = 0;
+ int boot_cpu = 0;
if (MAX_APICS - id <= 0) {
printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
@@ -879,11 +864,9 @@ static struct mp_ioapic_routing {
u32 pin_programmed[4];
} mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic (
- int gsi)
+static int mp_find_ioapic (int gsi)
{
- int i = 0;
+ int i = 0;
/* Find the IOAPIC that manages this GSI. */
for (i = 0; i < nr_ioapics; i++) {
@@ -896,15 +879,11 @@ static int mp_find_ioapic (
return -1;
}
-
-void __init mp_register_ioapic (
- u8 id,
- u32 address,
- u32 gsi_base)
+void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
{
- int idx = 0;
- int tmpid;
+ int idx = 0;
+ int tmpid;
if (nr_ioapics >= MAX_IO_APICS) {
printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
@@ -950,16 +929,10 @@ void __init mp_register_ioapic (
mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
mp_ioapic_routing[idx].gsi_base,
mp_ioapic_routing[idx].gsi_end);
-
- return;
}
-
-void __init mp_override_legacy_irq (
- u8 bus_irq,
- u8 polarity,
- u8 trigger,
- u32 gsi)
+void __init
+mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
struct mpc_config_intsrc intsrc;
int ioapic = -1;
@@ -997,15 +970,13 @@ void __init mp_override_legacy_irq (
mp_irqs[mp_irq_entries] = intsrc;
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!\n");
-
- return;
}
void __init mp_config_acpi_legacy_irqs (void)
{
struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
+ int i = 0;
+ int ioapic = -1;
/*
* Fabricate the legacy ISA bus (bus #31).
@@ -1074,12 +1045,12 @@ void __init mp_config_acpi_legacy_irqs (void)
#define MAX_GSI_NUM 4096
-int mp_register_gsi (u32 gsi, int triggering, int polarity)
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
{
- int ioapic = -1;
- int ioapic_pin = 0;
- int idx, bit = 0;
- static int pci_irq = 16;
+ int ioapic = -1;
+ int ioapic_pin = 0;
+ int idx, bit = 0;
+ static int pci_irq = 16;
/*
* Mapping between Global System Interrups, which
* represent all possible interrupts, and IRQs
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 7a328230e540..d535cdbbfd25 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -24,7 +24,6 @@
*/
#include <linux/module.h>
-#include <linux/config.h>
#include <linux/types.h>
#include <linux/errno.h>
@@ -251,7 +250,9 @@ static int msr_class_device_create(int i)
return err;
}
-static int msr_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+#ifdef CONFIG_HOTPLUG_CPU
+static int msr_class_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
@@ -266,10 +267,11 @@ static int msr_class_cpu_callback(struct notifier_block *nfb, unsigned long acti
return NOTIFY_OK;
}
-static struct notifier_block msr_class_cpu_notifier =
+static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
{
.notifier_call = msr_class_cpu_callback,
};
+#endif
static int __init msr_init(void)
{
@@ -292,7 +294,7 @@ static int __init msr_init(void)
if (err != 0)
goto out_class;
}
- register_cpu_notifier(&msr_class_cpu_notifier);
+ register_hotcpu_notifier(&msr_class_cpu_notifier);
err = 0;
goto out;
@@ -315,7 +317,7 @@ static void __exit msr_exit(void)
class_device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
class_destroy(msr_class);
unregister_chrdev(MSR_MAJOR, "cpu/msr");
- unregister_cpu_notifier(&msr_class_cpu_notifier);
+ unregister_hotcpu_notifier(&msr_class_cpu_notifier);
}
module_init(msr_init);
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index d43b498ec745..0fc4997fb143 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -14,91 +14,184 @@
*/
#include <linux/config.h>
-#include <linux/mm.h>
#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/sysdev.h>
#include <linux/sysctl.h>
+#include <linux/percpu.h>
+#include <linux/dmi.h>
+#include <linux/kprobes.h>
#include <asm/smp.h>
-#include <asm/div64.h>
#include <asm/nmi.h>
+#include <asm/kdebug.h>
+#include <asm/intel_arch_perfmon.h>
#include "mach_traps.h"
-unsigned int nmi_watchdog = NMI_NONE;
-extern int unknown_nmi_panic;
-static unsigned int nmi_hz = HZ;
-static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
-static unsigned int nmi_p4_cccr_val;
-extern void show_registers(struct pt_regs *regs);
+int unknown_nmi_panic;
+int nmi_watchdog_enabled;
-/*
- * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
- * - it may be reserved by some other driver, or not
- * - when not reserved by some other driver, it may be used for
- * the NMI watchdog, or not
- *
- * This is maintained separately from nmi_active because the NMI
- * watchdog may also be driven from the I/O APIC timer.
+/* perfctr_nmi_owner tracks the ownership of the perfctr registers:
+ * evtsel_nmi_owner tracks the ownership of the event selection
+ * - different performance counters/ event selection may be reserved for
+ * different subsystems this reservation system just tries to coordinate
+ * things a little
*/
-static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
-static unsigned int lapic_nmi_owner;
-#define LAPIC_NMI_WATCHDOG (1<<0)
-#define LAPIC_NMI_RESERVED (1<<1)
+static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
+static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
+
+/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
+ * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
+ */
+#define NMI_MAX_COUNTER_BITS 66
/* nmi_active:
- * +1: the lapic NMI watchdog is active, but can be disabled
- * 0: the lapic NMI watchdog has not been set up, and cannot
+ * >0: the lapic NMI watchdog is active, but can be disabled
+ * <0: the lapic NMI watchdog has not been set up, and cannot
* be enabled
- * -1: the lapic NMI watchdog is disabled, but can be enabled
+ * 0: the lapic NMI watchdog is disabled, but can be enabled
*/
-int nmi_active;
+atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+unsigned int nmi_watchdog = NMI_DEFAULT;
+static unsigned int nmi_hz = HZ;
-#define P6_EVNTSEL0_ENABLE (1 << 22)
-#define P6_EVNTSEL_INT (1 << 20)
-#define P6_EVNTSEL_OS (1 << 17)
-#define P6_EVNTSEL_USR (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
+struct nmi_watchdog_ctlblk {
+ int enabled;
+ u64 check_bit;
+ unsigned int cccr_msr;
+ unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
+ unsigned int evntsel_msr; /* the MSR to select the events to handle */
+};
+static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-#define MSR_P4_MISC_ENABLE 0x1A0
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
-#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
-#define MSR_P4_PERFCTR0 0x300
-#define MSR_P4_CCCR0 0x360
-#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
-#define P4_ESCR_OS (1<<3)
-#define P4_ESCR_USR (1<<2)
-#define P4_CCCR_OVF_PMI0 (1<<26)
-#define P4_CCCR_OVF_PMI1 (1<<27)
-#define P4_CCCR_THRESHOLD(N) ((N)<<20)
-#define P4_CCCR_COMPLEMENT (1<<19)
-#define P4_CCCR_COMPARE (1<<18)
-#define P4_CCCR_REQUIRED (3<<16)
-#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
-#define P4_CCCR_ENABLE (1<<12)
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- CRU_ESCR0 (with any non-null event selector) through a complemented
- max threshold. [IA32-Vol3, Section 14.9.9] */
-#define MSR_P4_IQ_COUNTER0 0x30C
-#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
-#define P4_NMI_IQ_CCCR0 \
- (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
- P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
+/* local prototypes */
+static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu);
+
+extern void show_registers(struct pt_regs *regs);
+extern int unknown_nmi_panic;
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
+{
+ /* returns the bit offset of the performance counter register */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ return (msr - MSR_K7_PERFCTR0);
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return (msr - MSR_ARCH_PERFMON_PERFCTR0);
+
+ switch (boot_cpu_data.x86) {
+ case 6:
+ return (msr - MSR_P6_PERFCTR0);
+ case 15:
+ return (msr - MSR_P4_BPU_PERFCTR0);
+ }
+ }
+ return 0;
+}
+
+/* converts an msr to an appropriate reservation bit */
+static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
+{
+ /* returns the bit offset of the event selection register */
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ return (msr - MSR_K7_EVNTSEL0);
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return (msr - MSR_ARCH_PERFMON_EVENTSEL0);
+
+ switch (boot_cpu_data.x86) {
+ case 6:
+ return (msr - MSR_P6_EVNTSEL0);
+ case 15:
+ return (msr - MSR_P4_BSU_ESCR0);
+ }
+ }
+ return 0;
+}
+
+/* checks for a bit availability (hack for oprofile) */
+int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
+{
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+/* checks the an msr for availability */
+int avail_to_resrv_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ return (!test_bit(counter, &__get_cpu_var(perfctr_nmi_owner)));
+}
+
+int reserve_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ if (!test_and_set_bit(counter, &__get_cpu_var(perfctr_nmi_owner)))
+ return 1;
+ return 0;
+}
+
+void release_perfctr_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_perfctr_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ clear_bit(counter, &__get_cpu_var(perfctr_nmi_owner));
+}
+
+int reserve_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ if (!test_and_set_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]))
+ return 1;
+ return 0;
+}
+
+void release_evntsel_nmi(unsigned int msr)
+{
+ unsigned int counter;
+
+ counter = nmi_evntsel_msr_to_bit(msr);
+ BUG_ON(counter > NMI_MAX_COUNTER_BITS);
+
+ clear_bit(counter, &__get_cpu_var(evntsel_nmi_owner)[0]);
+}
+
+static __cpuinit inline int nmi_known_cpu(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+ return 1;
+ else
+ return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6));
+ }
+ return 0;
+}
#ifdef CONFIG_SMP
/* The performance counters used by NMI_LOCAL_APIC don't trigger when
@@ -108,7 +201,7 @@ int nmi_active;
static __init void nmi_cpu_busy(void *data)
{
volatile int *endflag = data;
- local_irq_enable();
+ local_irq_enable_in_hardirq();
/* Intentionally don't use cpu_relax here. This is
to make sure that the performance counter really ticks,
even if there is a simulator or similar that catches the
@@ -126,7 +219,18 @@ static int __init check_nmi_watchdog(void)
unsigned int *prev_nmi_count;
int cpu;
- if (nmi_watchdog == NMI_NONE)
+ /* Enable NMI watchdog for newer systems.
+ Actually it should be safe for most systems before 2004 too except
+ for some IBM systems that corrupt registers when NMI happens
+ during SMM. Unfortunately we don't have more exact information
+ on these and use this coarse check. */
+ if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004)
+ nmi_watchdog = NMI_LOCAL_APIC;
+
+ if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
+ return 0;
+
+ if (!atomic_read(&nmi_active))
return 0;
prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
@@ -150,25 +254,45 @@ static int __init check_nmi_watchdog(void)
if (!cpu_isset(cpu, cpu_callin_map))
continue;
#endif
+ if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled)
+ continue;
if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
- endflag = 1;
printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
cpu,
prev_nmi_count[cpu],
nmi_count(cpu));
- nmi_active = 0;
- lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
- kfree(prev_nmi_count);
- return -1;
+ per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0;
+ atomic_dec(&nmi_active);
}
}
+ if (!atomic_read(&nmi_active)) {
+ kfree(prev_nmi_count);
+ atomic_set(&nmi_active, -1);
+ return -1;
+ }
endflag = 1;
printk("OK.\n");
/* now that we know it works we can reduce NMI frequency to
something more reasonable; makes a difference in some configs */
- if (nmi_watchdog == NMI_LOCAL_APIC)
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
nmi_hz = 1;
+ /*
+ * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter
+ * are writable, with higher bits sign extending from bit 31.
+ * So, we can only program the counter with 31 bit values and
+ * 32nd bit should be 1, for 33.. to be 1.
+ * Find the appropriate nmi_hz
+ */
+ if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
+ ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
+ u64 count = (u64)cpu_khz * 1000;
+ do_div(count, 0x7fffffffUL);
+ nmi_hz = count + 1;
+ }
+ }
kfree(prev_nmi_count);
return 0;
@@ -182,31 +306,16 @@ static int __init setup_nmi_watchdog(char *str)
get_option(&str, &nmi);
- if (nmi >= NMI_INVALID)
+ if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE))
return 0;
- if (nmi == NMI_NONE)
- nmi_watchdog = nmi;
/*
* If any other x86 CPU has a local APIC, then
* please test the NMI stuff there and send me the
* missing bits. Right now Intel P6/P4 and AMD K7 only.
*/
- if ((nmi == NMI_LOCAL_APIC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
- nmi_watchdog = nmi;
- if ((nmi == NMI_LOCAL_APIC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
- (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
- nmi_watchdog = nmi;
- /*
- * We can enable the IO-APIC watchdog
- * unconditionally.
- */
- if (nmi == NMI_IO_APIC) {
- nmi_active = 1;
- nmi_watchdog = nmi;
- }
+ if ((nmi == NMI_LOCAL_APIC) && (nmi_known_cpu() == 0))
+ return 0; /* no lapic support */
+ nmi_watchdog = nmi;
return 1;
}
@@ -214,86 +323,53 @@ __setup("nmi_watchdog=", setup_nmi_watchdog);
static void disable_lapic_nmi_watchdog(void)
{
- if (nmi_active <= 0)
+ BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
+
+ if (atomic_read(&nmi_active) <= 0)
return;
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- wrmsr(MSR_K7_EVNTSEL0, 0, 0);
- break;
- case X86_VENDOR_INTEL:
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 0xd)
- break;
- wrmsr(MSR_P6_EVNTSEL0, 0, 0);
- break;
- case 15:
- if (boot_cpu_data.x86_model > 0x4)
- break;
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
- wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
- wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
- break;
- }
- break;
- }
- nmi_active = -1;
- /* tell do_nmi() and others that we're not active any more */
- nmi_watchdog = 0;
+ BUG_ON(atomic_read(&nmi_active) != 0);
}
static void enable_lapic_nmi_watchdog(void)
{
- if (nmi_active < 0) {
- nmi_watchdog = NMI_LOCAL_APIC;
- setup_apic_nmi_watchdog();
- }
-}
-
-int reserve_lapic_nmi(void)
-{
- unsigned int old_owner;
+ BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
- spin_lock(&lapic_nmi_owner_lock);
- old_owner = lapic_nmi_owner;
- lapic_nmi_owner |= LAPIC_NMI_RESERVED;
- spin_unlock(&lapic_nmi_owner_lock);
- if (old_owner & LAPIC_NMI_RESERVED)
- return -EBUSY;
- if (old_owner & LAPIC_NMI_WATCHDOG)
- disable_lapic_nmi_watchdog();
- return 0;
-}
+ /* are we already enabled */
+ if (atomic_read(&nmi_active) != 0)
+ return;
-void release_lapic_nmi(void)
-{
- unsigned int new_owner;
+ /* are we lapic aware */
+ if (nmi_known_cpu() <= 0)
+ return;
- spin_lock(&lapic_nmi_owner_lock);
- new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
- lapic_nmi_owner = new_owner;
- spin_unlock(&lapic_nmi_owner_lock);
- if (new_owner & LAPIC_NMI_WATCHDOG)
- enable_lapic_nmi_watchdog();
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+ touch_nmi_watchdog();
}
void disable_timer_nmi_watchdog(void)
{
- if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
+ BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+ if (atomic_read(&nmi_active) <= 0)
return;
- unset_nmi_callback();
- nmi_active = -1;
- nmi_watchdog = NMI_NONE;
+ disable_irq(0);
+ on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1);
+
+ BUG_ON(atomic_read(&nmi_active) != 0);
}
void enable_timer_nmi_watchdog(void)
{
- if (nmi_active < 0) {
- nmi_watchdog = NMI_IO_APIC;
+ BUG_ON(nmi_watchdog != NMI_IO_APIC);
+
+ if (atomic_read(&nmi_active) == 0) {
touch_nmi_watchdog();
- nmi_active = 1;
+ on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1);
+ enable_irq(0);
}
}
@@ -303,15 +379,20 @@ static int nmi_pm_active; /* nmi_active before suspend */
static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
{
- nmi_pm_active = nmi_active;
- disable_lapic_nmi_watchdog();
+ /* only CPU0 goes here, other CPUs should be offline */
+ nmi_pm_active = atomic_read(&nmi_active);
+ stop_apic_nmi_watchdog(NULL);
+ BUG_ON(atomic_read(&nmi_active) != 0);
return 0;
}
static int lapic_nmi_resume(struct sys_device *dev)
{
- if (nmi_pm_active > 0)
- enable_lapic_nmi_watchdog();
+ /* only CPU0 goes here, other CPUs should be offline */
+ if (nmi_pm_active > 0) {
+ setup_apic_nmi_watchdog(NULL);
+ touch_nmi_watchdog();
+ }
return 0;
}
@@ -331,7 +412,13 @@ static int __init init_lapic_nmi_sysfs(void)
{
int error;
- if (nmi_active == 0 || nmi_watchdog != NMI_LOCAL_APIC)
+ /* should really be a BUG_ON but b/c this is an
+ * init call, it just doesn't work. -dcz
+ */
+ if (nmi_watchdog != NMI_LOCAL_APIC)
+ return 0;
+
+ if ( atomic_read(&nmi_active) < 0 )
return 0;
error = sysdev_class_register(&nmi_sysclass);
@@ -349,138 +436,415 @@ late_initcall(init_lapic_nmi_sysfs);
* Original code written by Keith Owens.
*/
-static void clear_msr_range(unsigned int base, unsigned int n)
-{
- unsigned int i;
-
- for(i = 0; i < n; ++i)
- wrmsr(base+i, 0, 0);
-}
-
-static void write_watchdog_counter(const char *descr)
+static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
{
u64 count = (u64)cpu_khz * 1000;
do_div(count, nmi_hz);
if(descr)
Dprintk("setting %s to -0x%08Lx\n", descr, count);
- wrmsrl(nmi_perfctr_msr, 0 - count);
+ wrmsrl(perfctr_msr, 0 - count);
}
-static void setup_k7_watchdog(void)
+/* Note that these events don't tick when the CPU idles. This means
+ the frequency varies with CPU load. */
+
+#define K7_EVNTSEL_ENABLE (1 << 22)
+#define K7_EVNTSEL_INT (1 << 20)
+#define K7_EVNTSEL_OS (1 << 17)
+#define K7_EVNTSEL_USR (1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
+#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
+static int setup_k7_watchdog(void)
{
+ unsigned int perfctr_msr, evntsel_msr;
unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- nmi_perfctr_msr = MSR_K7_PERFCTR0;
+ perfctr_msr = MSR_K7_PERFCTR0;
+ evntsel_msr = MSR_K7_EVNTSEL0;
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
- clear_msr_range(MSR_K7_EVNTSEL0, 4);
- clear_msr_range(MSR_K7_PERFCTR0, 4);
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ wrmsrl(perfctr_msr, 0UL);
evntsel = K7_EVNTSEL_INT
| K7_EVNTSEL_OS
| K7_EVNTSEL_USR
| K7_NMI_EVENT;
- wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
- write_watchdog_counter("K7_PERFCTR0");
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ write_watchdog_counter(perfctr_msr, "K7_PERFCTR0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd->check_bit = 1ULL<<63;
+ return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
}
-static void setup_p6_watchdog(void)
+static void stop_k7_watchdog(void)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
+}
+
+#define P6_EVNTSEL0_ENABLE (1 << 22)
+#define P6_EVNTSEL_INT (1 << 20)
+#define P6_EVNTSEL_OS (1 << 17)
+#define P6_EVNTSEL_USR (1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
+#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
+static int setup_p6_watchdog(void)
{
+ unsigned int perfctr_msr, evntsel_msr;
unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- nmi_perfctr_msr = MSR_P6_PERFCTR0;
+ perfctr_msr = MSR_P6_PERFCTR0;
+ evntsel_msr = MSR_P6_EVNTSEL0;
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
- clear_msr_range(MSR_P6_EVNTSEL0, 2);
- clear_msr_range(MSR_P6_PERFCTR0, 2);
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ wrmsrl(perfctr_msr, 0UL);
evntsel = P6_EVNTSEL_INT
| P6_EVNTSEL_OS
| P6_EVNTSEL_USR
| P6_NMI_EVENT;
- wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
- write_watchdog_counter("P6_PERFCTR0");
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ write_watchdog_counter(perfctr_msr, "P6_PERFCTR0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd->check_bit = 1ULL<<39;
+ return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
}
+static void stop_p6_watchdog(void)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
+}
+
+/* Note that these events don't tick when the CPU idles. This means
+ the frequency varies with CPU load. */
+
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
+#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
+#define P4_ESCR_OS (1<<3)
+#define P4_ESCR_USR (1<<2)
+#define P4_CCCR_OVF_PMI0 (1<<26)
+#define P4_CCCR_OVF_PMI1 (1<<27)
+#define P4_CCCR_THRESHOLD(N) ((N)<<20)
+#define P4_CCCR_COMPLEMENT (1<<19)
+#define P4_CCCR_COMPARE (1<<18)
+#define P4_CCCR_REQUIRED (3<<16)
+#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
+#define P4_CCCR_ENABLE (1<<12)
+#define P4_CCCR_OVF (1<<31)
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ CRU_ESCR0 (with any non-null event selector) through a complemented
+ max threshold. [IA32-Vol3, Section 14.9.9] */
+
static int setup_p4_watchdog(void)
{
+ unsigned int perfctr_msr, evntsel_msr, cccr_msr;
+ unsigned int evntsel, cccr_val;
unsigned int misc_enable, dummy;
+ unsigned int ht_num;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
+ rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
return 0;
- nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
- nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
#ifdef CONFIG_SMP
- if (smp_num_siblings == 2)
- nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
+ /* detect which hyperthread we are on */
+ if (smp_num_siblings == 2) {
+ unsigned int ebx, apicid;
+
+ ebx = cpuid_ebx(1);
+ apicid = (ebx >> 24) & 0xff;
+ ht_num = apicid & 1;
+ } else
#endif
+ ht_num = 0;
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
- clear_msr_range(0x3F1, 2);
- /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
- docs doesn't fully define it, so leave it alone for now. */
- if (boot_cpu_data.x86_model >= 0x3) {
- /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
- clear_msr_range(0x3A0, 26);
- clear_msr_range(0x3BC, 3);
+ /* performance counters are shared resources
+ * assign each hyperthread its own set
+ * (re-use the ESCR0 register, seems safe
+ * and keeps the cccr_val the same)
+ */
+ if (!ht_num) {
+ /* logical cpu 0 */
+ perfctr_msr = MSR_P4_IQ_PERFCTR0;
+ evntsel_msr = MSR_P4_CRU_ESCR0;
+ cccr_msr = MSR_P4_IQ_CCCR0;
+ cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
} else {
- clear_msr_range(0x3A0, 31);
+ /* logical cpu 1 */
+ perfctr_msr = MSR_P4_IQ_PERFCTR1;
+ evntsel_msr = MSR_P4_CRU_ESCR0;
+ cccr_msr = MSR_P4_IQ_CCCR1;
+ cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
}
- clear_msr_range(0x3C0, 6);
- clear_msr_range(0x3C8, 6);
- clear_msr_range(0x3E0, 2);
- clear_msr_range(MSR_P4_CCCR0, 18);
- clear_msr_range(MSR_P4_PERFCTR0, 18);
-
- wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
- write_watchdog_counter("P4_IQ_COUNTER0");
+
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
+
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ evntsel = P4_ESCR_EVENT_SELECT(0x3F)
+ | P4_ESCR_OS
+ | P4_ESCR_USR;
+
+ cccr_val |= P4_CCCR_THRESHOLD(15)
+ | P4_CCCR_COMPLEMENT
+ | P4_CCCR_COMPARE
+ | P4_CCCR_REQUIRED;
+
+ wrmsr(evntsel_msr, evntsel, 0);
+ wrmsr(cccr_msr, cccr_val, 0);
+ write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0");
apic_write(APIC_LVTPC, APIC_DM_NMI);
- wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
+ cccr_val |= P4_CCCR_ENABLE;
+ wrmsr(cccr_msr, cccr_val, 0);
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = cccr_msr;
+ wd->check_bit = 1ULL<<39;
return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
}
-void setup_apic_nmi_watchdog (void)
+static void stop_p4_watchdog(void)
{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
- return;
- setup_k7_watchdog();
- break;
- case X86_VENDOR_INTEL:
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 0xd)
- return;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- setup_p6_watchdog();
- break;
- case 15:
- if (boot_cpu_data.x86_model > 0x4)
+ wrmsr(wd->cccr_msr, 0, 0);
+ wrmsr(wd->evntsel_msr, 0, 0);
+
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
+}
+
+#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
+#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
+
+static int setup_intel_arch_watchdog(void)
+{
+ unsigned int ebx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ unsigned int perfctr_msr, evntsel_msr;
+ unsigned int evntsel;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Unhalted Core Cycles Event or not.
+ * NOTE: Corresponding bit = 0 in ebx indicates event present.
+ */
+ cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+ (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+ goto fail;
+
+ perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0;
+ evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0;
+
+ if (!reserve_perfctr_nmi(perfctr_msr))
+ goto fail;
+
+ if (!reserve_evntsel_nmi(evntsel_msr))
+ goto fail1;
+
+ wrmsrl(perfctr_msr, 0UL);
+
+ evntsel = ARCH_PERFMON_EVENTSEL_INT
+ | ARCH_PERFMON_EVENTSEL_OS
+ | ARCH_PERFMON_EVENTSEL_USR
+ | ARCH_PERFMON_NMI_EVENT_SEL
+ | ARCH_PERFMON_NMI_EVENT_UMASK;
+
+ /* setup the timer */
+ wrmsr(evntsel_msr, evntsel, 0);
+ write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0");
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+ wrmsr(evntsel_msr, evntsel, 0);
+
+ wd->perfctr_msr = perfctr_msr;
+ wd->evntsel_msr = evntsel_msr;
+ wd->cccr_msr = 0; //unused
+ wd->check_bit = 1ULL << (eax.split.bit_width - 1);
+ return 1;
+fail1:
+ release_perfctr_nmi(perfctr_msr);
+fail:
+ return 0;
+}
+
+static void stop_intel_arch_watchdog(void)
+{
+ unsigned int ebx;
+ union cpuid10_eax eax;
+ unsigned int unused;
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /*
+ * Check whether the Architectural PerfMon supports
+ * Unhalted Core Cycles Event or not.
+ * NOTE: Corresponding bit = 0 in ebx indicates event present.
+ */
+ cpuid(10, &(eax.full), &ebx, &unused, &unused);
+ if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
+ (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
+ return;
+
+ wrmsr(wd->evntsel_msr, 0, 0);
+ release_evntsel_nmi(wd->evntsel_msr);
+ release_perfctr_nmi(wd->perfctr_msr);
+}
+
+void setup_apic_nmi_watchdog (void *unused)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /* only support LOCAL and IO APICs for now */
+ if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+ (nmi_watchdog != NMI_IO_APIC))
+ return;
+
+ if (wd->enabled == 1)
+ return;
+
+ /* cheap hack to support suspend/resume */
+ /* if cpu0 is not active neither should the other cpus */
+ if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0))
+ return;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
+ return;
+ if (!setup_k7_watchdog())
return;
+ break;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ if (!setup_intel_arch_watchdog())
+ return;
+ break;
+ }
+ switch (boot_cpu_data.x86) {
+ case 6:
+ if (boot_cpu_data.x86_model > 0xd)
+ return;
+
+ if (!setup_p6_watchdog())
+ return;
+ break;
+ case 15:
+ if (boot_cpu_data.x86_model > 0x4)
+ return;
- if (!setup_p4_watchdog())
+ if (!setup_p4_watchdog())
+ return;
+ break;
+ default:
return;
+ }
break;
default:
return;
}
- break;
- default:
+ }
+ wd->enabled = 1;
+ atomic_inc(&nmi_active);
+}
+
+void stop_apic_nmi_watchdog(void *unused)
+{
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+
+ /* only support LOCAL and IO APICs for now */
+ if ((nmi_watchdog != NMI_LOCAL_APIC) &&
+ (nmi_watchdog != NMI_IO_APIC))
+ return;
+
+ if (wd->enabled == 0)
return;
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ stop_k7_watchdog();
+ break;
+ case X86_VENDOR_INTEL:
+ if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+ stop_intel_arch_watchdog();
+ break;
+ }
+ switch (boot_cpu_data.x86) {
+ case 6:
+ if (boot_cpu_data.x86_model > 0xd)
+ break;
+ stop_p6_watchdog();
+ break;
+ case 15:
+ if (boot_cpu_data.x86_model > 0x4)
+ break;
+ stop_p4_watchdog();
+ break;
+ }
+ break;
+ default:
+ return;
+ }
}
- lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
- nmi_active = 1;
+ wd->enabled = 0;
+ atomic_dec(&nmi_active);
}
/*
@@ -518,10 +882,11 @@ void touch_nmi_watchdog (void)
*/
touch_softlockup_watchdog();
}
+EXPORT_SYMBOL(touch_nmi_watchdog);
extern void die_nmi(struct pt_regs *, const char *msg);
-void nmi_watchdog_tick (struct pt_regs * regs)
+__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
{
/*
@@ -530,11 +895,23 @@ void nmi_watchdog_tick (struct pt_regs * regs)
* smp_processor_id().
*/
unsigned int sum;
+ int touched = 0;
int cpu = smp_processor_id();
+ struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
+ u64 dummy;
+ int rc=0;
+
+ /* check for other users first */
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP) {
+ rc = 1;
+ touched = 1;
+ }
sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
- if (last_irq_sums[cpu] == sum) {
+ /* if the apic timer isn't firing, this cpu isn't doing much */
+ if (!touched && last_irq_sums[cpu] == sum) {
/*
* Ayiee, looks like this CPU is stuck ...
* wait a few IRQs (5 seconds) before doing the oops ...
@@ -549,26 +926,59 @@ void nmi_watchdog_tick (struct pt_regs * regs)
last_irq_sums[cpu] = sum;
alert_counter[cpu] = 0;
}
- if (nmi_perfctr_msr) {
- if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
+ /* see if the nmi watchdog went off */
+ if (wd->enabled) {
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ rdmsrl(wd->perfctr_msr, dummy);
+ if (dummy & wd->check_bit){
+ /* this wasn't a watchdog timer interrupt */
+ goto done;
+ }
+
+ /* only Intel P4 uses the cccr msr */
+ if (wd->cccr_msr != 0) {
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ rdmsrl(wd->cccr_msr, dummy);
+ dummy &= ~P4_CCCR_OVF;
+ wrmsrl(wd->cccr_msr, dummy);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ }
+ else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
+ wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
+ /* P6 based Pentium M need to re-unmask
+ * the apic vector but it doesn't hurt
+ * other P6 variant.
+ * ArchPerfom/Core Duo also needs this */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ }
+ /* start the cycle over again */
+ write_watchdog_counter(wd->perfctr_msr, NULL);
+ rc = 1;
+ } else if (nmi_watchdog == NMI_IO_APIC) {
+ /* don't know how to accurately check for this.
+ * just assume it was a watchdog timer interrupt
+ * This matches the old behaviour.
*/
- wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
+ rc = 1;
}
- else if (nmi_perfctr_msr == MSR_P6_PERFCTR0) {
- /* Only P6 based Pentium M need to re-unmask
- * the apic vector but it doesn't hurt
- * other P6 variant */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- }
- write_watchdog_counter(NULL);
}
+done:
+ return rc;
+}
+
+int do_nmi_callback(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_SYSCTL
+ if (unknown_nmi_panic)
+ return unknown_nmi_panic_callback(regs, cpu);
+#endif
+ return 0;
}
#ifdef CONFIG_SYSCTL
@@ -578,36 +988,46 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
unsigned char reason = get_nmi_reason();
char buf[64];
- if (!(reason & 0xc0)) {
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(regs, buf);
- }
+ sprintf(buf, "NMI received for unknown reason %02x\n", reason);
+ die_nmi(regs, buf);
return 0;
}
/*
- * proc handler for /proc/sys/kernel/unknown_nmi_panic
+ * proc handler for /proc/sys/kernel/nmi
*/
-int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,
+int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
void __user *buffer, size_t *length, loff_t *ppos)
{
int old_state;
- old_state = unknown_nmi_panic;
+ nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
+ old_state = nmi_watchdog_enabled;
proc_dointvec(table, write, file, buffer, length, ppos);
- if (!!old_state == !!unknown_nmi_panic)
+ if (!!old_state == !!nmi_watchdog_enabled)
return 0;
- if (unknown_nmi_panic) {
- if (reserve_lapic_nmi() < 0) {
- unknown_nmi_panic = 0;
- return -EBUSY;
- } else {
- set_nmi_callback(unknown_nmi_panic_callback);
- }
+ if (atomic_read(&nmi_active) < 0) {
+ printk( KERN_WARNING "NMI watchdog is permanently disabled\n");
+ return -EIO;
+ }
+
+ if (nmi_watchdog == NMI_DEFAULT) {
+ if (nmi_known_cpu() > 0)
+ nmi_watchdog = NMI_LOCAL_APIC;
+ else
+ nmi_watchdog = NMI_IO_APIC;
+ }
+
+ if (nmi_watchdog == NMI_LOCAL_APIC) {
+ if (nmi_watchdog_enabled)
+ enable_lapic_nmi_watchdog();
+ else
+ disable_lapic_nmi_watchdog();
} else {
- release_lapic_nmi();
- unset_nmi_callback();
+ printk( KERN_WARNING
+ "NMI watchdog doesn't know what hardware to touch\n");
+ return -EIO;
}
return 0;
}
@@ -616,7 +1036,11 @@ int proc_unknown_nmi_panic(ctl_table *table, int write, struct file *file,
EXPORT_SYMBOL(nmi_active);
EXPORT_SYMBOL(nmi_watchdog);
-EXPORT_SYMBOL(reserve_lapic_nmi);
-EXPORT_SYMBOL(release_lapic_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
+EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
+EXPORT_SYMBOL(reserve_perfctr_nmi);
+EXPORT_SYMBOL(release_perfctr_nmi);
+EXPORT_SYMBOL(reserve_evntsel_nmi);
+EXPORT_SYMBOL(release_evntsel_nmi);
EXPORT_SYMBOL(disable_timer_nmi_watchdog);
EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c
index 5f5b075f860a..9000d82c6dc0 100644
--- a/arch/i386/kernel/numaq.c
+++ b/arch/i386/kernel/numaq.c
@@ -23,7 +23,6 @@
* Send feedback to <gone@us.ibm.com>
*/
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
@@ -79,10 +78,12 @@ int __init get_memcfg_numaq(void)
return 1;
}
-static int __init numaq_dsc_disable(void)
+static int __init numaq_tsc_disable(void)
{
- printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
- tsc_disable = 1;
+ if (num_online_nodes() > 1) {
+ printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
+ tsc_disable = 1;
+ }
return 0;
}
-core_initcall(numaq_dsc_disable);
+arch_initcall(numaq_tsc_disable);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 6259afea46d1..8c190ca7ae44 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -28,7 +28,6 @@
#include <linux/user.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
-#include <linux/config.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/reboot.h>
@@ -38,6 +37,7 @@
#include <linux/kallsyms.h>
#include <linux/ptrace.h>
#include <linux/random.h>
+#include <linux/personality.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -102,7 +102,7 @@ void default_idle(void)
local_irq_enable();
if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
- clear_thread_flag(TIF_POLLING_NRFLAG);
+ current_thread_info()->status &= ~TS_POLLING;
smp_mb__after_clear_bit();
while (!need_resched()) {
local_irq_disable();
@@ -111,7 +111,7 @@ void default_idle(void)
else
local_irq_enable();
}
- set_thread_flag(TIF_POLLING_NRFLAG);
+ current_thread_info()->status |= TS_POLLING;
} else {
while (!need_resched())
cpu_relax();
@@ -174,7 +174,7 @@ void cpu_idle(void)
{
int cpu = smp_processor_id();
- set_thread_flag(TIF_POLLING_NRFLAG);
+ current_thread_info()->status |= TS_POLLING;
/* endless idle loop with no priority at all */
while (1) {
@@ -312,7 +312,7 @@ void show_regs(struct pt_regs * regs)
cr3 = read_cr3();
cr4 = read_cr4_safe();
printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
- show_trace(NULL, &regs->esp);
+ show_trace(NULL, regs, &regs->esp);
}
/*
@@ -321,15 +321,6 @@ void show_regs(struct pt_regs * regs)
* the "args".
*/
extern void kernel_thread_helper(void);
-__asm__(".section .text\n"
- ".align 4\n"
- "kernel_thread_helper:\n\t"
- "movl %edx,%eax\n\t"
- "pushl %edx\n\t"
- "call *%ebx\n\t"
- "pushl %eax\n\t"
- "call do_exit\n"
- ".previous");
/*
* Create a kernel thread
@@ -347,7 +338,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
regs.xes = __USER_DS;
regs.orig_eax = -1;
regs.eip = (unsigned long) kernel_thread_helper;
- regs.xcs = __KERNEL_CS;
+ regs.xcs = __KERNEL_CS | get_kernel_rpl();
regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
/* Ok, create the new process.. */
@@ -360,16 +351,16 @@ EXPORT_SYMBOL(kernel_thread);
*/
void exit_thread(void)
{
- struct task_struct *tsk = current;
- struct thread_struct *t = &tsk->thread;
-
/* The process may have allocated an io port bitmap... nuke it. */
- if (unlikely(NULL != t->io_bitmap_ptr)) {
+ if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
+ struct task_struct *tsk = current;
+ struct thread_struct *t = &tsk->thread;
int cpu = get_cpu();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
kfree(t->io_bitmap_ptr);
t->io_bitmap_ptr = NULL;
+ clear_thread_flag(TIF_IO_BITMAP);
/*
* Careful, clear this in the TSS too:
*/
@@ -388,6 +379,7 @@ void flush_thread(void)
memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+ clear_tsk_thread_flag(tsk, TIF_DEBUG);
/*
* Forget coprocessor state..
*/
@@ -432,7 +424,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
savesegment(gs,p->thread.gs);
tsk = current;
- if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
+ if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
@@ -440,6 +432,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
}
memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES);
+ set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
/*
@@ -534,10 +527,24 @@ int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
return 1;
}
-static inline void
-handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss)
+static noinline void __switch_to_xtra(struct task_struct *next_p,
+ struct tss_struct *tss)
{
- if (!next->io_bitmap_ptr) {
+ struct thread_struct *next;
+
+ next = &next_p->thread;
+
+ if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
+ set_debugreg(next->debugreg[0], 0);
+ set_debugreg(next->debugreg[1], 1);
+ set_debugreg(next->debugreg[2], 2);
+ set_debugreg(next->debugreg[3], 3);
+ /* no 4 and 5 */
+ set_debugreg(next->debugreg[6], 6);
+ set_debugreg(next->debugreg[7], 7);
+ }
+
+ if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
/*
* Disable the bitmap via an invalid offset. We still cache
* the previous bitmap owner and the IO bitmap contents:
@@ -545,6 +552,7 @@ handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss)
tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
return;
}
+
if (likely(next == tss->io_bitmap_owner)) {
/*
* Previous owner of the bitmap (hence the bitmap content)
@@ -672,20 +680,11 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
set_iopl_mask(next->iopl);
/*
- * Now maybe reload the debug registers
+ * Now maybe handle debug registers and/or IO bitmaps
*/
- if (unlikely(next->debugreg[7])) {
- set_debugreg(next->debugreg[0], 0);
- set_debugreg(next->debugreg[1], 1);
- set_debugreg(next->debugreg[2], 2);
- set_debugreg(next->debugreg[3], 3);
- /* no 4 and 5 */
- set_debugreg(next->debugreg[6], 6);
- set_debugreg(next->debugreg[7], 7);
- }
-
- if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr))
- handle_io_bitmap(next, tss);
+ if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)
+ || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)))
+ __switch_to_xtra(next_p, tss);
disable_tsc(prev_p, next_p);
@@ -898,7 +897,7 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
unsigned long arch_align_stack(unsigned long sp)
{
- if (randomize_va_space)
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
sp -= get_random_int() % 8192;
return sp & ~0xf;
}
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index fd7eaf7866e0..775f50e9395b 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -185,17 +185,17 @@ static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_
return addr;
}
-static inline int is_at_popf(struct task_struct *child, struct pt_regs *regs)
+static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
{
int i, copied;
- unsigned char opcode[16];
+ unsigned char opcode[15];
unsigned long addr = convert_eip_to_linear(child, regs);
copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
for (i = 0; i < copied; i++) {
switch (opcode[i]) {
- /* popf */
- case 0x9d:
+ /* popf and iret */
+ case 0x9d: case 0xcf:
return 1;
/* opcode and address size prefixes */
case 0x66: case 0x67:
@@ -247,7 +247,7 @@ static void set_singlestep(struct task_struct *child)
* don't mark it as being "us" that set it, so that we
* won't clear it by hand later.
*/
- if (is_at_popf(child, regs))
+ if (is_setting_trap_flag(child, regs))
return;
child->ptrace |= PT_DTRACE;
@@ -468,8 +468,11 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
for(i=0; i<4; i++)
if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
goto out_tsk;
+ if (data)
+ set_tsk_thread_flag(child, TIF_DEBUG);
+ else
+ clear_tsk_thread_flag(child, TIF_DEBUG);
}
-
addr -= (long) &dummy->u_debugreg;
addr = addr >> 2;
child->thread.debugreg[addr] = data;
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 87ccdac84928..9f6ab1789bb0 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -1,7 +1,6 @@
/*
* This file contains work-arounds for x86 and x86_64 platform bugs.
*/
-#include <linux/config.h>
#include <linux/pci.h>
#include <linux/irq.h>
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index d207242976d3..84278e0093a2 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -2,7 +2,6 @@
* linux/arch/i386/kernel/reboot.c
*/
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/delay.h>
@@ -146,14 +145,10 @@ real_mode_gdt_entries [3] =
0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
};
-static struct
-{
- unsigned short size __attribute__ ((packed));
- unsigned long long * base __attribute__ ((packed));
-}
-real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, real_mode_gdt_entries },
-real_mode_idt = { 0x3ff, NULL },
-no_idt = { 0, NULL };
+static struct Xgt_desc_struct
+real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
+real_mode_idt = { 0x3ff, 0 },
+no_idt = { 0, 0 };
/* This is 16-bit protected mode code to disable paging and the cache,
diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S
index d312616effa1..f151d6fae462 100644
--- a/arch/i386/kernel/relocate_kernel.S
+++ b/arch/i386/kernel/relocate_kernel.S
@@ -7,16 +7,138 @@
*/
#include <linux/linkage.h>
+#include <asm/page.h>
+#include <asm/kexec.h>
+
+/*
+ * Must be relocatable PIC code callable as a C function
+ */
+
+#define PTR(x) (x << 2)
+#define PAGE_ALIGNED (1 << PAGE_SHIFT)
+#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
+
+ .text
+ .align PAGE_ALIGNED
+ .globl relocate_kernel
+relocate_kernel:
+ movl 8(%esp), %ebp /* list of pages */
+
+#ifdef CONFIG_X86_PAE
+ /* map the control page at its virtual address */
+
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xc0000000, %eax
+ shrl $27, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PMD_0)(%ebp), %edx
+ orl $PAE_PGD_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PMD_0)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x3fe00000, %eax
+ shrl $18, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_0)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_0)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x001ff000, %eax
+ shrl $9, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ /* identity map the control page at its physical address */
+
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xc0000000, %eax
+ shrl $27, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PMD_1)(%ebp), %edx
+ orl $PAE_PGD_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PMD_1)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x3fe00000, %eax
+ shrl $18, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_1)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_1)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x001ff000, %eax
+ shrl $9, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+#else
+ /* map the control page at its virtual address */
+
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xffc00000, %eax
+ shrl $20, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_0)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_0)(%ebp), %edi
+ movl PTR(VA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x003ff000, %eax
+ shrl $10, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ /* identity map the control page at its physical address */
+
+ movl PTR(VA_PGD)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0xffc00000, %eax
+ shrl $20, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_PTE_1)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+
+ movl PTR(VA_PTE_1)(%ebp), %edi
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %eax
+ andl $0x003ff000, %eax
+ shrl $10, %eax
+ addl %edi, %eax
+
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edx
+ orl $PAGE_ATTR, %edx
+ movl %edx, (%eax)
+#endif
- /*
- * Must be relocatable PIC code callable as a C function, that once
- * it starts can not use the previous processes stack.
- */
- .globl relocate_new_kernel
relocate_new_kernel:
/* read the arguments and say goodbye to the stack */
movl 4(%esp), %ebx /* page_list */
- movl 8(%esp), %ebp /* reboot_code_buffer */
+ movl 8(%esp), %ebp /* list of pages */
movl 12(%esp), %edx /* start address */
movl 16(%esp), %ecx /* cpu_has_pae */
@@ -24,11 +146,26 @@ relocate_new_kernel:
pushl $0
popfl
- /* set a new stack at the bottom of our page... */
- lea 4096(%ebp), %esp
+ /* get physical address of control page now */
+ /* this is impossible after page table switch */
+ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
- /* store the parameters back on the stack */
- pushl %edx /* store the start address */
+ /* switch to new set of page tables */
+ movl PTR(PA_PGD)(%ebp), %eax
+ movl %eax, %cr3
+
+ /* setup a new stack at the end of the physical control page */
+ lea 4096(%edi), %esp
+
+ /* jump to identity mapped page */
+ movl %edi, %eax
+ addl $(identity_mapped - relocate_kernel), %eax
+ pushl %eax
+ ret
+
+identity_mapped:
+ /* store the start address on the stack */
+ pushl %edx
/* Set cr0 to a known state:
* 31 0 == Paging disabled
@@ -113,8 +250,3 @@ relocate_new_kernel:
xorl %edi, %edi
xorl %ebp, %ebp
ret
-relocate_new_kernel_end:
-
- .globl relocate_new_kernel_size
-relocate_new_kernel_size:
- .long relocate_new_kernel_end - relocate_new_kernel
diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c
index 321f5fd26e75..c7d3df23f589 100644
--- a/arch/i386/kernel/scx200.c
+++ b/arch/i386/kernel/scx200.c
@@ -4,11 +4,11 @@
National Semiconductor SCx200 support. */
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/scx200.h>
@@ -45,11 +45,19 @@ static struct pci_driver scx200_pci_driver = {
.probe = scx200_probe,
};
-static DEFINE_SPINLOCK(scx200_gpio_config_lock);
+static DEFINE_MUTEX(scx200_gpio_config_lock);
-static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+static void __devinit scx200_init_shadow(void)
{
int bank;
+
+ /* read the current values driven on the GPIO signals */
+ for (bank = 0; bank < 2; ++bank)
+ scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
+}
+
+static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
unsigned base;
if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
@@ -63,10 +71,7 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
}
scx200_gpio_base = base;
-
- /* read the current values driven on the GPIO signals */
- for (bank = 0; bank < 2; ++bank)
- scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
+ scx200_init_shadow();
} else {
/* find the base of the Configuration Block */
@@ -87,12 +92,11 @@ static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_
return 0;
}
-u32 scx200_gpio_configure(int index, u32 mask, u32 bits)
+u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
{
u32 config, new_config;
- unsigned long flags;
- spin_lock_irqsave(&scx200_gpio_config_lock, flags);
+ mutex_lock(&scx200_gpio_config_lock);
outl(index, scx200_gpio_base + 0x20);
config = inl(scx200_gpio_base + 0x24);
@@ -100,45 +104,11 @@ u32 scx200_gpio_configure(int index, u32 mask, u32 bits)
new_config = (config & mask) | bits;
outl(new_config, scx200_gpio_base + 0x24);
- spin_unlock_irqrestore(&scx200_gpio_config_lock, flags);
+ mutex_unlock(&scx200_gpio_config_lock);
return config;
}
-#if 0
-void scx200_gpio_dump(unsigned index)
-{
- u32 config = scx200_gpio_configure(index, ~0, 0);
- printk(KERN_DEBUG "GPIO%02u: 0x%08lx", index, (unsigned long)config);
-
- if (config & 1)
- printk(" OE"); /* output enabled */
- else
- printk(" TS"); /* tristate */
- if (config & 2)
- printk(" PP"); /* push pull */
- else
- printk(" OD"); /* open drain */
- if (config & 4)
- printk(" PUE"); /* pull up enabled */
- else
- printk(" PUD"); /* pull up disabled */
- if (config & 8)
- printk(" LOCKED"); /* locked */
- if (config & 16)
- printk(" LEVEL"); /* level input */
- else
- printk(" EDGE"); /* edge input */
- if (config & 32)
- printk(" HI"); /* trigger on rising edge */
- else
- printk(" LO"); /* trigger on falling edge */
- if (config & 64)
- printk(" DEBOUNCE"); /* debounce */
- printk("\n");
-}
-#endif /* 0 */
-
static int __init scx200_init(void)
{
printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
@@ -159,10 +129,3 @@ EXPORT_SYMBOL(scx200_gpio_base);
EXPORT_SYMBOL(scx200_gpio_shadow);
EXPORT_SYMBOL(scx200_gpio_configure);
EXPORT_SYMBOL(scx200_cb_base);
-
-/*
- Local variables:
- compile-command: "make -k -C ../../.. SUBDIRS=arch/i386/kernel modules"
- c-basic-offset: 8
- End:
-*/
diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c
deleted file mode 100644
index 967dc74df9ee..000000000000
--- a/arch/i386/kernel/semaphore.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * i386 semaphore implementation.
- *
- * (C) Copyright 1999 Linus Torvalds
- *
- * Portions Copyright 1999 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
- */
-#include <linux/config.h>
-#include <asm/semaphore.h>
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- *
- * %eax contains the semaphore pointer on entry. Save the C-clobbered
- * registers (%eax, %edx and %ecx) except %eax whish is either a return
- * value or just clobbered..
- */
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __down_failed\n"
-"__down_failed:\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "pushl %ebp\n\t"
- "movl %esp,%ebp\n\t"
-#endif
- "pushl %edx\n\t"
- "pushl %ecx\n\t"
- "call __down\n\t"
- "popl %ecx\n\t"
- "popl %edx\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "movl %ebp,%esp\n\t"
- "popl %ebp\n\t"
-#endif
- "ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __down_failed_interruptible\n"
-"__down_failed_interruptible:\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "pushl %ebp\n\t"
- "movl %esp,%ebp\n\t"
-#endif
- "pushl %edx\n\t"
- "pushl %ecx\n\t"
- "call __down_interruptible\n\t"
- "popl %ecx\n\t"
- "popl %edx\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "movl %ebp,%esp\n\t"
- "popl %ebp\n\t"
-#endif
- "ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __down_failed_trylock\n"
-"__down_failed_trylock:\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "pushl %ebp\n\t"
- "movl %esp,%ebp\n\t"
-#endif
- "pushl %edx\n\t"
- "pushl %ecx\n\t"
- "call __down_trylock\n\t"
- "popl %ecx\n\t"
- "popl %edx\n\t"
-#if defined(CONFIG_FRAME_POINTER)
- "movl %ebp,%esp\n\t"
- "popl %ebp\n\t"
-#endif
- "ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __up_wakeup\n"
-"__up_wakeup:\n\t"
- "pushl %edx\n\t"
- "pushl %ecx\n\t"
- "call __up\n\t"
- "popl %ecx\n\t"
- "popl %edx\n\t"
- "ret"
-);
-
-/*
- * rw spinlock fallbacks
- */
-#if defined(CONFIG_SMP)
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __write_lock_failed\n"
-"__write_lock_failed:\n\t"
- LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n"
-"1: rep; nop\n\t"
- "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t"
- "jne 1b\n\t"
- LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t"
- "jnz __write_lock_failed\n\t"
- "ret"
-);
-
-asm(
-".section .sched.text\n"
-".align 4\n"
-".globl __read_lock_failed\n"
-"__read_lock_failed:\n\t"
- LOCK_PREFIX "incl (%eax)\n"
-"1: rep; nop\n\t"
- "cmpl $1,(%eax)\n\t"
- "js 1b\n\t"
- LOCK_PREFIX "decl (%eax)\n\t"
- "js __read_lock_failed\n\t"
- "ret"
-);
-#endif
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index e6023970aa40..000cf03751fe 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -23,11 +23,10 @@
* This file handles the architecture-dependent parts of initialization
*/
-#include <linux/config.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
-#include <linux/tty.h>
+#include <linux/screen_info.h>
#include <linux/ioport.h>
#include <linux/acpi.h>
#include <linux/apm_bios.h>
@@ -48,20 +47,20 @@
#include <linux/crash_dump.h>
#include <linux/dmi.h>
#include <linux/pfn.h>
-#include <linux/suspend.h>
#include <video/edid.h>
#include <asm/apic.h>
#include <asm/e820.h>
#include <asm/mpspec.h>
+#include <asm/mmzone.h>
#include <asm/setup.h>
#include <asm/arch_hooks.h>
#include <asm/sections.h>
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/io.h>
-#include "setup_arch_pre.h"
+#include <setup_arch.h>
#include <bios_ebda.h>
/* Forward Declaration. */
@@ -91,18 +90,6 @@ EXPORT_SYMBOL(boot_cpu_data);
unsigned long mmu_cr4_features;
-#ifdef CONFIG_ACPI
- int acpi_disabled = 0;
-#else
- int acpi_disabled = 1;
-#endif
-EXPORT_SYMBOL(acpi_disabled);
-
-#ifdef CONFIG_ACPI
-int __initdata acpi_force = 0;
-extern acpi_interrupt_flags acpi_sci_flags;
-#endif
-
/* for MCA, but anyone else can use it if they want */
unsigned int machine_id;
#ifdef CONFIG_MCA
@@ -150,7 +137,6 @@ EXPORT_SYMBOL(ist_info);
struct e820map e820;
extern void early_cpu_init(void);
-extern void generic_apic_probe(char *);
extern int root_mountflags;
unsigned long saved_videomode;
@@ -223,9 +209,6 @@ static struct resource adapter_rom_resources[] = { {
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
} };
-#define ADAPTER_ROM_RESOURCES \
- (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
-
static struct resource video_rom_resource = {
.name = "Video ROM",
.start = 0xc0000,
@@ -287,9 +270,6 @@ static struct resource standard_io_resources[] = { {
.flags = IORESOURCE_BUSY | IORESOURCE_IO
} };
-#define STANDARD_IO_RESOURCES \
- (sizeof standard_io_resources / sizeof standard_io_resources[0])
-
#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
static int __init romchecksum(unsigned char *rom, unsigned long length)
@@ -346,7 +326,7 @@ static void __init probe_roms(void)
}
/* check for adapter roms on 2k boundaries */
- for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
+ for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
rom = isa_bus_to_virt(start);
if (!romsignature(rom))
continue;
@@ -411,8 +391,8 @@ static void __init limit_regions(unsigned long long size)
}
}
-static void __init add_memory_region(unsigned long long start,
- unsigned long long size, int type)
+void __init add_memory_region(unsigned long long start,
+ unsigned long long size, int type)
{
int x;
@@ -475,7 +455,7 @@ static struct change_member *change_point[2*E820MAX] __initdata;
static struct e820entry *overlap_list[E820MAX] __initdata;
static struct e820entry new_bios[E820MAX] __initdata;
-static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
{
struct change_member *change_tmp;
unsigned long current_type, last_type;
@@ -644,7 +624,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
* thinkpad 560x, for example, does not cooperate with the memory
* detection code.)
*/
-static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
{
/* Only one memory region (or negative)? Ignore it */
if (nr_map < 2)
@@ -702,244 +682,150 @@ static inline void copy_edd(void)
}
#endif
+static int __initdata user_defined_memmap = 0;
+
/*
- * Do NOT EVER look at the BIOS memory size location.
- * It does not work on many machines.
+ * "mem=nopentium" disables the 4MB page tables.
+ * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
+ * to <mem>, overriding the bios size.
+ * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
+ * <start> to <start>+<mem>, overriding the bios size.
+ *
+ * HPA tells me bootloaders need to parse mem=, so no new
+ * option should be mem= [also see Documentation/i386/boot.txt]
*/
-#define LOWMEMSIZE() (0x9f000)
-
-static void __init parse_cmdline_early (char ** cmdline_p)
+static int __init parse_mem(char *arg)
{
- char c = ' ', *to = command_line, *from = saved_command_line;
- int len = 0;
- int userdef = 0;
+ if (!arg)
+ return -EINVAL;
- /* Save unparsed command line copy for /proc/cmdline */
- saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
-
- for (;;) {
- if (c != ' ')
- goto next_char;
- /*
- * "mem=nopentium" disables the 4MB page tables.
- * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
- * to <mem>, overriding the bios size.
- * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
- * <start> to <start>+<mem>, overriding the bios size.
- *
- * HPA tells me bootloaders need to parse mem=, so no new
- * option should be mem= [also see Documentation/i386/boot.txt]
+ if (strcmp(arg, "nopentium") == 0) {
+ clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+ disable_pse = 1;
+ } else {
+ /* If the user specifies memory size, we
+ * limit the BIOS-provided memory map to
+ * that size. exactmap can be used to specify
+ * the exact map. mem=number can be used to
+ * trim the existing memory map.
*/
- if (!memcmp(from, "mem=", 4)) {
- if (to != command_line)
- to--;
- if (!memcmp(from+4, "nopentium", 9)) {
- from += 9+4;
- clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
- disable_pse = 1;
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long mem_size;
-
- mem_size = memparse(from+4, &from);
- limit_regions(mem_size);
- userdef=1;
- }
- }
-
- else if (!memcmp(from, "memmap=", 7)) {
- if (to != command_line)
- to--;
- if (!memcmp(from+7, "exactmap", 8)) {
-#ifdef CONFIG_CRASH_DUMP
- /* If we are doing a crash dump, we
- * still need to know the real mem
- * size before original memory map is
- * reset.
- */
- find_max_pfn();
- saved_max_pfn = max_pfn;
-#endif
- from += 8+7;
- e820.nr_map = 0;
- userdef = 1;
- } else {
- /* If the user specifies memory size, we
- * limit the BIOS-provided memory map to
- * that size. exactmap can be used to specify
- * the exact map. mem=number can be used to
- * trim the existing memory map.
- */
- unsigned long long start_at, mem_size;
+ unsigned long long mem_size;
- mem_size = memparse(from+7, &from);
- if (*from == '@') {
- start_at = memparse(from+1, &from);
- add_memory_region(start_at, mem_size, E820_RAM);
- } else if (*from == '#') {
- start_at = memparse(from+1, &from);
- add_memory_region(start_at, mem_size, E820_ACPI);
- } else if (*from == '$') {
- start_at = memparse(from+1, &from);
- add_memory_region(start_at, mem_size, E820_RESERVED);
- } else {
- limit_regions(mem_size);
- userdef=1;
- }
- }
- }
-
- else if (!memcmp(from, "noexec=", 7))
- noexec_setup(from + 7);
+ mem_size = memparse(arg, &arg);
+ limit_regions(mem_size);
+ user_defined_memmap = 1;
+ }
+ return 0;
+}
+early_param("mem", parse_mem);
+static int __init parse_memmap(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
-#ifdef CONFIG_X86_SMP
- /*
- * If the BIOS enumerates physical processors before logical,
- * maxcpus=N at enumeration-time can be used to disable HT.
+ if (strcmp(arg, "exactmap") == 0) {
+#ifdef CONFIG_CRASH_DUMP
+ /* If we are doing a crash dump, we
+ * still need to know the real mem
+ * size before original memory map is
+ * reset.
*/
- else if (!memcmp(from, "maxcpus=", 8)) {
- extern unsigned int maxcpus;
-
- maxcpus = simple_strtoul(from + 8, NULL, 0);
- }
+ find_max_pfn();
+ saved_max_pfn = max_pfn;
#endif
-
-#ifdef CONFIG_ACPI
- /* "acpi=off" disables both ACPI table parsing and interpreter */
- else if (!memcmp(from, "acpi=off", 8)) {
- disable_acpi();
- }
-
- /* acpi=force to over-ride black-list */
- else if (!memcmp(from, "acpi=force", 10)) {
- acpi_force = 1;
- acpi_ht = 1;
- acpi_disabled = 0;
- }
-
- /* acpi=strict disables out-of-spec workarounds */
- else if (!memcmp(from, "acpi=strict", 11)) {
- acpi_strict = 1;
- }
-
- /* Limit ACPI just to boot-time to enable HT */
- else if (!memcmp(from, "acpi=ht", 7)) {
- if (!acpi_force)
- disable_acpi();
- acpi_ht = 1;
- }
-
- /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
- else if (!memcmp(from, "pci=noacpi", 10)) {
- acpi_disable_pci();
- }
- /* "acpi=noirq" disables ACPI interrupt routing */
- else if (!memcmp(from, "acpi=noirq", 10)) {
- acpi_noirq_set();
+ e820.nr_map = 0;
+ user_defined_memmap = 1;
+ } else {
+ /* If the user specifies memory size, we
+ * limit the BIOS-provided memory map to
+ * that size. exactmap can be used to specify
+ * the exact map. mem=number can be used to
+ * trim the existing memory map.
+ */
+ unsigned long long start_at, mem_size;
+
+ mem_size = memparse(arg, &arg);
+ if (*arg == '@') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_RAM);
+ } else if (*arg == '#') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_ACPI);
+ } else if (*arg == '$') {
+ start_at = memparse(arg+1, &arg);
+ add_memory_region(start_at, mem_size, E820_RESERVED);
+ } else {
+ limit_regions(mem_size);
+ user_defined_memmap = 1;
}
+ }
+ return 0;
+}
+early_param("memmap", parse_memmap);
- else if (!memcmp(from, "acpi_sci=edge", 13))
- acpi_sci_flags.trigger = 1;
-
- else if (!memcmp(from, "acpi_sci=level", 14))
- acpi_sci_flags.trigger = 3;
-
- else if (!memcmp(from, "acpi_sci=high", 13))
- acpi_sci_flags.polarity = 1;
+#ifdef CONFIG_PROC_VMCORE
+/* elfcorehdr= specifies the location of elf core header
+ * stored by the crashed kernel.
+ */
+static int __init parse_elfcorehdr(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
- else if (!memcmp(from, "acpi_sci=low", 12))
- acpi_sci_flags.polarity = 3;
+ elfcorehdr_addr = memparse(arg, &arg);
+ return 0;
+}
+early_param("elfcorehdr", parse_elfcorehdr);
+#endif /* CONFIG_PROC_VMCORE */
-#ifdef CONFIG_X86_IO_APIC
- else if (!memcmp(from, "acpi_skip_timer_override", 24))
- acpi_skip_timer_override = 1;
+/*
+ * highmem=size forces highmem to be exactly 'size' bytes.
+ * This works even on boxes that have no highmem otherwise.
+ * This also works to reduce highmem size on bigger boxes.
+ */
+static int __init parse_highmem(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
- if (!memcmp(from, "disable_timer_pin_1", 19))
- disable_timer_pin_1 = 1;
- if (!memcmp(from, "enable_timer_pin_1", 18))
- disable_timer_pin_1 = -1;
+ highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
+ return 0;
+}
+early_param("highmem", parse_highmem);
- /* disable IO-APIC */
- else if (!memcmp(from, "noapic", 6))
- disable_ioapic_setup();
-#endif /* CONFIG_X86_IO_APIC */
-#endif /* CONFIG_ACPI */
+/*
+ * vmalloc=size forces the vmalloc area to be exactly 'size'
+ * bytes. This can be used to increase (or decrease) the
+ * vmalloc area - the default is 128m.
+ */
+static int __init parse_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
-#ifdef CONFIG_X86_LOCAL_APIC
- /* enable local APIC */
- else if (!memcmp(from, "lapic", 5))
- lapic_enable();
+ __VMALLOC_RESERVE = memparse(arg, &arg);
+ return 0;
+}
+early_param("vmalloc", parse_vmalloc);
- /* disable local APIC */
- else if (!memcmp(from, "nolapic", 6))
- lapic_disable();
-#endif /* CONFIG_X86_LOCAL_APIC */
+/*
+ * reservetop=size reserves a hole at the top of the kernel address space which
+ * a hypervisor can load into later. Needed for dynamically loaded hypervisors,
+ * so relocating the fixmap can be done before paging initialization.
+ */
+static int __init parse_reservetop(char *arg)
+{
+ unsigned long address;
-#ifdef CONFIG_KEXEC
- /* crashkernel=size@addr specifies the location to reserve for
- * a crash kernel. By reserving this memory we guarantee
- * that linux never set's it up as a DMA target.
- * Useful for holding code to do something appropriate
- * after a kernel panic.
- */
- else if (!memcmp(from, "crashkernel=", 12)) {
- unsigned long size, base;
- size = memparse(from+12, &from);
- if (*from == '@') {
- base = memparse(from+1, &from);
- /* FIXME: Do I want a sanity check
- * to validate the memory range?
- */
- crashk_res.start = base;
- crashk_res.end = base + size - 1;
- }
- }
-#endif
-#ifdef CONFIG_PROC_VMCORE
- /* elfcorehdr= specifies the location of elf core header
- * stored by the crashed kernel.
- */
- else if (!memcmp(from, "elfcorehdr=", 11))
- elfcorehdr_addr = memparse(from+11, &from);
-#endif
+ if (!arg)
+ return -EINVAL;
- /*
- * highmem=size forces highmem to be exactly 'size' bytes.
- * This works even on boxes that have no highmem otherwise.
- * This also works to reduce highmem size on bigger boxes.
- */
- else if (!memcmp(from, "highmem=", 8))
- highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
-
- /*
- * vmalloc=size forces the vmalloc area to be exactly 'size'
- * bytes. This can be used to increase (or decrease) the
- * vmalloc area - the default is 128m.
- */
- else if (!memcmp(from, "vmalloc=", 8))
- __VMALLOC_RESERVE = memparse(from+8, &from);
-
- next_char:
- c = *(from++);
- if (!c)
- break;
- if (COMMAND_LINE_SIZE <= ++len)
- break;
- *(to++) = c;
- }
- *to = '\0';
- *cmdline_p = command_line;
- if (userdef) {
- printk(KERN_INFO "user-defined physical RAM map:\n");
- print_memory_map("user");
- }
+ address = memparse(arg, &arg);
+ reserve_top_address(address);
+ return 0;
}
+early_param("reservetop", parse_reservetop);
/*
* Callback for efi_memory_walk.
@@ -1178,6 +1064,14 @@ static unsigned long __init setup_memory(void)
}
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
+ num_physpages = highend_pfn;
+ high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+ num_physpages = max_low_pfn;
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+#ifdef CONFIG_FLATMEM
+ max_mapnr = num_physpages;
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
@@ -1189,22 +1083,20 @@ static unsigned long __init setup_memory(void)
void __init zone_sizes_init(void)
{
- unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
- unsigned int max_dma, low;
-
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
- low = max_low_pfn;
-
- if (low < max_dma)
- zones_size[ZONE_DMA] = low;
- else {
- zones_size[ZONE_DMA] = max_dma;
- zones_size[ZONE_NORMAL] = low - max_dma;
#ifdef CONFIG_HIGHMEM
- zones_size[ZONE_HIGHMEM] = highend_pfn - low;
+ unsigned long max_zone_pfns[MAX_NR_ZONES] = {
+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT,
+ max_low_pfn,
+ highend_pfn};
+ add_active_range(0, 0, highend_pfn);
+#else
+ unsigned long max_zone_pfns[MAX_NR_ZONES] = {
+ virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT,
+ max_low_pfn};
+ add_active_range(0, 0, max_low_pfn);
#endif
- }
- free_area_init(zones_size);
+
+ free_area_init_nodes(max_zone_pfns);
}
#else
extern unsigned long __init setup_memory(void);
@@ -1266,7 +1158,7 @@ void __init setup_bootmem_allocator(void)
*/
find_smp_config();
#endif
-
+ numa_kva_reserve();
#ifdef CONFIG_BLK_DEV_INITRD
if (LOADER_TYPE && INITRD_START) {
if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
@@ -1321,8 +1213,10 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat
probe_roms();
for (i = 0; i < e820.nr_map; i++) {
struct resource *res;
+#ifndef CONFIG_RESOURCES_64BIT
if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
continue;
+#endif
res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
switch (e820.map[i].type) {
case E820_RAM: res->name = "System RAM"; break;
@@ -1333,7 +1227,10 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat
res->start = e820.map[i].addr;
res->end = res->start + e820.map[i].size - 1;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- request_resource(&iomem_resource, res);
+ if (request_resource(&iomem_resource, res)) {
+ kfree(res);
+ continue;
+ }
if (e820.map[i].type == E820_RAM) {
/*
* We don't know which RAM region contains kernel data,
@@ -1369,7 +1266,7 @@ static int __init request_standard_resources(void)
request_resource(&iomem_resource, &video_ram_resource);
/* request I/O space for devices used on all i[345]86 PCs */
- for (i = 0; i < STANDARD_IO_RESOURCES; i++)
+ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
request_resource(&ioport_resource, &standard_io_resources[i]);
return 0;
}
@@ -1424,8 +1321,6 @@ static void __init register_memory(void)
pci_mem_start, gapstart, gapsize);
}
-static char * __init machine_specific_memory_setup(void);
-
#ifdef CONFIG_MCA
static void set_mca_bus(int x)
{
@@ -1435,111 +1330,6 @@ static void set_mca_bus(int x)
static void set_mca_bus(int x) { }
#endif
-#ifdef CONFIG_SOFTWARE_SUSPEND
-static void __init mark_nosave_page_range(unsigned long start, unsigned long end)
-{
- struct page *page;
- while (start <= end) {
- page = pfn_to_page(start);
- SetPageNosave(page);
- start++;
- }
-}
-
-static void __init e820_nosave_reserved_pages(void)
-{
- int i;
- unsigned long r_start = 0, r_end = 0;
-
- /* Assume e820 map is sorted */
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long start, end;
-
- start = PFN_DOWN(ei->addr);
- end = PFN_UP(ei->addr + ei->size);
- if (start >= end)
- continue;
- if (ei->type == E820_RESERVED)
- continue;
- r_end = start;
- /*
- * Highmem 'Reserved' pages are marked as reserved, swsusp
- * will not save/restore them, so we ignore these pages here.
- */
- if (r_end > max_low_pfn)
- r_end = max_low_pfn;
- if (r_end > r_start)
- mark_nosave_page_range(r_start, r_end-1);
- if (r_end >= max_low_pfn)
- break;
- r_start = end;
- }
-}
-
-static void __init e820_save_acpi_pages(void)
-{
- int i;
-
- /* Assume e820 map is sorted */
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- unsigned long start, end;
-
- start = ei->addr;
- end = ei->addr + ei->size;
- if (start >= end)
- continue;
- if (ei->type != E820_ACPI && ei->type != E820_NVS)
- continue;
- /*
- * If the region is below max_low_pfn, it will be
- * saved/restored by swsusp follow 'RAM' type.
- */
- if (start < (max_low_pfn << PAGE_SHIFT))
- start = max_low_pfn << PAGE_SHIFT;
- /*
- * Highmem pages (ACPI NVS/Data) are reserved, but swsusp
- * highmem save/restore will not save/restore them. We marked
- * them as arch saveable pages here
- */
- if (end > start)
- swsusp_add_arch_pages(start, end);
- }
-}
-
-extern char __start_rodata, __end_rodata;
-/*
- * BIOS reserved region/hole - no save/restore
- * ACPI NVS - save/restore
- * ACPI Data - this is a little tricky, the mem could be used by OS after OS
- * reads tables from the region, but anyway save/restore the memory hasn't any
- * side effect and Linux runtime module load/unload might use it.
- * kernel rodata - no save/restore (kernel rodata isn't changed)
- */
-static int __init mark_nosave_pages(void)
-{
- unsigned long pfn_start, pfn_end;
-
- /* FIXME: provide a version for efi BIOS */
- if (efi_enabled)
- return 0;
- /* BIOS reserved regions & holes */
- e820_nosave_reserved_pages();
-
- /* kernel rodata */
- pfn_start = PFN_UP(virt_to_phys(&__start_rodata));
- pfn_end = PFN_DOWN(virt_to_phys(&__end_rodata));
- mark_nosave_page_range(pfn_start, pfn_end-1);
-
- /* record ACPI Data/NVS as saveable */
- e820_save_acpi_pages();
-
- return 0;
-}
-core_initcall(mark_nosave_pages);
-#endif
-
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -1609,17 +1399,15 @@ void __init setup_arch(char **cmdline_p)
data_resource.start = virt_to_phys(_etext);
data_resource.end = virt_to_phys(_edata)-1;
- parse_cmdline_early(cmdline_p);
+ parse_early_param();
-#ifdef CONFIG_EARLY_PRINTK
- {
- char *s = strstr(*cmdline_p, "earlyprintk=");
- if (s) {
- setup_early_printk(strchr(s, '=') + 1);
- printk("early console enabled\n");
- }
+ if (user_defined_memmap) {
+ printk(KERN_INFO "user-defined physical RAM map:\n");
+ print_memory_map("user");
}
-#endif
+
+ strlcpy(command_line, saved_command_line, COMMAND_LINE_SIZE);
+ *cmdline_p = command_line;
max_low_pfn = setup_memory();
@@ -1648,7 +1436,7 @@ void __init setup_arch(char **cmdline_p)
dmi_scan_machine();
#ifdef CONFIG_X86_GENERICARCH
- generic_apic_probe(*cmdline_p);
+ generic_apic_probe();
#endif
if (efi_enabled)
efi_map_memmap();
@@ -1660,9 +1448,11 @@ void __init setup_arch(char **cmdline_p)
acpi_boot_table_init();
#endif
+#ifdef CONFIG_PCI
#ifdef CONFIG_X86_IO_APIC
check_acpi_pci(); /* Checks more than just ACPI actually */
#endif
+#endif
#ifdef CONFIG_ACPI
acpi_boot_init();
@@ -1689,6 +1479,7 @@ void __init setup_arch(char **cmdline_p)
conswitchp = &dummy_con;
#endif
#endif
+ tsc_init();
}
static __init int add_pcspkr(void)
@@ -1708,7 +1499,6 @@ static __init int add_pcspkr(void)
}
device_initcall(add_pcspkr);
-#include "setup_arch_post.h"
/*
* Local Variables:
* mode:c
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 5c352c3a9e7f..43002cfb40c4 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -351,7 +351,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
goto give_sigsegv;
}
- restorer = &__kernel_sigreturn;
+ restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
if (ka->sa.sa_flags & SA_RESTORER)
restorer = ka->sa.sa_restorer;
@@ -447,7 +447,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
goto give_sigsegv;
/* Set up to return from userspace. */
- restorer = &__kernel_rt_sigreturn;
+ restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn);
if (ka->sa.sa_flags & SA_RESTORER)
restorer = ka->sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index d134e9643a58..465188e2d701 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -114,7 +114,17 @@ DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_m
static inline int __prepare_ICR (unsigned int shortcut, int vector)
{
- return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL;
+ unsigned int icr = shortcut | APIC_DEST_LOGICAL;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
}
static inline int __prepare_ICR2 (unsigned int mask)
@@ -624,3 +634,69 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
}
}
+/*
+ * this function sends a 'generic call function' IPI to one other CPU
+ * in the system.
+ *
+ * cpu is a standard Linux logical CPU number.
+ */
+static void
+__smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ struct call_data_struct data;
+ int cpus = 1;
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ wmb();
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (!wait)
+ return;
+
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+}
+
+/*
+ * smp_call_function_single - Run a function on another CPU
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Currently unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Retrurns 0 on success, else a negative status code.
+ *
+ * Does not return until the remote CPU is nearly ready to execute <func>
+ * or is or has executed.
+ */
+
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ /* prevent preemption and reschedule on another processor */
+ int me = get_cpu();
+ if (cpu == me) {
+ WARN_ON(1);
+ put_cpu();
+ return -EBUSY;
+ }
+ spin_lock_bh(&call_lock);
+ __smp_call_function_single(cpu, func, info, nonatomic, wait);
+ spin_unlock_bh(&call_lock);
+ put_cpu();
+ return 0;
+}
+EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 825b2b4ca721..82b26d5ce476 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -34,7 +34,6 @@
* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
#include <linux/module.h>
-#include <linux/config.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -52,6 +51,7 @@
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/arch_hooks.h>
+#include <asm/nmi.h>
#include <mach_apic.h>
#include <mach_wakecpu.h>
@@ -66,12 +66,6 @@ int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
#endif
-/* Package ID of each logical CPU */
-int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-
-/* Core ID of each logical CPU */
-int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
-
/* Last level cache ID of each logical CPU */
int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
@@ -108,6 +102,8 @@ u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
{ [0 ... NR_CPUS-1] = 0xff };
EXPORT_SYMBOL(x86_cpu_to_apicid);
+u8 apicid_2_node[MAX_APICID];
+
/*
* Trampoline 80x86 program as an array.
*/
@@ -183,6 +179,9 @@ static void __devinit smp_store_cpu_info(int id)
*/
if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
+ if (num_possible_cpus() == 1)
+ goto valid_k7;
+
/* Athlon 660/661 is valid. */
if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
goto valid_k7;
@@ -218,14 +217,20 @@ valid_k7:
* then we print a warning if not, and always resync.
*/
-static atomic_t tsc_start_flag = ATOMIC_INIT(0);
-static atomic_t tsc_count_start = ATOMIC_INIT(0);
-static atomic_t tsc_count_stop = ATOMIC_INIT(0);
-static unsigned long long tsc_values[NR_CPUS];
+static struct {
+ atomic_t start_flag;
+ atomic_t count_start;
+ atomic_t count_stop;
+ unsigned long long values[NR_CPUS];
+} tsc __initdata = {
+ .start_flag = ATOMIC_INIT(0),
+ .count_start = ATOMIC_INIT(0),
+ .count_stop = ATOMIC_INIT(0),
+};
#define NR_LOOPS 5
-static void __init synchronize_tsc_bp (void)
+static void __init synchronize_tsc_bp(void)
{
int i;
unsigned long long t0;
@@ -239,7 +244,7 @@ static void __init synchronize_tsc_bp (void)
/* convert from kcyc/sec to cyc/usec */
one_usec = cpu_khz / 1000;
- atomic_set(&tsc_start_flag, 1);
+ atomic_set(&tsc.start_flag, 1);
wmb();
/*
@@ -256,16 +261,16 @@ static void __init synchronize_tsc_bp (void)
/*
* all APs synchronize but they loop on '== num_cpus'
*/
- while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
- mb();
- atomic_set(&tsc_count_stop, 0);
+ while (atomic_read(&tsc.count_start) != num_booting_cpus()-1)
+ cpu_relax();
+ atomic_set(&tsc.count_stop, 0);
wmb();
/*
* this lets the APs save their current TSC:
*/
- atomic_inc(&tsc_count_start);
+ atomic_inc(&tsc.count_start);
- rdtscll(tsc_values[smp_processor_id()]);
+ rdtscll(tsc.values[smp_processor_id()]);
/*
* We clear the TSC in the last loop:
*/
@@ -275,56 +280,54 @@ static void __init synchronize_tsc_bp (void)
/*
* Wait for all APs to leave the synchronization point:
*/
- while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
- mb();
- atomic_set(&tsc_count_start, 0);
+ while (atomic_read(&tsc.count_stop) != num_booting_cpus()-1)
+ cpu_relax();
+ atomic_set(&tsc.count_start, 0);
wmb();
- atomic_inc(&tsc_count_stop);
+ atomic_inc(&tsc.count_stop);
}
sum = 0;
for (i = 0; i < NR_CPUS; i++) {
if (cpu_isset(i, cpu_callout_map)) {
- t0 = tsc_values[i];
+ t0 = tsc.values[i];
sum += t0;
}
}
avg = sum;
do_div(avg, num_booting_cpus());
- sum = 0;
for (i = 0; i < NR_CPUS; i++) {
if (!cpu_isset(i, cpu_callout_map))
continue;
- delta = tsc_values[i] - avg;
+ delta = tsc.values[i] - avg;
if (delta < 0)
delta = -delta;
/*
* We report bigger than 2 microseconds clock differences.
*/
if (delta > 2*one_usec) {
- long realdelta;
+ long long realdelta;
+
if (!buggy) {
buggy = 1;
printk("\n");
}
realdelta = delta;
do_div(realdelta, one_usec);
- if (tsc_values[i] < avg)
+ if (tsc.values[i] < avg)
realdelta = -realdelta;
- if (realdelta > 0)
- printk(KERN_INFO "CPU#%d had %ld usecs TSC "
+ if (realdelta)
+ printk(KERN_INFO "CPU#%d had %Ld usecs TSC "
"skew, fixed it up.\n", i, realdelta);
}
-
- sum += delta;
}
if (!buggy)
printk("passed.\n");
}
-static void __init synchronize_tsc_ap (void)
+static void __init synchronize_tsc_ap(void)
{
int i;
@@ -333,19 +336,21 @@ static void __init synchronize_tsc_ap (void)
* this gets called, so we first wait for the BP to
* finish SMP initialization:
*/
- while (!atomic_read(&tsc_start_flag)) mb();
+ while (!atomic_read(&tsc.start_flag))
+ cpu_relax();
for (i = 0; i < NR_LOOPS; i++) {
- atomic_inc(&tsc_count_start);
- while (atomic_read(&tsc_count_start) != num_booting_cpus())
- mb();
+ atomic_inc(&tsc.count_start);
+ while (atomic_read(&tsc.count_start) != num_booting_cpus())
+ cpu_relax();
- rdtscll(tsc_values[smp_processor_id()]);
+ rdtscll(tsc.values[smp_processor_id()]);
if (i == NR_LOOPS-1)
write_tsc(0, 0);
- atomic_inc(&tsc_count_stop);
- while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+ atomic_inc(&tsc.count_stop);
+ while (atomic_read(&tsc.count_stop) != num_booting_cpus())
+ cpu_relax();
}
}
#undef NR_LOOPS
@@ -451,10 +456,12 @@ cpumask_t cpu_coregroup_map(int cpu)
struct cpuinfo_x86 *c = cpu_data + cpu;
/*
* For perf, we return last level cache shared map.
- * TBD: when power saving sched policy is added, we will return
- * cpu_core_map when power saving policy is enabled
+ * And for power savings, we return cpu_core_map
*/
- return c->llc_shared_map;
+ if (sched_mc_power_savings || sched_smt_power_savings)
+ return cpu_core_map[cpu];
+ else
+ return c->llc_shared_map;
}
/* representing cpus for which sibling maps can be computed */
@@ -470,8 +477,8 @@ set_cpu_sibling_map(int cpu)
if (smp_num_siblings > 1) {
for_each_cpu_mask(i, cpu_sibling_setup_map) {
- if (phys_proc_id[cpu] == phys_proc_id[i] &&
- cpu_core_id[cpu] == cpu_core_id[i]) {
+ if (c[cpu].phys_proc_id == c[i].phys_proc_id &&
+ c[cpu].cpu_core_id == c[i].cpu_core_id) {
cpu_set(i, cpu_sibling_map[cpu]);
cpu_set(cpu, cpu_sibling_map[i]);
cpu_set(i, cpu_core_map[cpu]);
@@ -498,7 +505,7 @@ set_cpu_sibling_map(int cpu)
cpu_set(i, c[cpu].llc_shared_map);
cpu_set(cpu, c[i].llc_shared_map);
}
- if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ if (c[cpu].phys_proc_id == c[i].phys_proc_id) {
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
/*
@@ -640,9 +647,13 @@ static void map_cpu_to_logical_apicid(void)
{
int cpu = smp_processor_id();
int apicid = logical_smp_processor_id();
+ int node = apicid_to_node(hard_smp_processor_id());
+
+ if (!node_online(node))
+ node = first_online_node;
cpu_2_logical_apicid[cpu] = apicid;
- map_cpu_to_node(cpu, apicid_to_node(apicid));
+ map_cpu_to_node(cpu, node);
}
static void unmap_cpu_to_logical_apicid(int cpu)
@@ -945,6 +956,7 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
irq_ctx_init(cpu);
+ x86_cpu_to_apicid[cpu] = apicid;
/*
* This grunge runs the startup process for
* the targeted processor.
@@ -1053,6 +1065,7 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
struct warm_boot_cpu_info info;
struct work_struct task;
int apicid, ret;
+ struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
apicid = x86_cpu_to_apicid[cpu];
if (apicid == BAD_APICID) {
@@ -1060,6 +1073,18 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
goto exit;
}
+ /*
+ * the CPU isn't initialized at boot time, allocate gdt table here.
+ * cpu_init will initialize it
+ */
+ if (!cpu_gdt_descr->address) {
+ cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
+ if (!cpu_gdt_descr->address)
+ printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
+ ret = -ENOMEM;
+ goto exit;
+ }
+
info.complete = &done;
info.apicid = apicid;
info.cpu = cpu;
@@ -1337,8 +1362,8 @@ remove_siblinginfo(int cpu)
cpu_clear(cpu, cpu_sibling_map[sibling]);
cpus_clear(cpu_sibling_map[cpu]);
cpus_clear(cpu_core_map[cpu]);
- phys_proc_id[cpu] = BAD_APICID;
- cpu_core_id[cpu] = BAD_APICID;
+ c[cpu].phys_proc_id = 0;
+ c[cpu].cpu_core_id = 0;
cpu_clear(cpu, cpu_sibling_setup_map);
}
@@ -1357,7 +1382,8 @@ int __cpu_disable(void)
*/
if (cpu == 0)
return -EBUSY;
-
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ stop_apic_nmi_watchdog(NULL);
clear_local_APIC();
/* Allow any queued timer interrupts to get serviced */
local_irq_enable();
@@ -1433,7 +1459,7 @@ int __devinit __cpu_up(unsigned int cpu)
/* Unleash the CPU! */
cpu_set(cpu, smp_commenced_mask);
while (!cpu_isset(cpu, cpu_online_map))
- mb();
+ cpu_relax();
return 0;
}
@@ -1471,3 +1497,16 @@ void __init smp_intr_init(void)
/* IPI for generic function call */
set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
}
+
+/*
+ * If the BIOS enumerates physical processors before logical,
+ * maxcpus=N at enumeration-time can be used to disable HT.
+ */
+static int __init parse_maxcpus(char *arg)
+{
+ extern unsigned int maxcpus;
+
+ maxcpus = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+early_param("maxcpus", parse_maxcpus);
diff --git a/arch/i386/kernel/srat.c b/arch/i386/kernel/srat.c
index 989c85255dbe..f7e735c077c3 100644
--- a/arch/i386/kernel/srat.c
+++ b/arch/i386/kernel/srat.c
@@ -23,7 +23,6 @@
*
* Send feedback to Pat Gaughen <gone@us.ibm.com>
*/
-#include <linux/config.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
@@ -31,6 +30,7 @@
#include <linux/nodemask.h>
#include <asm/srat.h>
#include <asm/topology.h>
+#include <asm/smp.h>
/*
* proximity macros and definitions
@@ -43,7 +43,7 @@
#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
-#define MAX_CHUNKS_PER_NODE 4
+#define MAX_CHUNKS_PER_NODE 3
#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
struct node_memory_chunk_s {
unsigned long start_pfn;
@@ -55,8 +55,7 @@ struct node_memory_chunk_s {
static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS];
static int num_memory_chunks; /* total number of memory chunks */
-static int zholes_size_init;
-static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES];
+static u8 __initdata apicid_to_pxm[MAX_APICID];
extern void * boot_ioremap(unsigned long, unsigned long);
@@ -72,6 +71,8 @@ static void __init parse_cpu_affinity_structure(char *p)
/* mark this node as "seen" in node bitmap */
BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain);
+ apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain;
+
printk("CPU 0x%02X in proximity domain 0x%02X\n",
cpu_affinity->apic_id, cpu_affinity->proximity_domain);
}
@@ -136,50 +137,6 @@ static void __init parse_memory_affinity_structure (char *sratp)
"enabled and removable" : "enabled" ) );
}
-#if MAX_NR_ZONES != 4
-#error "MAX_NR_ZONES != 4, chunk_to_zone requires review"
-#endif
-/* Take a chunk of pages from page frame cstart to cend and count the number
- * of pages in each zone, returned via zones[].
- */
-static __init void chunk_to_zones(unsigned long cstart, unsigned long cend,
- unsigned long *zones)
-{
- unsigned long max_dma;
- extern unsigned long max_low_pfn;
-
- int z;
- unsigned long rend;
-
- /* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide
- * similarly scoped information and should be handled in a consistant
- * manner.
- */
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
- /* Split the hole into the zones in which it falls. Repeatedly
- * take the segment in which the remaining hole starts, round it
- * to the end of that zone.
- */
- memset(zones, 0, MAX_NR_ZONES * sizeof(long));
- while (cstart < cend) {
- if (cstart < max_dma) {
- z = ZONE_DMA;
- rend = (cend < max_dma)? cend : max_dma;
-
- } else if (cstart < max_low_pfn) {
- z = ZONE_NORMAL;
- rend = (cend < max_low_pfn)? cend : max_low_pfn;
-
- } else {
- z = ZONE_HIGHMEM;
- rend = cend;
- }
- zones[z] += rend - cstart;
- cstart = rend;
- }
-}
-
/*
* The SRAT table always lists ascending addresses, so can always
* assume that the first "start" address that you see is the real
@@ -224,7 +181,6 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */
memset(node_memory_chunk, 0, sizeof(node_memory_chunk));
- memset(zholes_size, 0, sizeof(zholes_size));
num_memory_chunks = 0;
while (p < end) {
@@ -283,11 +239,15 @@ static int __init acpi20_parse_srat(struct acpi_table_srat *sratp)
printk("Number of logical nodes in system = %d\n", num_online_nodes());
printk("Number of memory chunks in system = %d\n", num_memory_chunks);
+ for (i = 0; i < MAX_APICID; i++)
+ apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
+
for (j = 0; j < num_memory_chunks; j++){
struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
node_read_chunk(chunk->nid, chunk);
+ add_active_range(chunk->nid, chunk->start_pfn, chunk->end_pfn);
}
for_each_online_node(nid) {
@@ -396,57 +356,7 @@ int __init get_memcfg_from_srat(void)
return acpi20_parse_srat((struct acpi_table_srat *)header);
}
out_err:
+ remove_all_active_ranges();
printk("failed to get NUMA memory information from SRAT table\n");
return 0;
}
-
-/* For each node run the memory list to determine whether there are
- * any memory holes. For each hole determine which ZONE they fall
- * into.
- *
- * NOTE#1: this requires knowledge of the zone boundries and so
- * _cannot_ be performed before those are calculated in setup_memory.
- *
- * NOTE#2: we rely on the fact that the memory chunks are ordered by
- * start pfn number during setup.
- */
-static void __init get_zholes_init(void)
-{
- int nid;
- int c;
- int first;
- unsigned long end = 0;
-
- for_each_online_node(nid) {
- first = 1;
- for (c = 0; c < num_memory_chunks; c++){
- if (node_memory_chunk[c].nid == nid) {
- if (first) {
- end = node_memory_chunk[c].end_pfn;
- first = 0;
-
- } else {
- /* Record any gap between this chunk
- * and the previous chunk on this node
- * against the zones it spans.
- */
- chunk_to_zones(end,
- node_memory_chunk[c].start_pfn,
- &zholes_size[nid * MAX_NR_ZONES]);
- }
- }
- }
- }
-}
-
-unsigned long * __init get_zholes_size(int nid)
-{
- if (!zholes_size_init) {
- zholes_size_init++;
- get_zholes_init();
- }
- if (nid >= MAX_NUMNODES || !node_online(nid))
- printk("%s: nid = %d is invalid/offline. num_online_nodes = %d",
- __FUNCTION__, nid, num_online_nodes());
- return &zholes_size[nid * MAX_NR_ZONES];
-}
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index dd63d4775398..7e639f78b0b9 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,4 @@ ENTRY(sys_call_table)
.long sys_tee /* 315 */
.long sys_vmsplice
.long sys_move_pages
+ .long sys_getcpu
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 0bada1870bdf..713ba39d32c6 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -2,6 +2,8 @@
* linux/arch/i386/kernel/sysenter.c
*
* (C) Copyright 2002 Linus Torvalds
+ * Portions based on the vdso-randomization code from exec-shield:
+ * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar
*
* This file contains the needed initializations to support sysenter.
*/
@@ -13,12 +15,31 @@
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/elf.h>
+#include <linux/mm.h>
+#include <linux/module.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
#include <asm/pgtable.h>
#include <asm/unistd.h>
+/*
+ * Should the kernel map a VDSO page into processes and pass its
+ * address down to glibc upon exec()?
+ */
+unsigned int __read_mostly vdso_enabled = 1;
+
+EXPORT_SYMBOL_GPL(vdso_enabled);
+
+static int __init vdso_setup(char *s)
+{
+ vdso_enabled = simple_strtoul(s, NULL, 0);
+
+ return 1;
+}
+
+__setup("vdso=", vdso_setup);
+
extern asmlinkage void sysenter_entry(void);
void enable_sep_cpu(void)
@@ -45,23 +66,120 @@ void enable_sep_cpu(void)
*/
extern const char vsyscall_int80_start, vsyscall_int80_end;
extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
+static void *syscall_page;
int __init sysenter_setup(void)
{
- void *page = (void *)get_zeroed_page(GFP_ATOMIC);
+ syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
- __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
+#ifdef CONFIG_COMPAT_VDSO
+ __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY);
+ printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
+#else
+ /*
+ * In the non-compat case the ELF coredumping code needs the fixmap:
+ */
+ __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_KERNEL_RO);
+#endif
if (!boot_cpu_has(X86_FEATURE_SEP)) {
- memcpy(page,
+ memcpy(syscall_page,
&vsyscall_int80_start,
&vsyscall_int80_end - &vsyscall_int80_start);
return 0;
}
- memcpy(page,
+ memcpy(syscall_page,
&vsyscall_sysenter_start,
&vsyscall_sysenter_end - &vsyscall_sysenter_start);
return 0;
}
+
+static struct page *syscall_nopage(struct vm_area_struct *vma,
+ unsigned long adr, int *type)
+{
+ struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
+ get_page(p);
+ return p;
+}
+
+/* Prevent VMA merging */
+static void syscall_vma_close(struct vm_area_struct *vma)
+{
+}
+
+static struct vm_operations_struct syscall_vm_ops = {
+ .close = syscall_vma_close,
+ .nopage = syscall_nopage,
+};
+
+/* Defined in vsyscall-sysenter.S */
+extern void SYSENTER_RETURN;
+
+/* Setup a VMA at program startup for the vsyscall page */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret;
+
+ down_write(&mm->mmap_sem);
+ addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+ if (IS_ERR_VALUE(addr)) {
+ ret = addr;
+ goto up_fail;
+ }
+
+ vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL);
+ if (!vma) {
+ ret = -ENOMEM;
+ goto up_fail;
+ }
+
+ vma->vm_start = addr;
+ vma->vm_end = addr + PAGE_SIZE;
+ /* MAYWRITE to allow gdb to COW and set breakpoints */
+ vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
+ vma->vm_flags |= mm->def_flags;
+ vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+ vma->vm_ops = &syscall_vm_ops;
+ vma->vm_mm = mm;
+
+ ret = insert_vm_struct(mm, vma);
+ if (unlikely(ret)) {
+ kmem_cache_free(vm_area_cachep, vma);
+ goto up_fail;
+ }
+
+ current->mm->context.vdso = (void *)addr;
+ current_thread_info()->sysenter_return =
+ (void *)VDSO_SYM(&SYSENTER_RETURN);
+ mm->total_vm++;
+up_fail:
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+ return "[vdso]";
+ return NULL;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+ return NULL;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+ return 0;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+ return 0;
+}
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 9d3074759856..86944acfb647 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -60,7 +60,6 @@
#include "mach_time.h"
#include <linux/timex.h>
-#include <linux/config.h>
#include <asm/hpet.h>
@@ -82,13 +81,6 @@ extern unsigned long wall_jiffies;
DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);
-#include <asm/i8253.h>
-
-DEFINE_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
-struct timer_opts *cur_timer __read_mostly = &timer_none;
-
/*
* This is a special lock that is owned by the CPU and holds the index
* register we are working with. It is required for NMI access to the
@@ -118,99 +110,19 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
}
EXPORT_SYMBOL(rtc_cmos_write);
-/*
- * This version of gettimeofday has microsecond resolution
- * and better than microsecond precision on fast x86 machines with TSC.
- */
-void do_gettimeofday(struct timeval *tv)
-{
- unsigned long seq;
- unsigned long usec, sec;
- unsigned long max_ntp_tick;
-
- do {
- unsigned long lost;
-
- seq = read_seqbegin(&xtime_lock);
-
- usec = cur_timer->get_offset();
- lost = jiffies - wall_jiffies;
-
- /*
- * If time_adjust is negative then NTP is slowing the clock
- * so make sure not to go into next possible interval.
- * Better to lose some accuracy than have time go backwards..
- */
- if (unlikely(time_adjust < 0)) {
- max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
- usec = min(usec, max_ntp_tick);
-
- if (lost)
- usec += lost * max_ntp_tick;
- }
- else if (unlikely(lost))
- usec += lost * (USEC_PER_SEC / HZ);
-
- sec = xtime.tv_sec;
- usec += (xtime.tv_nsec / 1000);
- } while (read_seqretry(&xtime_lock, seq));
-
- while (usec >= 1000000) {
- usec -= 1000000;
- sec++;
- }
-
- tv->tv_sec = sec;
- tv->tv_usec = usec;
-}
-
-EXPORT_SYMBOL(do_gettimeofday);
-
-int do_settimeofday(struct timespec *tv)
-{
- time_t wtm_sec, sec = tv->tv_sec;
- long wtm_nsec, nsec = tv->tv_nsec;
-
- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
- return -EINVAL;
-
- write_seqlock_irq(&xtime_lock);
- /*
- * This is revolting. We need to set "xtime" correctly. However, the
- * value in this location is the value at the most recent update of
- * wall time. Discover what correction gettimeofday() would have
- * made, and then undo it!
- */
- nsec -= cur_timer->get_offset() * NSEC_PER_USEC;
- nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
-
- wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
- wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
- set_normalized_timespec(&xtime, sec, nsec);
- set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
- ntp_clear();
- write_sequnlock_irq(&xtime_lock);
- clock_was_set();
- return 0;
-}
-
-EXPORT_SYMBOL(do_settimeofday);
-
static int set_rtc_mmss(unsigned long nowtime)
{
int retval;
-
- WARN_ON(irqs_disabled());
+ unsigned long flags;
/* gets recalled with irq locally disabled */
- spin_lock_irq(&rtc_lock);
+ /* XXX - does irqsave resolve this? -johnstul */
+ spin_lock_irqsave(&rtc_lock, flags);
if (efi_enabled)
retval = efi_set_rtc_mmss(nowtime);
else
retval = mach_set_rtc_mmss(nowtime);
- spin_unlock_irq(&rtc_lock);
+ spin_unlock_irqrestore(&rtc_lock, flags);
return retval;
}
@@ -218,35 +130,50 @@ static int set_rtc_mmss(unsigned long nowtime)
int timer_ack;
-/* monotonic_clock(): returns # of nanoseconds passed since time_init()
- * Note: This function is required to return accurate
- * time even in the absence of multiple timer ticks.
- */
-unsigned long long monotonic_clock(void)
-{
- return cur_timer->monotonic_clock();
-}
-EXPORT_SYMBOL(monotonic_clock);
-
-#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
unsigned long profile_pc(struct pt_regs *regs)
{
unsigned long pc = instruction_pointer(regs);
- if (in_lock_functions(pc))
+#ifdef CONFIG_SMP
+ if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
return *(unsigned long *)(regs->ebp + 4);
-
+#else
+ unsigned long *sp;
+ if ((regs->xcs & 3) == 0)
+ sp = (unsigned long *)&regs->esp;
+ else
+ sp = (unsigned long *)regs->esp;
+ /* Return address is either directly at stack pointer
+ or above a saved eflags. Eflags has bits 22-31 zero,
+ kernel addresses don't. */
+ if (sp[0] >> 22)
+ return sp[0];
+ if (sp[1] >> 22)
+ return sp[1];
+#endif
+ }
+#endif
return pc;
}
EXPORT_SYMBOL(profile_pc);
-#endif
/*
- * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
+ * This is the same as the above, except we _also_ save the current
+ * Time Stamp Counter value at the time of the timer interrupt, so that
+ * we later on can estimate the time of day more exactly.
*/
-static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
+irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
+ /*
+ * Here we are in the timer irq handler. We just have irqs locally
+ * disabled but we don't know if the timer_bh is running on the other
+ * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
+ * the irq version of write_lock because as just said we have irq
+ * locally disabled. -arca
+ */
+ write_seqlock(&xtime_lock);
+
#ifdef CONFIG_X86_IO_APIC
if (timer_ack) {
/*
@@ -279,27 +206,6 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
irq = inb_p( 0x61 ); /* read the current state */
outb_p( irq|0x80, 0x61 ); /* reset the IRQ */
}
-}
-
-/*
- * This is the same as the above, except we _also_ save the current
- * Time Stamp Counter value at the time of the timer interrupt, so that
- * we later on can estimate the time of day more exactly.
- */
-irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
-{
- /*
- * Here we are in the timer irq handler. We just have irqs locally
- * disabled but we don't know if the timer_bh is running on the other
- * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
- * the irq version of write_lock because as just said we have irq
- * locally disabled. -arca
- */
- write_seqlock(&xtime_lock);
-
- cur_timer->mark_offset();
-
- do_timer_interrupt(irq, regs);
write_sequnlock(&xtime_lock);
@@ -315,15 +221,16 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
unsigned long get_cmos_time(void)
{
unsigned long retval;
+ unsigned long flags;
- spin_lock(&rtc_lock);
+ spin_lock_irqsave(&rtc_lock, flags);
if (efi_enabled)
retval = efi_get_time();
else
retval = mach_get_cmos_time();
- spin_unlock(&rtc_lock);
+ spin_unlock_irqrestore(&rtc_lock, flags);
return retval;
}
@@ -378,21 +285,19 @@ void notify_arch_cmos_timer(void)
mod_timer(&sync_cmos_timer, jiffies + 1);
}
-static long clock_cmos_diff, sleep_start;
+static long clock_cmos_diff;
+static unsigned long sleep_start;
-static struct timer_opts *last_timer;
static int timer_suspend(struct sys_device *dev, pm_message_t state)
{
/*
* Estimate time zone so that set_time can update the clock
*/
- clock_cmos_diff = -get_cmos_time();
+ unsigned long ctime = get_cmos_time();
+
+ clock_cmos_diff = -ctime;
clock_cmos_diff += get_seconds();
- sleep_start = get_cmos_time();
- last_timer = cur_timer;
- cur_timer = &timer_none;
- if (last_timer->suspend)
- last_timer->suspend(state);
+ sleep_start = ctime;
return 0;
}
@@ -400,25 +305,32 @@ static int timer_resume(struct sys_device *dev)
{
unsigned long flags;
unsigned long sec;
- unsigned long sleep_length;
-
+ unsigned long ctime = get_cmos_time();
+ long sleep_length = (ctime - sleep_start) * HZ;
+ struct timespec ts;
+
+ if (sleep_length < 0) {
+ printk(KERN_WARNING "CMOS clock skew detected in timer resume!\n");
+ /* The time after the resume must not be earlier than the time
+ * before the suspend or some nasty things will happen
+ */
+ sleep_length = 0;
+ ctime = sleep_start;
+ }
#ifdef CONFIG_HPET_TIMER
if (is_hpet_enabled())
hpet_reenable();
#endif
setup_pit_timer();
- sec = get_cmos_time() + clock_cmos_diff;
- sleep_length = (get_cmos_time() - sleep_start) * HZ;
+
+ sec = ctime + clock_cmos_diff;
+ ts.tv_sec = sec;
+ ts.tv_nsec = 0;
+ do_settimeofday(&ts);
write_seqlock_irqsave(&xtime_lock, flags);
- xtime.tv_sec = sec;
- xtime.tv_nsec = 0;
jiffies_64 += sleep_length;
wall_jiffies += sleep_length;
write_sequnlock_irqrestore(&xtime_lock, flags);
- if (last_timer->resume)
- last_timer->resume();
- cur_timer = last_timer;
- last_timer = NULL;
touch_softlockup_watchdog();
return 0;
}
@@ -451,24 +363,23 @@ extern void (*late_time_init)(void);
/* Duplicate of time_init() below, with hpet_enable part added */
static void __init hpet_time_init(void)
{
- xtime.tv_sec = get_cmos_time();
- xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
+ struct timespec ts;
+ ts.tv_sec = get_cmos_time();
+ ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+
+ do_settimeofday(&ts);
if ((hpet_enable() >= 0) && hpet_use_timer) {
printk("Using HPET for base-timer\n");
}
- cur_timer = select_timer();
- printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
-
time_init_hook();
}
#endif
void __init time_init(void)
{
+ struct timespec ts;
#ifdef CONFIG_HPET_TIMER
if (is_hpet_capable()) {
/*
@@ -479,13 +390,10 @@ void __init time_init(void)
return;
}
#endif
- xtime.tv_sec = get_cmos_time();
- xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
+ ts.tv_sec = get_cmos_time();
+ ts.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
- cur_timer = select_timer();
- printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+ do_settimeofday(&ts);
time_init_hook();
}
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index a529f0cdce17..6bf14a4e995e 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -18,7 +18,6 @@
#include <asm/apic.h>
#include <linux/timex.h>
-#include <linux/config.h>
#include <asm/hpet.h>
#include <linux/hpet.h>
@@ -302,23 +301,25 @@ int hpet_rtc_timer_init(void)
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
local_irq_save(flags);
+
cnt = hpet_readl(HPET_COUNTER);
cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
hpet_writel(cnt, HPET_T1_CMP);
hpet_t1_cmp = cnt;
- local_irq_restore(flags);
cfg = hpet_readl(HPET_T1_CFG);
cfg &= ~HPET_TN_PERIODIC;
cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
hpet_writel(cfg, HPET_T1_CFG);
+ local_irq_restore(flags);
+
return 1;
}
static void hpet_rtc_timer_reinit(void)
{
- unsigned int cfg, cnt;
+ unsigned int cfg, cnt, ticks_per_int, lost_ints;
if (unlikely(!(PIE_on | AIE_on | UIE_on))) {
cfg = hpet_readl(HPET_T1_CFG);
@@ -333,10 +334,33 @@ static void hpet_rtc_timer_reinit(void)
hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
/* It is more accurate to use the comparator value than current count.*/
- cnt = hpet_t1_cmp;
- cnt += hpet_tick*HZ/hpet_rtc_int_freq;
- hpet_writel(cnt, HPET_T1_CMP);
- hpet_t1_cmp = cnt;
+ ticks_per_int = hpet_tick * HZ / hpet_rtc_int_freq;
+ hpet_t1_cmp += ticks_per_int;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+ /*
+ * If the interrupt handler was delayed too long, the write above tries
+ * to schedule the next interrupt in the past and the hardware would
+ * not interrupt until the counter had wrapped around.
+ * So we have to check that the comparator wasn't set to a past time.
+ */
+ cnt = hpet_readl(HPET_COUNTER);
+ if (unlikely((int)(cnt - hpet_t1_cmp) > 0)) {
+ lost_ints = (cnt - hpet_t1_cmp) / ticks_per_int + 1;
+ /* Make sure that, even with the time needed to execute
+ * this code, the next scheduled interrupt has been moved
+ * back to the future: */
+ lost_ints++;
+
+ hpet_t1_cmp += lost_ints * ticks_per_int;
+ hpet_writel(hpet_t1_cmp, HPET_T1_CMP);
+
+ if (PIE_on)
+ PIE_count += lost_ints;
+
+ printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n",
+ hpet_rtc_int_freq);
+ }
}
/*
diff --git a/arch/i386/kernel/timers/Makefile b/arch/i386/kernel/timers/Makefile
deleted file mode 100644
index 8fa12be658dd..000000000000
--- a/arch/i386/kernel/timers/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for x86 timers
-#
-
-obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o
-
-obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o
-obj-$(CONFIG_HPET_TIMER) += timer_hpet.o
-obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o
diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c
deleted file mode 100644
index 8163fe0cf1f0..000000000000
--- a/arch/i386/kernel/timers/common.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Common functions used across the timers go here
- */
-
-#include <linux/init.h>
-#include <linux/timex.h>
-#include <linux/errno.h>
-#include <linux/jiffies.h>
-#include <linux/module.h>
-
-#include <asm/io.h>
-#include <asm/timer.h>
-#include <asm/hpet.h>
-
-#include "mach_timer.h"
-
-/* ------ Calibrate the TSC -------
- * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
- * Too much 64-bit arithmetic here to do this cleanly in C, and for
- * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
- * output busy loop as low as possible. We avoid reading the CTC registers
- * directly because of the awkward 8-bit access mechanism of the 82C54
- * device.
- */
-
-#define CALIBRATE_TIME (5 * 1000020/HZ)
-
-unsigned long calibrate_tsc(void)
-{
- mach_prepare_counter();
-
- {
- unsigned long startlow, starthigh;
- unsigned long endlow, endhigh;
- unsigned long count;
-
- rdtsc(startlow,starthigh);
- mach_countup(&count);
- rdtsc(endlow,endhigh);
-
-
- /* Error: ECTCNEVERSET */
- if (count <= 1)
- goto bad_ctc;
-
- /* 64-bit subtract - gcc just messes up with long longs */
- __asm__("subl %2,%0\n\t"
- "sbbl %3,%1"
- :"=a" (endlow), "=d" (endhigh)
- :"g" (startlow), "g" (starthigh),
- "0" (endlow), "1" (endhigh));
-
- /* Error: ECPUTOOFAST */
- if (endhigh)
- goto bad_ctc;
-
- /* Error: ECPUTOOSLOW */
- if (endlow <= CALIBRATE_TIME)
- goto bad_ctc;
-
- __asm__("divl %2"
- :"=a" (endlow), "=d" (endhigh)
- :"r" (endlow), "0" (0), "1" (CALIBRATE_TIME));
-
- return endlow;
- }
-
- /*
- * The CTC wasn't reliable: we got a hit on the very first read,
- * or the CPU was so fast/slow that the quotient wouldn't fit in
- * 32 bits..
- */
-bad_ctc:
- return 0;
-}
-
-#ifdef CONFIG_HPET_TIMER
-/* ------ Calibrate the TSC using HPET -------
- * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq.
- * Second output is parameter 1 (when non NULL)
- * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet().
- * calibrate_tsc() calibrates the processor TSC by comparing
- * it to the HPET timer of known frequency.
- * Too much 64-bit arithmetic here to do this cleanly in C
- */
-#define CALIBRATE_CNT_HPET (5 * hpet_tick)
-#define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC)
-
-unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr)
-{
- unsigned long tsc_startlow, tsc_starthigh;
- unsigned long tsc_endlow, tsc_endhigh;
- unsigned long hpet_start, hpet_end;
- unsigned long result, remain;
-
- hpet_start = hpet_readl(HPET_COUNTER);
- rdtsc(tsc_startlow, tsc_starthigh);
- do {
- hpet_end = hpet_readl(HPET_COUNTER);
- } while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET);
- rdtsc(tsc_endlow, tsc_endhigh);
-
- /* 64-bit subtract - gcc just messes up with long longs */
- __asm__("subl %2,%0\n\t"
- "sbbl %3,%1"
- :"=a" (tsc_endlow), "=d" (tsc_endhigh)
- :"g" (tsc_startlow), "g" (tsc_starthigh),
- "0" (tsc_endlow), "1" (tsc_endhigh));
-
- /* Error: ECPUTOOFAST */
- if (tsc_endhigh)
- goto bad_calibration;
-
- /* Error: ECPUTOOSLOW */
- if (tsc_endlow <= CALIBRATE_TIME_HPET)
- goto bad_calibration;
-
- ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET);
- if (remain > (tsc_endlow >> 1))
- result++; /* rounding the result */
-
- if (tsc_hpet_quotient_ptr) {
- unsigned long tsc_hpet_quotient;
-
- ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0,
- CALIBRATE_CNT_HPET);
- if (remain > (tsc_endlow >> 1))
- tsc_hpet_quotient++; /* rounding the result */
- *tsc_hpet_quotient_ptr = tsc_hpet_quotient;
- }
-
- return result;
-bad_calibration:
- /*
- * the CPU was so fast/slow that the quotient wouldn't fit in
- * 32 bits..
- */
- return 0;
-}
-#endif
-
-
-unsigned long read_timer_tsc(void)
-{
- unsigned long retval;
- rdtscl(retval);
- return retval;
-}
-
-
-/* calculate cpu_khz */
-void init_cpu_khz(void)
-{
- if (cpu_has_tsc) {
- unsigned long tsc_quotient = calibrate_tsc();
- if (tsc_quotient) {
- /* report CPU clock rate in Hz.
- * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
- * clock/second. Our precision is about 100 ppm.
- */
- { unsigned long eax=0, edx=1000;
- __asm__("divl %2"
- :"=a" (cpu_khz), "=d" (edx)
- :"r" (tsc_quotient),
- "0" (eax), "1" (edx));
- printk("Detected %u.%03u MHz processor.\n",
- cpu_khz / 1000, cpu_khz % 1000);
- }
- }
- }
-}
-
diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c
deleted file mode 100644
index 7e39ed8e33f8..000000000000
--- a/arch/i386/kernel/timers/timer.c
+++ /dev/null
@@ -1,75 +0,0 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <asm/timer.h>
-
-#ifdef CONFIG_HPET_TIMER
-/*
- * HPET memory read is slower than tsc reads, but is more dependable as it
- * always runs at constant frequency and reduces complexity due to
- * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use
- * timer_pit when HPET is active. So, we default to timer_tsc.
- */
-#endif
-/* list of timers, ordered by preference, NULL terminated */
-static struct init_timer_opts* __initdata timers[] = {
-#ifdef CONFIG_X86_CYCLONE_TIMER
- &timer_cyclone_init,
-#endif
-#ifdef CONFIG_HPET_TIMER
- &timer_hpet_init,
-#endif
-#ifdef CONFIG_X86_PM_TIMER
- &timer_pmtmr_init,
-#endif
- &timer_tsc_init,
- &timer_pit_init,
- NULL,
-};
-
-static char clock_override[10] __initdata;
-
-static int __init clock_setup(char* str)
-{
- if (str)
- strlcpy(clock_override, str, sizeof(clock_override));
- return 1;
-}
-__setup("clock=", clock_setup);
-
-
-/* The chosen timesource has been found to be bad.
- * Fall back to a known good timesource (the PIT)
- */
-void clock_fallback(void)
-{
- cur_timer = &timer_pit;
-}
-
-/* iterates through the list of timers, returning the first
- * one that initializes successfully.
- */
-struct timer_opts* __init select_timer(void)
-{
- int i = 0;
-
- /* find most preferred working timer */
- while (timers[i]) {
- if (timers[i]->init)
- if (timers[i]->init(clock_override) == 0)
- return timers[i]->opts;
- ++i;
- }
-
- panic("select_timer: Cannot find a suitable timer\n");
- return NULL;
-}
-
-int read_current_timer(unsigned long *timer_val)
-{
- if (cur_timer->read_timer) {
- *timer_val = cur_timer->read_timer();
- return 0;
- }
- return -1;
-}
diff --git a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c
deleted file mode 100644
index 13892a65c941..000000000000
--- a/arch/i386/kernel/timers/timer_cyclone.c
+++ /dev/null
@@ -1,259 +0,0 @@
-/* Cyclone-timer:
- * This code implements timer_ops for the cyclone counter found
- * on IBM x440, x360, and other Summit based systems.
- *
- * Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com)
- */
-
-
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <linux/timex.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/jiffies.h>
-
-#include <asm/timer.h>
-#include <asm/io.h>
-#include <asm/pgtable.h>
-#include <asm/fixmap.h>
-#include <asm/i8253.h>
-
-#include "io_ports.h"
-
-/* Number of usecs that the last interrupt was delayed */
-static int delay_at_last_interrupt;
-
-#define CYCLONE_CBAR_ADDR 0xFEB00CD0
-#define CYCLONE_PMCC_OFFSET 0x51A0
-#define CYCLONE_MPMC_OFFSET 0x51D0
-#define CYCLONE_MPCS_OFFSET 0x51A8
-#define CYCLONE_TIMER_FREQ 100000000
-#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */
-int use_cyclone = 0;
-
-static u32* volatile cyclone_timer; /* Cyclone MPMC0 register */
-static u32 last_cyclone_low;
-static u32 last_cyclone_high;
-static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
-
-/* helper macro to atomically read both cyclone counter registers */
-#define read_cyclone_counter(low,high) \
- do{ \
- high = cyclone_timer[1]; low = cyclone_timer[0]; \
- } while (high != cyclone_timer[1]);
-
-
-static void mark_offset_cyclone(void)
-{
- unsigned long lost, delay;
- unsigned long delta = last_cyclone_low;
- int count;
- unsigned long long this_offset, last_offset;
-
- write_seqlock(&monotonic_lock);
- last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
-
- spin_lock(&i8253_lock);
- read_cyclone_counter(last_cyclone_low,last_cyclone_high);
-
- /* read values for delay_at_last_interrupt */
- outb_p(0x00, 0x43); /* latch the count ASAP */
-
- count = inb_p(0x40); /* read the latched count */
- count |= inb(0x40) << 8;
-
- /*
- * VIA686a test code... reset the latch if count > max + 1
- * from timer_pit.c - cjb
- */
- if (count > LATCH) {
- outb_p(0x34, PIT_MODE);
- outb_p(LATCH & 0xff, PIT_CH0);
- outb(LATCH >> 8, PIT_CH0);
- count = LATCH - 1;
- }
- spin_unlock(&i8253_lock);
-
- /* lost tick compensation */
- delta = last_cyclone_low - delta;
- delta /= (CYCLONE_TIMER_FREQ/1000000);
- delta += delay_at_last_interrupt;
- lost = delta/(1000000/HZ);
- delay = delta%(1000000/HZ);
- if (lost >= 2)
- jiffies_64 += lost-1;
-
- /* update the monotonic base value */
- this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
- monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK;
- write_sequnlock(&monotonic_lock);
-
- /* calculate delay_at_last_interrupt */
- count = ((LATCH-1) - count) * TICK_SIZE;
- delay_at_last_interrupt = (count + LATCH/2) / LATCH;
-
-
- /* catch corner case where tick rollover occured
- * between cyclone and pit reads (as noted when
- * usec delta is > 90% # of usecs/tick)
- */
- if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
- jiffies_64++;
-}
-
-static unsigned long get_offset_cyclone(void)
-{
- u32 offset;
-
- if(!cyclone_timer)
- return delay_at_last_interrupt;
-
- /* Read the cyclone timer */
- offset = cyclone_timer[0];
-
- /* .. relative to previous jiffy */
- offset = offset - last_cyclone_low;
-
- /* convert cyclone ticks to microseconds */
- /* XXX slow, can we speed this up? */
- offset = offset/(CYCLONE_TIMER_FREQ/1000000);
-
- /* our adjusted time offset in microseconds */
- return delay_at_last_interrupt + offset;
-}
-
-static unsigned long long monotonic_clock_cyclone(void)
-{
- u32 now_low, now_high;
- unsigned long long last_offset, this_offset, base;
- unsigned long long ret;
- unsigned seq;
-
- /* atomically read monotonic base & last_offset */
- do {
- seq = read_seqbegin(&monotonic_lock);
- last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
- base = monotonic_base;
- } while (read_seqretry(&monotonic_lock, seq));
-
-
- /* Read the cyclone counter */
- read_cyclone_counter(now_low,now_high);
- this_offset = ((unsigned long long)now_high<<32)|now_low;
-
- /* convert to nanoseconds */
- ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK);
- return ret * (1000000000 / CYCLONE_TIMER_FREQ);
-}
-
-static int __init init_cyclone(char* override)
-{
- u32* reg;
- u32 base; /* saved cyclone base address */
- u32 pageaddr; /* page that contains cyclone_timer register */
- u32 offset; /* offset from pageaddr to cyclone_timer register */
- int i;
-
- /* check clock override */
- if (override[0] && strncmp(override,"cyclone",7))
- return -ENODEV;
-
- /*make sure we're on a summit box*/
- if(!use_cyclone) return -ENODEV;
-
- printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
-
- /* find base address */
- pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK;
- offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK);
- set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
- reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
- if(!reg){
- printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
- return -ENODEV;
- }
- base = *reg;
- if(!base){
- printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
- return -ENODEV;
- }
-
- /* setup PMCC */
- pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK;
- offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK);
- set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
- reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
- if(!reg){
- printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
- return -ENODEV;
- }
- reg[0] = 0x00000001;
-
- /* setup MPCS */
- pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK;
- offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK);
- set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
- reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
- if(!reg){
- printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
- return -ENODEV;
- }
- reg[0] = 0x00000001;
-
- /* map in cyclone_timer */
- pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK;
- offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK);
- set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
- cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
- if(!cyclone_timer){
- printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
- return -ENODEV;
- }
-
- /*quick test to make sure its ticking*/
- for(i=0; i<3; i++){
- u32 old = cyclone_timer[0];
- int stall = 100;
- while(stall--) barrier();
- if(cyclone_timer[0] == old){
- printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
- cyclone_timer = 0;
- return -ENODEV;
- }
- }
-
- init_cpu_khz();
-
- /* Everything looks good! */
- return 0;
-}
-
-
-static void delay_cyclone(unsigned long loops)
-{
- unsigned long bclock, now;
- if(!cyclone_timer)
- return;
- bclock = cyclone_timer[0];
- do {
- rep_nop();
- now = cyclone_timer[0];
- } while ((now-bclock) < loops);
-}
-/************************************************************/
-
-/* cyclone timer_opts struct */
-static struct timer_opts timer_cyclone = {
- .name = "cyclone",
- .mark_offset = mark_offset_cyclone,
- .get_offset = get_offset_cyclone,
- .monotonic_clock = monotonic_clock_cyclone,
- .delay = delay_cyclone,
-};
-
-struct init_timer_opts __initdata timer_cyclone_init = {
- .init = init_cyclone,
- .opts = &timer_cyclone,
-};
diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c
deleted file mode 100644
index 17a6fe7166e7..000000000000
--- a/arch/i386/kernel/timers/timer_hpet.c
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * This code largely moved from arch/i386/kernel/time.c.
- * See comments there for proper credits.
- */
-
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <linux/timex.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/jiffies.h>
-
-#include <asm/timer.h>
-#include <asm/io.h>
-#include <asm/processor.h>
-
-#include "io_ports.h"
-#include "mach_timer.h"
-#include <asm/hpet.h>
-
-static unsigned long hpet_usec_quotient __read_mostly; /* convert hpet clks to usec */
-static unsigned long tsc_hpet_quotient __read_mostly; /* convert tsc to hpet clks */
-static unsigned long hpet_last; /* hpet counter value at last tick*/
-static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
-static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
-static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
-
-/* convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- * ns = cycles / (freq / ns_per_sec)
- * ns = cycles * (ns_per_sec / freq)
- * ns = cycles * (10^9 / (cpu_khz * 10^3))
- * ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- * ns = cycles * (10^6 * SC / cpu_khz) / SC
- * ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better percision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- * -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-static unsigned long cyc2ns_scale __read_mostly;
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
-static inline void set_cyc2ns_scale(unsigned long cpu_khz)
-{
- cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
-}
-
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
- return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
-static unsigned long long monotonic_clock_hpet(void)
-{
- unsigned long long last_offset, this_offset, base;
- unsigned seq;
-
- /* atomically read monotonic base & last_offset */
- do {
- seq = read_seqbegin(&monotonic_lock);
- last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- base = monotonic_base;
- } while (read_seqretry(&monotonic_lock, seq));
-
- /* Read the Time Stamp Counter */
- rdtscll(this_offset);
-
- /* return the value in ns */
- return base + cycles_2_ns(this_offset - last_offset);
-}
-
-static unsigned long get_offset_hpet(void)
-{
- register unsigned long eax, edx;
-
- eax = hpet_readl(HPET_COUNTER);
- eax -= hpet_last; /* hpet delta */
- eax = min(hpet_tick, eax);
- /*
- * Time offset = (hpet delta) * ( usecs per HPET clock )
- * = (hpet delta) * ( usecs per tick / HPET clocks per tick)
- * = (hpet delta) * ( hpet_usec_quotient ) / (2^32)
- *
- * Where,
- * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
- *
- * Using a mull instead of a divl saves some cycles in critical path.
- */
- ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax);
-
- /* our adjusted time offset in microseconds */
- return edx;
-}
-
-static void mark_offset_hpet(void)
-{
- unsigned long long this_offset, last_offset;
- unsigned long offset;
-
- write_seqlock(&monotonic_lock);
- last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- rdtsc(last_tsc_low, last_tsc_high);
-
- if (hpet_use_timer)
- offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
- else
- offset = hpet_readl(HPET_COUNTER);
- if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) {
- int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1;
- jiffies_64 += lost_ticks;
- }
- hpet_last = offset;
-
- /* update the monotonic base value */
- this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- monotonic_base += cycles_2_ns(this_offset - last_offset);
- write_sequnlock(&monotonic_lock);
-}
-
-static void delay_hpet(unsigned long loops)
-{
- unsigned long hpet_start, hpet_end;
- unsigned long eax;
-
- /* loops is the number of cpu cycles. Convert it to hpet clocks */
- ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops);
-
- hpet_start = hpet_readl(HPET_COUNTER);
- do {
- rep_nop();
- hpet_end = hpet_readl(HPET_COUNTER);
- } while ((hpet_end - hpet_start) < (loops));
-}
-
-static struct timer_opts timer_hpet;
-
-static int __init init_hpet(char* override)
-{
- unsigned long result, remain;
-
- /* check clock override */
- if (override[0] && strncmp(override,"hpet",4))
- return -ENODEV;
-
- if (!is_hpet_enabled())
- return -ENODEV;
-
- printk("Using HPET for gettimeofday\n");
- if (cpu_has_tsc) {
- unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient);
- if (tsc_quotient) {
- /* report CPU clock rate in Hz.
- * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
- * clock/second. Our precision is about 100 ppm.
- */
- { unsigned long eax=0, edx=1000;
- ASM_DIV64_REG(cpu_khz, edx, tsc_quotient,
- eax, edx);
- printk("Detected %u.%03u MHz processor.\n",
- cpu_khz / 1000, cpu_khz % 1000);
- }
- set_cyc2ns_scale(cpu_khz);
- }
- /* set this only when cpu_has_tsc */
- timer_hpet.read_timer = read_timer_tsc;
- }
-
- /*
- * Math to calculate hpet to usec multiplier
- * Look for the comments at get_offset_hpet()
- */
- ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC);
- if (remain > (hpet_tick >> 1))
- result++; /* rounding the result */
- hpet_usec_quotient = result;
-
- return 0;
-}
-
-static int hpet_resume(void)
-{
- write_seqlock(&monotonic_lock);
- /* Assume this is the last mark offset time */
- rdtsc(last_tsc_low, last_tsc_high);
-
- if (hpet_use_timer)
- hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick;
- else
- hpet_last = hpet_readl(HPET_COUNTER);
- write_sequnlock(&monotonic_lock);
- return 0;
-}
-/************************************************************/
-
-/* tsc timer_opts struct */
-static struct timer_opts timer_hpet __read_mostly = {
- .name = "hpet",
- .mark_offset = mark_offset_hpet,
- .get_offset = get_offset_hpet,
- .monotonic_clock = monotonic_clock_hpet,
- .delay = delay_hpet,
- .resume = hpet_resume,
-};
-
-struct init_timer_opts __initdata timer_hpet_init = {
- .init = init_hpet,
- .opts = &timer_hpet,
-};
diff --git a/arch/i386/kernel/timers/timer_none.c b/arch/i386/kernel/timers/timer_none.c
deleted file mode 100644
index 4ea2f414dbbd..000000000000
--- a/arch/i386/kernel/timers/timer_none.c
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <linux/init.h>
-#include <asm/timer.h>
-
-static void mark_offset_none(void)
-{
- /* nothing needed */
-}
-
-static unsigned long get_offset_none(void)
-{
- return 0;
-}
-
-static unsigned long long monotonic_clock_none(void)
-{
- return 0;
-}
-
-static void delay_none(unsigned long loops)
-{
- int d0;
- __asm__ __volatile__(
- "\tjmp 1f\n"
- ".align 16\n"
- "1:\tjmp 2f\n"
- ".align 16\n"
- "2:\tdecl %0\n\tjns 2b"
- :"=&a" (d0)
- :"0" (loops));
-}
-
-/* none timer_opts struct */
-struct timer_opts timer_none = {
- .name = "none",
- .mark_offset = mark_offset_none,
- .get_offset = get_offset_none,
- .monotonic_clock = monotonic_clock_none,
- .delay = delay_none,
-};
diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c
deleted file mode 100644
index b9b6bd56b9ba..000000000000
--- a/arch/i386/kernel/timers/timer_pit.c
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * This code largely moved from arch/i386/kernel/time.c.
- * See comments there for proper credits.
- */
-
-#include <linux/spinlock.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/sysdev.h>
-#include <linux/timex.h>
-#include <asm/delay.h>
-#include <asm/mpspec.h>
-#include <asm/timer.h>
-#include <asm/smp.h>
-#include <asm/io.h>
-#include <asm/arch_hooks.h>
-#include <asm/i8253.h>
-
-#include "do_timer.h"
-#include "io_ports.h"
-
-static int count_p; /* counter in get_offset_pit() */
-
-static int __init init_pit(char* override)
-{
- /* check clock override */
- if (override[0] && strncmp(override,"pit",3))
- printk(KERN_ERR "Warning: clock= override failed. Defaulting "
- "to PIT\n");
- init_cpu_khz();
- count_p = LATCH;
- return 0;
-}
-
-static void mark_offset_pit(void)
-{
- /* nothing needed */
-}
-
-static unsigned long long monotonic_clock_pit(void)
-{
- return 0;
-}
-
-static void delay_pit(unsigned long loops)
-{
- int d0;
- __asm__ __volatile__(
- "\tjmp 1f\n"
- ".align 16\n"
- "1:\tjmp 2f\n"
- ".align 16\n"
- "2:\tdecl %0\n\tjns 2b"
- :"=&a" (d0)
- :"0" (loops));
-}
-
-
-/* This function must be called with xtime_lock held.
- * It was inspired by Steve McCanne's microtime-i386 for BSD. -- jrs
- *
- * However, the pc-audio speaker driver changes the divisor so that
- * it gets interrupted rather more often - it loads 64 into the
- * counter rather than 11932! This has an adverse impact on
- * do_gettimeoffset() -- it stops working! What is also not
- * good is that the interval that our timer function gets called
- * is no longer 10.0002 ms, but 9.9767 ms. To get around this
- * would require using a different timing source. Maybe someone
- * could use the RTC - I know that this can interrupt at frequencies
- * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix
- * it so that at startup, the timer code in sched.c would select
- * using either the RTC or the 8253 timer. The decision would be
- * based on whether there was any other device around that needed
- * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz,
- * and then do some jiggery to have a version of do_timer that
- * advanced the clock by 1/1024 s. Every time that reached over 1/100
- * of a second, then do all the old code. If the time was kept correct
- * then do_gettimeoffset could just return 0 - there is no low order
- * divider that can be accessed.
- *
- * Ideally, you would be able to use the RTC for the speaker driver,
- * but it appears that the speaker driver really needs interrupt more
- * often than every 120 us or so.
- *
- * Anyway, this needs more thought.... pjsg (1993-08-28)
- *
- * If you are really that interested, you should be reading
- * comp.protocols.time.ntp!
- */
-
-static unsigned long get_offset_pit(void)
-{
- int count;
- unsigned long flags;
- static unsigned long jiffies_p = 0;
-
- /*
- * cache volatile jiffies temporarily; we have xtime_lock.
- */
- unsigned long jiffies_t;
-
- spin_lock_irqsave(&i8253_lock, flags);
- /* timer count may underflow right here */
- outb_p(0x00, PIT_MODE); /* latch the count ASAP */
-
- count = inb_p(PIT_CH0); /* read the latched count */
-
- /*
- * We do this guaranteed double memory access instead of a _p
- * postfix in the previous port access. Wheee, hackady hack
- */
- jiffies_t = jiffies;
-
- count |= inb_p(PIT_CH0) << 8;
-
- /* VIA686a test code... reset the latch if count > max + 1 */
- if (count > LATCH) {
- outb_p(0x34, PIT_MODE);
- outb_p(LATCH & 0xff, PIT_CH0);
- outb(LATCH >> 8, PIT_CH0);
- count = LATCH - 1;
- }
-
- /*
- * avoiding timer inconsistencies (they are rare, but they happen)...
- * there are two kinds of problems that must be avoided here:
- * 1. the timer counter underflows
- * 2. hardware problem with the timer, not giving us continuous time,
- * the counter does small "jumps" upwards on some Pentium systems,
- * (see c't 95/10 page 335 for Neptun bug.)
- */
-
- if( jiffies_t == jiffies_p ) {
- if( count > count_p ) {
- /* the nutcase */
- count = do_timer_overflow(count);
- }
- } else
- jiffies_p = jiffies_t;
-
- count_p = count;
-
- spin_unlock_irqrestore(&i8253_lock, flags);
-
- count = ((LATCH-1) - count) * TICK_SIZE;
- count = (count + LATCH/2) / LATCH;
-
- return count;
-}
-
-
-/* tsc timer_opts struct */
-struct timer_opts timer_pit = {
- .name = "pit",
- .mark_offset = mark_offset_pit,
- .get_offset = get_offset_pit,
- .monotonic_clock = monotonic_clock_pit,
- .delay = delay_pit,
-};
-
-struct init_timer_opts __initdata timer_pit_init = {
- .init = init_pit,
- .opts = &timer_pit,
-};
-
-void setup_pit_timer(void)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&i8253_lock, flags);
- outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
- udelay(10);
- outb_p(LATCH & 0xff , PIT_CH0); /* LSB */
- udelay(10);
- outb(LATCH >> 8 , PIT_CH0); /* MSB */
- spin_unlock_irqrestore(&i8253_lock, flags);
-}
diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c
deleted file mode 100644
index 144e94a04933..000000000000
--- a/arch/i386/kernel/timers/timer_pm.c
+++ /dev/null
@@ -1,342 +0,0 @@
-/*
- * (C) Dominik Brodowski <linux@brodo.de> 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- */
-
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <asm/types.h>
-#include <asm/timer.h>
-#include <asm/smp.h>
-#include <asm/io.h>
-#include <asm/arch_hooks.h>
-
-#include <linux/timex.h>
-#include "mach_timer.h"
-
-/* Number of PMTMR ticks expected during calibration run */
-#define PMTMR_TICKS_PER_SEC 3579545
-#define PMTMR_EXPECTED_RATE \
- ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10))
-
-
-/* The I/O port the PMTMR resides at.
- * The location is detected during setup_arch(),
- * in arch/i386/acpi/boot.c */
-u32 pmtmr_ioport = 0;
-
-
-/* value of the Power timer at last timer interrupt */
-static u32 offset_tick;
-static u32 offset_delay;
-
-static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
-
-#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
-
-static int pmtmr_need_workaround __read_mostly = 1;
-
-/*helper function to safely read acpi pm timesource*/
-static inline u32 read_pmtmr(void)
-{
- if (pmtmr_need_workaround) {
- u32 v1, v2, v3;
-
- /* It has been reported that because of various broken
- * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time
- * source is not latched, so you must read it multiple
- * times to insure a safe value is read.
- */
- do {
- v1 = inl(pmtmr_ioport);
- v2 = inl(pmtmr_ioport);
- v3 = inl(pmtmr_ioport);
- } while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
- || (v3 > v1 && v3 < v2));
-
- /* mask the output to 24 bits */
- return v2 & ACPI_PM_MASK;
- }
-
- return inl(pmtmr_ioport) & ACPI_PM_MASK;
-}
-
-
-/*
- * Some boards have the PMTMR running way too fast. We check
- * the PMTMR rate against PIT channel 2 to catch these cases.
- */
-static int verify_pmtmr_rate(void)
-{
- u32 value1, value2;
- unsigned long count, delta;
-
- mach_prepare_counter();
- value1 = read_pmtmr();
- mach_countup(&count);
- value2 = read_pmtmr();
- delta = (value2 - value1) & ACPI_PM_MASK;
-
- /* Check that the PMTMR delta is within 5% of what we expect */
- if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 ||
- delta > (PMTMR_EXPECTED_RATE * 21) / 20) {
- printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE);
- return -1;
- }
-
- return 0;
-}
-
-
-static int init_pmtmr(char* override)
-{
- u32 value1, value2;
- unsigned int i;
-
- if (override[0] && strncmp(override,"pmtmr",5))
- return -ENODEV;
-
- if (!pmtmr_ioport)
- return -ENODEV;
-
- /* we use the TSC for delay_pmtmr, so make sure it exists */
- if (!cpu_has_tsc)
- return -ENODEV;
-
- /* "verify" this timing source */
- value1 = read_pmtmr();
- for (i = 0; i < 10000; i++) {
- value2 = read_pmtmr();
- if (value2 == value1)
- continue;
- if (value2 > value1)
- goto pm_good;
- if ((value2 < value1) && ((value2) < 0xFFF))
- goto pm_good;
- printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2);
- return -EINVAL;
- }
- printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1);
- return -ENODEV;
-
-pm_good:
- if (verify_pmtmr_rate() != 0)
- return -ENODEV;
-
- init_cpu_khz();
- return 0;
-}
-
-static inline u32 cyc2us(u32 cycles)
-{
- /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
- * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
- *
- * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
- * easily be multiplied with 286 (=0x11E) without having to fear
- * u32 overflows.
- */
- cycles *= 286;
- return (cycles >> 10);
-}
-
-/*
- * this gets called during each timer interrupt
- * - Called while holding the writer xtime_lock
- */
-static void mark_offset_pmtmr(void)
-{
- u32 lost, delta, last_offset;
- static int first_run = 1;
- last_offset = offset_tick;
-
- write_seqlock(&monotonic_lock);
-
- offset_tick = read_pmtmr();
-
- /* calculate tick interval */
- delta = (offset_tick - last_offset) & ACPI_PM_MASK;
-
- /* convert to usecs */
- delta = cyc2us(delta);
-
- /* update the monotonic base value */
- monotonic_base += delta * NSEC_PER_USEC;
- write_sequnlock(&monotonic_lock);
-
- /* convert to ticks */
- delta += offset_delay;
- lost = delta / (USEC_PER_SEC / HZ);
- offset_delay = delta % (USEC_PER_SEC / HZ);
-
-
- /* compensate for lost ticks */
- if (lost >= 2)
- jiffies_64 += lost - 1;
-
- /* don't calculate delay for first run,
- or if we've got less then a tick */
- if (first_run || (lost < 1)) {
- first_run = 0;
- offset_delay = 0;
- }
-}
-
-static int pmtmr_resume(void)
-{
- write_seqlock(&monotonic_lock);
- /* Assume this is the last mark offset time */
- offset_tick = read_pmtmr();
- write_sequnlock(&monotonic_lock);
- return 0;
-}
-
-static unsigned long long monotonic_clock_pmtmr(void)
-{
- u32 last_offset, this_offset;
- unsigned long long base, ret;
- unsigned seq;
-
-
- /* atomically read monotonic base & last_offset */
- do {
- seq = read_seqbegin(&monotonic_lock);
- last_offset = offset_tick;
- base = monotonic_base;
- } while (read_seqretry(&monotonic_lock, seq));
-
- /* Read the pmtmr */
- this_offset = read_pmtmr();
-
- /* convert to nanoseconds */
- ret = (this_offset - last_offset) & ACPI_PM_MASK;
- ret = base + (cyc2us(ret) * NSEC_PER_USEC);
- return ret;
-}
-
-static void delay_pmtmr(unsigned long loops)
-{
- unsigned long bclock, now;
-
- rdtscl(bclock);
- do
- {
- rep_nop();
- rdtscl(now);
- } while ((now-bclock) < loops);
-}
-
-
-/*
- * get the offset (in microseconds) from the last call to mark_offset()
- * - Called holding a reader xtime_lock
- */
-static unsigned long get_offset_pmtmr(void)
-{
- u32 now, offset, delta = 0;
-
- offset = offset_tick;
- now = read_pmtmr();
- delta = (now - offset)&ACPI_PM_MASK;
-
- return (unsigned long) offset_delay + cyc2us(delta);
-}
-
-
-/* acpi timer_opts struct */
-static struct timer_opts timer_pmtmr = {
- .name = "pmtmr",
- .mark_offset = mark_offset_pmtmr,
- .get_offset = get_offset_pmtmr,
- .monotonic_clock = monotonic_clock_pmtmr,
- .delay = delay_pmtmr,
- .read_timer = read_timer_tsc,
- .resume = pmtmr_resume,
-};
-
-struct init_timer_opts __initdata timer_pmtmr_init = {
- .init = init_pmtmr,
- .opts = &timer_pmtmr,
-};
-
-#ifdef CONFIG_PCI
-/*
- * PIIX4 Errata:
- *
- * The power management timer may return improper results when read.
- * Although the timer value settles properly after incrementing,
- * while incrementing there is a 3 ns window every 69.8 ns where the
- * timer value is indeterminate (a 4.2% chance that the data will be
- * incorrect when read). As a result, the ACPI free running count up
- * timer specification is violated due to erroneous reads.
- */
-static int __init pmtmr_bug_check(void)
-{
- static struct pci_device_id gray_list[] __initdata = {
- /* these chipsets may have bug. */
- { PCI_DEVICE(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82801DB_0) },
- { },
- };
- struct pci_dev *dev;
- int pmtmr_has_bug = 0;
- u8 rev;
-
- if (cur_timer != &timer_pmtmr || !pmtmr_need_workaround)
- return 0;
-
- dev = pci_get_device(PCI_VENDOR_ID_INTEL,
- PCI_DEVICE_ID_INTEL_82371AB_3, NULL);
- if (dev) {
- pci_read_config_byte(dev, PCI_REVISION_ID, &rev);
- /* the bug has been fixed in PIIX4M */
- if (rev < 3) {
- printk(KERN_WARNING "* Found PM-Timer Bug on this "
- "chipset. Due to workarounds for a bug,\n"
- "* this time source is slow. Consider trying "
- "other time sources (clock=)\n");
- pmtmr_has_bug = 1;
- }
- pci_dev_put(dev);
- }
-
- if (pci_dev_present(gray_list)) {
- printk(KERN_WARNING "* This chipset may have PM-Timer Bug. Due"
- " to workarounds for a bug,\n"
- "* this time source is slow. If you are sure your timer"
- " does not have\n"
- "* this bug, please use \"pmtmr_good\" to disable the "
- "workaround\n");
- pmtmr_has_bug = 1;
- }
-
- if (!pmtmr_has_bug)
- pmtmr_need_workaround = 0;
-
- return 0;
-}
-device_initcall(pmtmr_bug_check);
-#endif
-
-static int __init pmtr_good_setup(char *__str)
-{
- pmtmr_need_workaround = 0;
- return 1;
-}
-__setup("pmtmr_good", pmtr_good_setup);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86");
diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c
deleted file mode 100644
index f1187ddb0d0f..000000000000
--- a/arch/i386/kernel/timers/timer_tsc.c
+++ /dev/null
@@ -1,617 +0,0 @@
-/*
- * This code largely moved from arch/i386/kernel/time.c.
- * See comments there for proper credits.
- *
- * 2004-06-25 Jesper Juhl
- * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4
- * failing to inline.
- */
-
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <linux/timex.h>
-#include <linux/errno.h>
-#include <linux/cpufreq.h>
-#include <linux/string.h>
-#include <linux/jiffies.h>
-
-#include <asm/timer.h>
-#include <asm/io.h>
-/* processor.h for distable_tsc flag */
-#include <asm/processor.h>
-
-#include "io_ports.h"
-#include "mach_timer.h"
-
-#include <asm/hpet.h>
-#include <asm/i8253.h>
-
-#ifdef CONFIG_HPET_TIMER
-static unsigned long hpet_usec_quotient;
-static unsigned long hpet_last;
-static struct timer_opts timer_tsc;
-#endif
-
-static inline void cpufreq_delayed_get(void);
-
-int tsc_disable __devinitdata = 0;
-
-static int use_tsc;
-/* Number of usecs that the last interrupt was delayed */
-static int delay_at_last_interrupt;
-
-static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
-static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
-static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
-
-/* Avoid compensating for lost ticks before TSCs are synched */
-static int detect_lost_ticks;
-static int __init start_lost_tick_compensation(void)
-{
- detect_lost_ticks = 1;
- return 0;
-}
-late_initcall(start_lost_tick_compensation);
-
-/* convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- * ns = cycles / (freq / ns_per_sec)
- * ns = cycles * (ns_per_sec / freq)
- * ns = cycles * (10^9 / (cpu_khz * 10^3))
- * ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- * ns = cycles * (10^6 * SC / cpu_khz) / SC
- * ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better percision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- * -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-static unsigned long cyc2ns_scale __read_mostly;
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
-static inline void set_cyc2ns_scale(unsigned long cpu_khz)
-{
- cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
-}
-
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
- return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
-static int count2; /* counter for mark_offset_tsc() */
-
-/* Cached *multiplier* to convert TSC counts to microseconds.
- * (see the equation below).
- * Equal to 2^32 * (1 / (clocks per usec) ).
- * Initialized in time_init.
- */
-static unsigned long fast_gettimeoffset_quotient;
-
-static unsigned long get_offset_tsc(void)
-{
- register unsigned long eax, edx;
-
- /* Read the Time Stamp Counter */
-
- rdtsc(eax,edx);
-
- /* .. relative to previous jiffy (32 bits is enough) */
- eax -= last_tsc_low; /* tsc_low delta */
-
- /*
- * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
- * = (tsc_low delta) * (usecs_per_clock)
- * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy)
- *
- * Using a mull instead of a divl saves up to 31 clock cycles
- * in the critical path.
- */
-
- __asm__("mull %2"
- :"=a" (eax), "=d" (edx)
- :"rm" (fast_gettimeoffset_quotient),
- "0" (eax));
-
- /* our adjusted time offset in microseconds */
- return delay_at_last_interrupt + edx;
-}
-
-static unsigned long long monotonic_clock_tsc(void)
-{
- unsigned long long last_offset, this_offset, base;
- unsigned seq;
-
- /* atomically read monotonic base & last_offset */
- do {
- seq = read_seqbegin(&monotonic_lock);
- last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- base = monotonic_base;
- } while (read_seqretry(&monotonic_lock, seq));
-
- /* Read the Time Stamp Counter */
- rdtscll(this_offset);
-
- /* return the value in ns */
- return base + cycles_2_ns(this_offset - last_offset);
-}
-
-/*
- * Scheduler clock - returns current time in nanosec units.
- */
-unsigned long long sched_clock(void)
-{
- unsigned long long this_offset;
-
- /*
- * In the NUMA case we dont use the TSC as they are not
- * synchronized across all CPUs.
- */
-#ifndef CONFIG_NUMA
- if (!use_tsc)
-#endif
- /* no locking but a rare wrong value is not a big deal */
- return jiffies_64 * (1000000000 / HZ);
-
- /* Read the Time Stamp Counter */
- rdtscll(this_offset);
-
- /* return the value in ns */
- return cycles_2_ns(this_offset);
-}
-
-static void delay_tsc(unsigned long loops)
-{
- unsigned long bclock, now;
-
- rdtscl(bclock);
- do
- {
- rep_nop();
- rdtscl(now);
- } while ((now-bclock) < loops);
-}
-
-#ifdef CONFIG_HPET_TIMER
-static void mark_offset_tsc_hpet(void)
-{
- unsigned long long this_offset, last_offset;
- unsigned long offset, temp, hpet_current;
-
- write_seqlock(&monotonic_lock);
- last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- /*
- * It is important that these two operations happen almost at
- * the same time. We do the RDTSC stuff first, since it's
- * faster. To avoid any inconsistencies, we need interrupts
- * disabled locally.
- */
- /*
- * Interrupts are just disabled locally since the timer irq
- * has the SA_INTERRUPT flag set. -arca
- */
- /* read Pentium cycle counter */
-
- hpet_current = hpet_readl(HPET_COUNTER);
- rdtsc(last_tsc_low, last_tsc_high);
-
- /* lost tick compensation */
- offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
- if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))
- && detect_lost_ticks) {
- int lost_ticks = (offset - hpet_last) / hpet_tick;
- jiffies_64 += lost_ticks;
- }
- hpet_last = hpet_current;
-
- /* update the monotonic base value */
- this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- monotonic_base += cycles_2_ns(this_offset - last_offset);
- write_sequnlock(&monotonic_lock);
-
- /* calculate delay_at_last_interrupt */
- /*
- * Time offset = (hpet delta) * ( usecs per HPET clock )
- * = (hpet delta) * ( usecs per tick / HPET clocks per tick)
- * = (hpet delta) * ( hpet_usec_quotient ) / (2^32)
- * Where,
- * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
- */
- delay_at_last_interrupt = hpet_current - offset;
- ASM_MUL64_REG(temp, delay_at_last_interrupt,
- hpet_usec_quotient, delay_at_last_interrupt);
-}
-#endif
-
-
-#ifdef CONFIG_CPU_FREQ
-#include <linux/workqueue.h>
-
-static unsigned int cpufreq_delayed_issched = 0;
-static unsigned int cpufreq_init = 0;
-static struct work_struct cpufreq_delayed_get_work;
-
-static void handle_cpufreq_delayed_get(void *v)
-{
- unsigned int cpu;
- for_each_online_cpu(cpu) {
- cpufreq_get(cpu);
- }
- cpufreq_delayed_issched = 0;
-}
-
-/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries
- * to verify the CPU frequency the timing core thinks the CPU is running
- * at is still correct.
- */
-static inline void cpufreq_delayed_get(void)
-{
- if (cpufreq_init && !cpufreq_delayed_issched) {
- cpufreq_delayed_issched = 1;
- printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
- schedule_work(&cpufreq_delayed_get_work);
- }
-}
-
-/* If the CPU frequency is scaled, TSC-based delays will need a different
- * loops_per_jiffy value to function properly.
- */
-
-static unsigned int ref_freq = 0;
-static unsigned long loops_per_jiffy_ref = 0;
-
-#ifndef CONFIG_SMP
-static unsigned long fast_gettimeoffset_ref = 0;
-static unsigned int cpu_khz_ref = 0;
-#endif
-
-static int
-time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
-{
- struct cpufreq_freqs *freq = data;
-
- if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
- write_seqlock_irq(&xtime_lock);
- if (!ref_freq) {
- if (!freq->old){
- ref_freq = freq->new;
- goto end;
- }
- ref_freq = freq->old;
- loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
-#ifndef CONFIG_SMP
- fast_gettimeoffset_ref = fast_gettimeoffset_quotient;
- cpu_khz_ref = cpu_khz;
-#endif
- }
-
- if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
- (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
- (val == CPUFREQ_RESUMECHANGE)) {
- if (!(freq->flags & CPUFREQ_CONST_LOOPS))
- cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
-#ifndef CONFIG_SMP
- if (cpu_khz)
- cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
- if (use_tsc) {
- if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
- fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq);
- set_cyc2ns_scale(cpu_khz);
- }
- }
-#endif
- }
-
-end:
- if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
- write_sequnlock_irq(&xtime_lock);
-
- return 0;
-}
-
-static struct notifier_block time_cpufreq_notifier_block = {
- .notifier_call = time_cpufreq_notifier
-};
-
-
-static int __init cpufreq_tsc(void)
-{
- int ret;
- INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
- ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
- CPUFREQ_TRANSITION_NOTIFIER);
- if (!ret)
- cpufreq_init = 1;
- return ret;
-}
-core_initcall(cpufreq_tsc);
-
-#else /* CONFIG_CPU_FREQ */
-static inline void cpufreq_delayed_get(void) { return; }
-#endif
-
-int recalibrate_cpu_khz(void)
-{
-#ifndef CONFIG_SMP
- unsigned int cpu_khz_old = cpu_khz;
-
- if (cpu_has_tsc) {
- local_irq_disable();
- init_cpu_khz();
- local_irq_enable();
- cpu_data[0].loops_per_jiffy =
- cpufreq_scale(cpu_data[0].loops_per_jiffy,
- cpu_khz_old,
- cpu_khz);
- return 0;
- } else
- return -ENODEV;
-#else
- return -ENODEV;
-#endif
-}
-EXPORT_SYMBOL(recalibrate_cpu_khz);
-
-static void mark_offset_tsc(void)
-{
- unsigned long lost,delay;
- unsigned long delta = last_tsc_low;
- int count;
- int countmp;
- static int count1 = 0;
- unsigned long long this_offset, last_offset;
- static int lost_count = 0;
-
- write_seqlock(&monotonic_lock);
- last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- /*
- * It is important that these two operations happen almost at
- * the same time. We do the RDTSC stuff first, since it's
- * faster. To avoid any inconsistencies, we need interrupts
- * disabled locally.
- */
-
- /*
- * Interrupts are just disabled locally since the timer irq
- * has the SA_INTERRUPT flag set. -arca
- */
-
- /* read Pentium cycle counter */
-
- rdtsc(last_tsc_low, last_tsc_high);
-
- spin_lock(&i8253_lock);
- outb_p(0x00, PIT_MODE); /* latch the count ASAP */
-
- count = inb_p(PIT_CH0); /* read the latched count */
- count |= inb(PIT_CH0) << 8;
-
- /*
- * VIA686a test code... reset the latch if count > max + 1
- * from timer_pit.c - cjb
- */
- if (count > LATCH) {
- outb_p(0x34, PIT_MODE);
- outb_p(LATCH & 0xff, PIT_CH0);
- outb(LATCH >> 8, PIT_CH0);
- count = LATCH - 1;
- }
-
- spin_unlock(&i8253_lock);
-
- if (pit_latch_buggy) {
- /* get center value of last 3 time lutch */
- if ((count2 >= count && count >= count1)
- || (count1 >= count && count >= count2)) {
- count2 = count1; count1 = count;
- } else if ((count1 >= count2 && count2 >= count)
- || (count >= count2 && count2 >= count1)) {
- countmp = count;count = count2;
- count2 = count1;count1 = countmp;
- } else {
- count2 = count1; count1 = count; count = count1;
- }
- }
-
- /* lost tick compensation */
- delta = last_tsc_low - delta;
- {
- register unsigned long eax, edx;
- eax = delta;
- __asm__("mull %2"
- :"=a" (eax), "=d" (edx)
- :"rm" (fast_gettimeoffset_quotient),
- "0" (eax));
- delta = edx;
- }
- delta += delay_at_last_interrupt;
- lost = delta/(1000000/HZ);
- delay = delta%(1000000/HZ);
- if (lost >= 2 && detect_lost_ticks) {
- jiffies_64 += lost-1;
-
- /* sanity check to ensure we're not always losing ticks */
- if (lost_count++ > 100) {
- printk(KERN_WARNING "Losing too many ticks!\n");
- printk(KERN_WARNING "TSC cannot be used as a timesource. \n");
- printk(KERN_WARNING "Possible reasons for this are:\n");
- printk(KERN_WARNING " You're running with Speedstep,\n");
- printk(KERN_WARNING " You don't have DMA enabled for your hard disk (see hdparm),\n");
- printk(KERN_WARNING " Incorrect TSC synchronization on an SMP system (see dmesg).\n");
- printk(KERN_WARNING "Falling back to a sane timesource now.\n");
-
- clock_fallback();
- }
- /* ... but give the TSC a fair chance */
- if (lost_count > 25)
- cpufreq_delayed_get();
- } else
- lost_count = 0;
- /* update the monotonic base value */
- this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- monotonic_base += cycles_2_ns(this_offset - last_offset);
- write_sequnlock(&monotonic_lock);
-
- /* calculate delay_at_last_interrupt */
- count = ((LATCH-1) - count) * TICK_SIZE;
- delay_at_last_interrupt = (count + LATCH/2) / LATCH;
-
- /* catch corner case where tick rollover occured
- * between tsc and pit reads (as noted when
- * usec delta is > 90% # of usecs/tick)
- */
- if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
- jiffies_64++;
-}
-
-static int __init init_tsc(char* override)
-{
-
- /* check clock override */
- if (override[0] && strncmp(override,"tsc",3)) {
-#ifdef CONFIG_HPET_TIMER
- if (is_hpet_enabled()) {
- printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n");
- } else
-#endif
- {
- return -ENODEV;
- }
- }
-
- /*
- * If we have APM enabled or the CPU clock speed is variable
- * (CPU stops clock on HLT or slows clock to save power)
- * then the TSC timestamps may diverge by up to 1 jiffy from
- * 'real time' but nothing will break.
- * The most frequent case is that the CPU is "woken" from a halt
- * state by the timer interrupt itself, so we get 0 error. In the
- * rare cases where a driver would "wake" the CPU and request a
- * timestamp, the maximum error is < 1 jiffy. But timestamps are
- * still perfectly ordered.
- * Note that the TSC counter will be reset if APM suspends
- * to disk; this won't break the kernel, though, 'cuz we're
- * smart. See arch/i386/kernel/apm.c.
- */
- /*
- * Firstly we have to do a CPU check for chips with
- * a potentially buggy TSC. At this point we haven't run
- * the ident/bugs checks so we must run this hook as it
- * may turn off the TSC flag.
- *
- * NOTE: this doesn't yet handle SMP 486 machines where only
- * some CPU's have a TSC. Thats never worked and nobody has
- * moaned if you have the only one in the world - you fix it!
- */
-
- count2 = LATCH; /* initialize counter for mark_offset_tsc() */
-
- if (cpu_has_tsc) {
- unsigned long tsc_quotient;
-#ifdef CONFIG_HPET_TIMER
- if (is_hpet_enabled() && hpet_use_timer) {
- unsigned long result, remain;
- printk("Using TSC for gettimeofday\n");
- tsc_quotient = calibrate_tsc_hpet(NULL);
- timer_tsc.mark_offset = &mark_offset_tsc_hpet;
- /*
- * Math to calculate hpet to usec multiplier
- * Look for the comments at get_offset_tsc_hpet()
- */
- ASM_DIV64_REG(result, remain, hpet_tick,
- 0, KERNEL_TICK_USEC);
- if (remain > (hpet_tick >> 1))
- result++; /* rounding the result */
-
- hpet_usec_quotient = result;
- } else
-#endif
- {
- tsc_quotient = calibrate_tsc();
- }
-
- if (tsc_quotient) {
- fast_gettimeoffset_quotient = tsc_quotient;
- use_tsc = 1;
- /*
- * We could be more selective here I suspect
- * and just enable this for the next intel chips ?
- */
- /* report CPU clock rate in Hz.
- * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
- * clock/second. Our precision is about 100 ppm.
- */
- { unsigned long eax=0, edx=1000;
- __asm__("divl %2"
- :"=a" (cpu_khz), "=d" (edx)
- :"r" (tsc_quotient),
- "0" (eax), "1" (edx));
- printk("Detected %u.%03u MHz processor.\n",
- cpu_khz / 1000, cpu_khz % 1000);
- }
- set_cyc2ns_scale(cpu_khz);
- return 0;
- }
- }
- return -ENODEV;
-}
-
-static int tsc_resume(void)
-{
- write_seqlock(&monotonic_lock);
- /* Assume this is the last mark offset time */
- rdtsc(last_tsc_low, last_tsc_high);
-#ifdef CONFIG_HPET_TIMER
- if (is_hpet_enabled() && hpet_use_timer)
- hpet_last = hpet_readl(HPET_COUNTER);
-#endif
- write_sequnlock(&monotonic_lock);
- return 0;
-}
-
-#ifndef CONFIG_X86_TSC
-/* disable flag for tsc. Takes effect by clearing the TSC cpu flag
- * in cpu/common.c */
-static int __init tsc_setup(char *str)
-{
- tsc_disable = 1;
- return 1;
-}
-#else
-static int __init tsc_setup(char *str)
-{
- printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
- "cannot disable TSC.\n");
- return 1;
-}
-#endif
-__setup("notsc", tsc_setup);
-
-
-
-/************************************************************/
-
-/* tsc timer_opts struct */
-static struct timer_opts timer_tsc = {
- .name = "tsc",
- .mark_offset = mark_offset_tsc,
- .get_offset = get_offset_tsc,
- .monotonic_clock = monotonic_clock_tsc,
- .delay = delay_tsc,
- .read_timer = read_timer_tsc,
- .resume = tsc_resume,
-};
-
-struct init_timer_opts __initdata timer_tsc_init = {
- .init = init_tsc,
- .opts = &timer_tsc,
-};
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 296355292c7c..07d6da36a825 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -28,19 +28,13 @@
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/nodemask.h>
+#include <linux/mmzone.h>
#include <asm/cpu.h>
static struct i386_cpu cpu_devices[NR_CPUS];
-int arch_register_cpu(int num){
- struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
- int node = cpu_to_node(num);
- if (node_online(node))
- parent = &node_devices[node].node;
-#endif /* CONFIG_NUMA */
-
+int arch_register_cpu(int num)
+{
/*
* CPU0 cannot be offlined due to several
* restrictions and assumptions in kernel. This basically
@@ -50,57 +44,30 @@ int arch_register_cpu(int num){
if (!num)
cpu_devices[num].cpu.no_control = 1;
- return register_cpu(&cpu_devices[num].cpu, num, parent);
+ return register_cpu(&cpu_devices[num].cpu, num);
}
#ifdef CONFIG_HOTPLUG_CPU
void arch_unregister_cpu(int num) {
- struct node *parent = NULL;
-
-#ifdef CONFIG_NUMA
- int node = cpu_to_node(num);
- if (node_online(node))
- parent = &node_devices[node].node;
-#endif /* CONFIG_NUMA */
-
- return unregister_cpu(&cpu_devices[num].cpu, parent);
+ return unregister_cpu(&cpu_devices[num].cpu);
}
EXPORT_SYMBOL(arch_register_cpu);
EXPORT_SYMBOL(arch_unregister_cpu);
#endif /*CONFIG_HOTPLUG_CPU*/
-
-
-#ifdef CONFIG_NUMA
-#include <linux/mmzone.h>
-#include <asm/node.h>
-
-struct i386_node node_devices[MAX_NUMNODES];
-
static int __init topology_init(void)
{
int i;
+#ifdef CONFIG_NUMA
for_each_online_node(i)
- arch_register_node(i);
-
- for_each_present_cpu(i)
- arch_register_cpu(i);
- return 0;
-}
-
-#else /* !CONFIG_NUMA */
-
-static int __init topology_init(void)
-{
- int i;
+ register_one_node(i);
+#endif /* CONFIG_NUMA */
for_each_present_cpu(i)
arch_register_cpu(i);
return 0;
}
-#endif /* CONFIG_NUMA */
-
subsys_initcall(topology_init);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index dcc14477af1f..6820b8d643c7 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -11,7 +11,6 @@
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'asm.s'.
*/
-#include <linux/config.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/string.h>
@@ -28,6 +27,8 @@
#include <linux/utsname.h>
#include <linux/kprobes.h>
#include <linux/kexec.h>
+#include <linux/unwind.h>
+#include <linux/uaccess.h>
#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -40,22 +41,24 @@
#include <asm/processor.h>
#include <asm/system.h>
-#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/atomic.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/nmi.h>
-
+#include <asm/unwind.h>
#include <asm/smp.h>
#include <asm/arch_hooks.h>
#include <asm/kdebug.h>
+#include <asm/stacktrace.h>
#include <linux/module.h>
#include "mach_traps.h"
+int panic_on_unrecovered_nmi;
+
asmlinkage int system_call(void);
struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
@@ -92,6 +95,11 @@ asmlinkage void spurious_interrupt_bug(void);
asmlinkage void machine_check(void);
static int kstack_depth_to_print = 24;
+#ifdef CONFIG_STACK_UNWIND
+static int call_trace = 1;
+#else
+#define call_trace (-1)
+#endif
ATOMIC_NOTIFIER_HEAD(i386die_chain);
int register_die_notifier(struct notifier_block *nb)
@@ -99,13 +107,13 @@ int register_die_notifier(struct notifier_block *nb)
vmalloc_sync_all();
return atomic_notifier_chain_register(&i386die_chain, nb);
}
-EXPORT_SYMBOL(register_die_notifier);
+EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */
int unregister_die_notifier(struct notifier_block *nb)
{
return atomic_notifier_chain_unregister(&i386die_chain, nb);
}
-EXPORT_SYMBOL(unregister_die_notifier);
+EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */
static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
{
@@ -113,42 +121,16 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
p < (void *)tinfo + THREAD_SIZE - 3;
}
-/*
- * Print CONFIG_STACK_BACKTRACE_COLS address/symbol entries per line.
- */
-static inline int print_addr_and_symbol(unsigned long addr, char *log_lvl,
- int printed)
-{
- if (!printed)
- printk(log_lvl);
-
-#if CONFIG_STACK_BACKTRACE_COLS == 1
- printk(" [<%08lx>] ", addr);
-#else
- printk(" <%08lx> ", addr);
-#endif
- print_symbol("%s", addr);
-
- printed = (printed + 1) % CONFIG_STACK_BACKTRACE_COLS;
- if (printed)
- printk(" ");
- else
- printk("\n");
-
- return printed;
-}
-
static inline unsigned long print_context_stack(struct thread_info *tinfo,
unsigned long *stack, unsigned long ebp,
- char *log_lvl)
+ struct stacktrace_ops *ops, void *data)
{
unsigned long addr;
- int printed = 0; /* nr of entries already printed on current line */
#ifdef CONFIG_FRAME_POINTER
while (valid_stack_ptr(tinfo, (void *)ebp)) {
addr = *(unsigned long *)(ebp + 4);
- printed = print_addr_and_symbol(addr, log_lvl, printed);
+ ops->address(data, addr);
/*
* break out of recursive entries (such as
* end_of_stack_stop_unwind_function):
@@ -161,50 +143,160 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
while (valid_stack_ptr(tinfo, stack)) {
addr = *stack++;
if (__kernel_text_address(addr))
- printed = print_addr_and_symbol(addr, log_lvl, printed);
+ ops->address(data, addr);
}
#endif
- if (printed)
- printk("\n");
-
return ebp;
}
-static void show_trace_log_lvl(struct task_struct *task,
- unsigned long *stack, char *log_lvl)
+struct ops_and_data {
+ struct stacktrace_ops *ops;
+ void *data;
+};
+
+static asmlinkage int
+dump_trace_unwind(struct unwind_frame_info *info, void *data)
+{
+ struct ops_and_data *oad = (struct ops_and_data *)data;
+ int n = 0;
+
+ while (unwind(info) == 0 && UNW_PC(info)) {
+ n++;
+ oad->ops->address(oad->data, UNW_PC(info));
+ if (arch_unw_user_mode(info))
+ break;
+ }
+ return n;
+}
+
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *stack,
+ struct stacktrace_ops *ops, void *data)
{
- unsigned long ebp;
+ unsigned long ebp = 0;
if (!task)
task = current;
- if (task == current) {
- /* Grab ebp right from our regs */
- asm ("movl %%ebp, %0" : "=r" (ebp) : );
- } else {
- /* ebp is the last reg pushed by switch_to */
- ebp = *(unsigned long *) task->thread.esp;
+ if (call_trace >= 0) {
+ int unw_ret = 0;
+ struct unwind_frame_info info;
+ struct ops_and_data oad = { .ops = ops, .data = data };
+
+ if (regs) {
+ if (unwind_init_frame_info(&info, task, regs) == 0)
+ unw_ret = dump_trace_unwind(&info, &oad);
+ } else if (task == current)
+ unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
+ else {
+ if (unwind_init_blocked(&info, task) == 0)
+ unw_ret = dump_trace_unwind(&info, &oad);
+ }
+ if (unw_ret > 0) {
+ if (call_trace == 1 && !arch_unw_user_mode(&info)) {
+ ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
+ UNW_PC(&info));
+ if (UNW_SP(&info) >= PAGE_OFFSET) {
+ ops->warning(data, "Leftover inexact backtrace:\n");
+ stack = (void *)UNW_SP(&info);
+ if (!stack)
+ return;
+ ebp = UNW_FP(&info);
+ } else
+ ops->warning(data, "Full inexact backtrace again:\n");
+ } else if (call_trace >= 1)
+ return;
+ else
+ ops->warning(data, "Full inexact backtrace again:\n");
+ } else
+ ops->warning(data, "Inexact backtrace:\n");
+ }
+ if (!stack) {
+ unsigned long dummy;
+ stack = &dummy;
+ if (task && task != current)
+ stack = (unsigned long *)task->thread.esp;
}
+#ifdef CONFIG_FRAME_POINTER
+ if (!ebp) {
+ if (task == current) {
+ /* Grab ebp right from our regs */
+ asm ("movl %%ebp, %0" : "=r" (ebp) : );
+ } else {
+ /* ebp is the last reg pushed by switch_to */
+ ebp = *(unsigned long *) task->thread.esp;
+ }
+ }
+#endif
+
while (1) {
struct thread_info *context;
context = (struct thread_info *)
((unsigned long)stack & (~(THREAD_SIZE - 1)));
- ebp = print_context_stack(context, stack, ebp, log_lvl);
+ ebp = print_context_stack(context, stack, ebp, ops, data);
+ /* Should be after the line below, but somewhere
+ in early boot context comes out corrupted and we
+ can't reference it -AK */
+ if (ops->stack(data, "IRQ") < 0)
+ break;
stack = (unsigned long*)context->previous_esp;
if (!stack)
break;
- printk("%s =======================\n", log_lvl);
}
}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+ printk(data);
+ print_symbol(msg, symbol);
+ printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+ printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+ return 0;
+}
+
+/*
+ * Print one address/symbol entries per line.
+ */
+static void print_trace_address(void *data, unsigned long addr)
+{
+ printk("%s [<%08lx>] ", (char *)data, addr);
+ print_symbol("%s\n", addr);
+}
+
+static struct stacktrace_ops print_trace_ops = {
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
+};
-void show_trace(struct task_struct *task, unsigned long * stack)
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long * stack, char *log_lvl)
{
- show_trace_log_lvl(task, stack, "");
+ dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
+ printk("%s =======================\n", log_lvl);
}
-static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
- char *log_lvl)
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+ unsigned long * stack)
+{
+ show_trace_log_lvl(task, regs, stack, "");
+}
+
+static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *esp, char *log_lvl)
{
unsigned long *stack;
int i;
@@ -225,13 +317,13 @@ static void show_stack_log_lvl(struct task_struct *task, unsigned long *esp,
printk("%08lx ", *stack++);
}
printk("\n%sCall Trace:\n", log_lvl);
- show_trace_log_lvl(task, esp, log_lvl);
+ show_trace_log_lvl(task, regs, esp, log_lvl);
}
void show_stack(struct task_struct *task, unsigned long *esp)
{
printk(" ");
- show_stack_log_lvl(task, esp, "");
+ show_stack_log_lvl(task, NULL, esp, "");
}
/*
@@ -241,7 +333,7 @@ void dump_stack(void)
{
unsigned long stack;
- show_trace(current, &stack);
+ show_trace(current, NULL, &stack);
}
EXPORT_SYMBOL(dump_stack);
@@ -261,8 +353,9 @@ void show_registers(struct pt_regs *regs)
ss = regs->xss & 0xffff;
}
print_modules();
- printk(KERN_EMERG "CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\n"
- "EFLAGS: %08lx (%s %.*s) \n",
+ printk(KERN_EMERG "CPU: %d\n"
+ KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n"
+ KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n",
smp_processor_id(), 0xffff & regs->xcs, regs->eip,
print_tainted(), regs->eflags, system_utsname.release,
(int)strcspn(system_utsname.version, " "),
@@ -283,16 +376,21 @@ void show_registers(struct pt_regs *regs)
*/
if (in_kernel) {
u8 __user *eip;
+ int code_bytes = 64;
+ unsigned char c;
printk("\n" KERN_EMERG "Stack: ");
- show_stack_log_lvl(NULL, (unsigned long *)esp, KERN_EMERG);
+ show_stack_log_lvl(NULL, regs, (unsigned long *)esp, KERN_EMERG);
printk(KERN_EMERG "Code: ");
eip = (u8 __user *)regs->eip - 43;
- for (i = 0; i < 64; i++, eip++) {
- unsigned char c;
-
+ if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
+ /* try starting at EIP */
+ eip = (u8 __user *)regs->eip;
+ code_bytes = 32;
+ }
+ for (i = 0; i < code_bytes; i++, eip++) {
if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) {
printk(" Bad EIP value.");
break;
@@ -308,35 +406,36 @@ void show_registers(struct pt_regs *regs)
static void handle_BUG(struct pt_regs *regs)
{
+ unsigned long eip = regs->eip;
unsigned short ud2;
- unsigned short line;
- char *file;
- char c;
- unsigned long eip;
-
- eip = regs->eip;
if (eip < PAGE_OFFSET)
- goto no_bug;
- if (__get_user(ud2, (unsigned short __user *)eip))
- goto no_bug;
+ return;
+ if (probe_kernel_address((unsigned short __user *)eip, ud2))
+ return;
if (ud2 != 0x0b0f)
- goto no_bug;
- if (__get_user(line, (unsigned short __user *)(eip + 2)))
- goto bug;
- if (__get_user(file, (char * __user *)(eip + 4)) ||
- (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
- file = "<bad filename>";
+ return;
printk(KERN_EMERG "------------[ cut here ]------------\n");
- printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
-no_bug:
- return;
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ do {
+ unsigned short line;
+ char *file;
+ char c;
+
+ if (probe_kernel_address((unsigned short __user *)(eip + 2),
+ line))
+ break;
+ if (__get_user(file, (char * __user *)(eip + 4)) ||
+ (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
+ file = "<bad filename>";
- /* Here we know it was a BUG but file-n-line is unavailable */
-bug:
- printk(KERN_EMERG "Kernel BUG\n");
+ printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
+ return;
+ } while (0);
+#endif
+ printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
}
/* This is gone through when something in the kernel
@@ -426,11 +525,9 @@ void die(const char * str, struct pt_regs * regs, long err)
if (in_interrupt())
panic("Fatal exception in interrupt");
- if (panic_on_oops) {
- printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
- ssleep(5);
+ if (panic_on_oops)
panic("Fatal exception");
- }
+
oops_exit();
do_exit(SIGSEGV);
}
@@ -601,18 +698,24 @@ gp_in_kernel:
}
}
-static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs * regs)
{
- printk(KERN_EMERG "Uhhuh. NMI received. Dazed and confused, but trying "
- "to continue\n");
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+ "CPU %d.\n", reason, smp_processor_id());
printk(KERN_EMERG "You probably have a hardware problem with your RAM "
"chips\n");
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
/* Clear and disable the memory parity error line. */
clear_mem_error(reason);
}
-static void io_check_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+io_check_error(unsigned char reason, struct pt_regs * regs)
{
unsigned long i;
@@ -628,7 +731,8 @@ static void io_check_error(unsigned char reason, struct pt_regs * regs)
outb(reason, 0x61);
}
-static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+static __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
{
#ifdef CONFIG_MCA
/* Might actually be able to figure out what the guilty party
@@ -638,15 +742,18 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
return;
}
#endif
- printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
- printk("Dazed and confused, but trying to continue\n");
- printk("Do you have a strange power saving mode enabled?\n");
+ printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
+ "CPU %d.\n", reason, smp_processor_id());
+ printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+ if (panic_on_unrecovered_nmi)
+ panic("NMI: Not continuing");
+
+ printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
}
static DEFINE_SPINLOCK(nmi_print_lock);
-void die_nmi (struct pt_regs *regs, const char *msg)
+void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
{
if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
NOTIFY_STOP)
@@ -678,7 +785,7 @@ void die_nmi (struct pt_regs *regs, const char *msg)
do_exit(SIGSEGV);
}
-static void default_do_nmi(struct pt_regs * regs)
+static __kprobes void default_do_nmi(struct pt_regs * regs)
{
unsigned char reason = 0;
@@ -695,12 +802,12 @@ static void default_do_nmi(struct pt_regs * regs)
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
*/
- if (nmi_watchdog) {
- nmi_watchdog_tick(regs);
+ if (nmi_watchdog_tick(regs, reason))
return;
- }
+ if (!do_nmi_callback(regs, smp_processor_id()))
#endif
- unknown_nmi_error(reason, regs);
+ unknown_nmi_error(reason, regs);
+
return;
}
if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
@@ -716,14 +823,7 @@ static void default_do_nmi(struct pt_regs * regs)
reassert_nmi();
}
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
- return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
-fastcall void do_nmi(struct pt_regs * regs, long error_code)
+fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code)
{
int cpu;
@@ -733,25 +833,11 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
++nmi_count(cpu);
- if (!rcu_dereference(nmi_callback)(regs, cpu))
- default_do_nmi(regs);
+ default_do_nmi(regs);
nmi_exit();
}
-void set_nmi_callback(nmi_callback_t callback)
-{
- vmalloc_sync_all();
- rcu_assign_pointer(nmi_callback, callback);
-}
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-
-void unset_nmi_callback(void)
-{
- nmi_callback = dummy_nmi_callback;
-}
-EXPORT_SYMBOL_GPL(unset_nmi_callback);
-
#ifdef CONFIG_KPROBES
fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
{
@@ -1091,20 +1177,6 @@ void __init trap_init_f00f_bug(void)
}
#endif
-#define _set_gate(gate_addr,type,dpl,addr,seg) \
-do { \
- int __d0, __d1; \
- __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
- "movw %4,%%dx\n\t" \
- "movl %%eax,%0\n\t" \
- "movl %%edx,%1" \
- :"=m" (*((long *) (gate_addr))), \
- "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
- :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
- "3" ((char *) (addr)),"2" ((seg) << 16)); \
-} while (0)
-
-
/*
* This needs to use 'idt_table' rather than 'idt', and
* thus use the _nonmapped_ version of the IDT, as the
@@ -1113,7 +1185,7 @@ do { \
*/
void set_intr_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
}
/*
@@ -1121,22 +1193,22 @@ void set_intr_gate(unsigned int n, void *addr)
*/
static inline void set_system_intr_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
+ _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
}
static void __init set_trap_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
}
static void __init set_system_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
}
static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
{
- _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+ _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
}
@@ -1215,3 +1287,19 @@ static int __init kstack_setup(char *s)
return 1;
}
__setup("kstack=", kstack_setup);
+
+#ifdef CONFIG_STACK_UNWIND
+static int __init call_trace_setup(char *s)
+{
+ if (strcmp(s, "old") == 0)
+ call_trace = -1;
+ else if (strcmp(s, "both") == 0)
+ call_trace = 0;
+ else if (strcmp(s, "newfallback") == 0)
+ call_trace = 1;
+ else if (strcmp(s, "new") == 2)
+ call_trace = 2;
+ return 1;
+}
+__setup("call_trace=", call_trace_setup);
+#endif
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
new file mode 100644
index 000000000000..b8fa0a8b2e47
--- /dev/null
+++ b/arch/i386/kernel/tsc.c
@@ -0,0 +1,478 @@
+/*
+ * This code largely moved from arch/i386/kernel/timer/timer_tsc.c
+ * which was originally moved from arch/i386/kernel/time.c.
+ * See comments there for proper credits.
+ */
+
+#include <linux/clocksource.h>
+#include <linux/workqueue.h>
+#include <linux/cpufreq.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+
+#include <asm/delay.h>
+#include <asm/tsc.h>
+#include <asm/delay.h>
+#include <asm/io.h>
+
+#include "mach_timer.h"
+
+/*
+ * On some systems the TSC frequency does not
+ * change with the cpu frequency. So we need
+ * an extra value to store the TSC freq
+ */
+unsigned int tsc_khz;
+
+int tsc_disable __cpuinitdata = 0;
+
+#ifdef CONFIG_X86_TSC
+static int __init tsc_setup(char *str)
+{
+ printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
+ "cannot disable TSC.\n");
+ return 1;
+}
+#else
+/*
+ * disable flag for tsc. Takes effect by clearing the TSC cpu flag
+ * in cpu/common.c
+ */
+static int __init tsc_setup(char *str)
+{
+ tsc_disable = 1;
+
+ return 1;
+}
+#endif
+
+__setup("notsc", tsc_setup);
+
+/*
+ * code to mark and check if the TSC is unstable
+ * due to cpufreq or due to unsynced TSCs
+ */
+static int tsc_unstable;
+
+static inline int check_tsc_unstable(void)
+{
+ return tsc_unstable;
+}
+
+void mark_tsc_unstable(void)
+{
+ tsc_unstable = 1;
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+/* Accellerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ * ns = cycles / (freq / ns_per_sec)
+ * ns = cycles * (ns_per_sec / freq)
+ * ns = cycles * (10^9 / (cpu_khz * 10^3))
+ * ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ * ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ *
+ * We can use khz divisor instead of mhz to keep a better percision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ * -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+static unsigned long cyc2ns_scale __read_mostly;
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
+{
+ cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+}
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+unsigned long long sched_clock(void)
+{
+ unsigned long long this_offset;
+
+ /*
+ * in the NUMA case we dont use the TSC as they are not
+ * synchronized across all CPUs.
+ */
+#ifndef CONFIG_NUMA
+ if (!cpu_khz || check_tsc_unstable())
+#endif
+ /* no locking but a rare wrong value is not a big deal */
+ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+
+ /* read the Time Stamp Counter: */
+ rdtscll(this_offset);
+
+ /* return the value in ns */
+ return cycles_2_ns(this_offset);
+}
+
+static unsigned long calculate_cpu_khz(void)
+{
+ unsigned long long start, end;
+ unsigned long count;
+ u64 delta64;
+ int i;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ /* run 3 times to ensure the cache is warm */
+ for (i = 0; i < 3; i++) {
+ mach_prepare_counter();
+ rdtscll(start);
+ mach_countup(&count);
+ rdtscll(end);
+ }
+ /*
+ * Error: ECTCNEVERSET
+ * The CTC wasn't reliable: we got a hit on the very first read,
+ * or the CPU was so fast/slow that the quotient wouldn't fit in
+ * 32 bits..
+ */
+ if (count <= 1)
+ goto err;
+
+ delta64 = end - start;
+
+ /* cpu freq too fast: */
+ if (delta64 > (1ULL<<32))
+ goto err;
+
+ /* cpu freq too slow: */
+ if (delta64 <= CALIBRATE_TIME_MSEC)
+ goto err;
+
+ delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
+ do_div(delta64,CALIBRATE_TIME_MSEC);
+
+ local_irq_restore(flags);
+ return (unsigned long)delta64;
+err:
+ local_irq_restore(flags);
+ return 0;
+}
+
+int recalibrate_cpu_khz(void)
+{
+#ifndef CONFIG_SMP
+ unsigned long cpu_khz_old = cpu_khz;
+
+ if (cpu_has_tsc) {
+ cpu_khz = calculate_cpu_khz();
+ tsc_khz = cpu_khz;
+ cpu_data[0].loops_per_jiffy =
+ cpufreq_scale(cpu_data[0].loops_per_jiffy,
+ cpu_khz_old, cpu_khz);
+ return 0;
+ } else
+ return -ENODEV;
+#else
+ return -ENODEV;
+#endif
+}
+
+EXPORT_SYMBOL(recalibrate_cpu_khz);
+
+void __init tsc_init(void)
+{
+ if (!cpu_has_tsc || tsc_disable)
+ return;
+
+ cpu_khz = calculate_cpu_khz();
+ tsc_khz = cpu_khz;
+
+ if (!cpu_khz)
+ return;
+
+ printk("Detected %lu.%03lu MHz processor.\n",
+ (unsigned long)cpu_khz / 1000,
+ (unsigned long)cpu_khz % 1000);
+
+ set_cyc2ns_scale(cpu_khz);
+ use_tsc_delay();
+}
+
+#ifdef CONFIG_CPU_FREQ
+
+static unsigned int cpufreq_delayed_issched = 0;
+static unsigned int cpufreq_init = 0;
+static struct work_struct cpufreq_delayed_get_work;
+
+static void handle_cpufreq_delayed_get(void *v)
+{
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu)
+ cpufreq_get(cpu);
+
+ cpufreq_delayed_issched = 0;
+}
+
+/*
+ * if we notice cpufreq oddness, schedule a call to cpufreq_get() as it tries
+ * to verify the CPU frequency the timing core thinks the CPU is running
+ * at is still correct.
+ */
+static inline void cpufreq_delayed_get(void)
+{
+ if (cpufreq_init && !cpufreq_delayed_issched) {
+ cpufreq_delayed_issched = 1;
+ printk(KERN_DEBUG "Checking if CPU frequency changed.\n");
+ schedule_work(&cpufreq_delayed_get_work);
+ }
+}
+
+/*
+ * if the CPU frequency is scaled, TSC-based delays will need a different
+ * loops_per_jiffy value to function properly.
+ */
+static unsigned int ref_freq = 0;
+static unsigned long loops_per_jiffy_ref = 0;
+static unsigned long cpu_khz_ref = 0;
+
+static int
+time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+
+ if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
+ write_seqlock_irq(&xtime_lock);
+
+ if (!ref_freq) {
+ if (!freq->old){
+ ref_freq = freq->new;
+ goto end;
+ }
+ ref_freq = freq->old;
+ loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
+ cpu_khz_ref = cpu_khz;
+ }
+
+ if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+ (val == CPUFREQ_RESUMECHANGE)) {
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+ cpu_data[freq->cpu].loops_per_jiffy =
+ cpufreq_scale(loops_per_jiffy_ref,
+ ref_freq, freq->new);
+
+ if (cpu_khz) {
+
+ if (num_online_cpus() == 1)
+ cpu_khz = cpufreq_scale(cpu_khz_ref,
+ ref_freq, freq->new);
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
+ tsc_khz = cpu_khz;
+ set_cyc2ns_scale(cpu_khz);
+ /*
+ * TSC based sched_clock turns
+ * to junk w/ cpufreq
+ */
+ mark_tsc_unstable();
+ }
+ }
+ }
+end:
+ if (val != CPUFREQ_RESUMECHANGE && val != CPUFREQ_SUSPENDCHANGE)
+ write_sequnlock_irq(&xtime_lock);
+
+ return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+ .notifier_call = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+ int ret;
+
+ INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
+ ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ if (!ret)
+ cpufreq_init = 1;
+
+ return ret;
+}
+
+core_initcall(cpufreq_tsc);
+
+#endif
+
+/* clock source code */
+
+static unsigned long current_tsc_khz = 0;
+static int tsc_update_callback(void);
+
+static cycle_t read_tsc(void)
+{
+ cycle_t ret;
+
+ rdtscll(ret);
+
+ return ret;
+}
+
+static struct clocksource clocksource_tsc = {
+ .name = "tsc",
+ .rating = 300,
+ .read = read_tsc,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 0, /* to be set */
+ .shift = 22,
+ .update_callback = tsc_update_callback,
+ .is_continuous = 1,
+};
+
+static int tsc_update_callback(void)
+{
+ int change = 0;
+
+ /* check to see if we should switch to the safe clocksource: */
+ if (clocksource_tsc.rating != 50 && check_tsc_unstable()) {
+ clocksource_tsc.rating = 50;
+ clocksource_reselect();
+ change = 1;
+ }
+
+ /* only update if tsc_khz has changed: */
+ if (current_tsc_khz != tsc_khz) {
+ current_tsc_khz = tsc_khz;
+ clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
+ clocksource_tsc.shift);
+ change = 1;
+ }
+
+ return change;
+}
+
+static int __init dmi_mark_tsc_unstable(struct dmi_system_id *d)
+{
+ printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
+ d->ident);
+ mark_tsc_unstable();
+ return 0;
+}
+
+/* List of systems that have known TSC problems */
+static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
+ {
+ .callback = dmi_mark_tsc_unstable,
+ .ident = "IBM Thinkpad 380XD",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
+ DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
+ },
+ },
+ {}
+};
+
+#define TSC_FREQ_CHECK_INTERVAL (10*MSEC_PER_SEC) /* 10sec in MS */
+static struct timer_list verify_tsc_freq_timer;
+
+/* XXX - Probably should add locking */
+static void verify_tsc_freq(unsigned long unused)
+{
+ static u64 last_tsc;
+ static unsigned long last_jiffies;
+
+ u64 now_tsc, interval_tsc;
+ unsigned long now_jiffies, interval_jiffies;
+
+
+ if (check_tsc_unstable())
+ return;
+
+ rdtscll(now_tsc);
+ now_jiffies = jiffies;
+
+ if (!last_jiffies) {
+ goto out;
+ }
+
+ interval_jiffies = now_jiffies - last_jiffies;
+ interval_tsc = now_tsc - last_tsc;
+ interval_tsc *= HZ;
+ do_div(interval_tsc, cpu_khz*1000);
+
+ if (interval_tsc < (interval_jiffies * 3 / 4)) {
+ printk("TSC appears to be running slowly. "
+ "Marking it as unstable\n");
+ mark_tsc_unstable();
+ return;
+ }
+
+out:
+ last_tsc = now_tsc;
+ last_jiffies = now_jiffies;
+ /* set us up to go off on the next interval: */
+ mod_timer(&verify_tsc_freq_timer,
+ jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL));
+}
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+static __init int unsynchronized_tsc(void)
+{
+ /*
+ * Intel systems are normally all synchronized.
+ * Exceptions must mark TSC as unstable:
+ */
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return 0;
+
+ /* assume multi socket systems are not synchronized: */
+ return num_possible_cpus() > 1;
+}
+
+static int __init init_tsc_clocksource(void)
+{
+
+ if (cpu_has_tsc && tsc_khz && !tsc_disable) {
+ /* check blacklist */
+ dmi_check_system(bad_tsc_dmi_table);
+
+ if (unsynchronized_tsc()) /* mark unstable if unsynced */
+ mark_tsc_unstable();
+ current_tsc_khz = tsc_khz;
+ clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
+ clocksource_tsc.shift);
+ /* lower the rating if we already know its unstable: */
+ if (check_tsc_unstable())
+ clocksource_tsc.rating = 50;
+
+ init_timer(&verify_tsc_freq_timer);
+ verify_tsc_freq_timer.function = verify_tsc_freq;
+ verify_tsc_freq_timer.expires =
+ jiffies + msecs_to_jiffies(TSC_FREQ_CHECK_INTERVAL);
+ add_timer(&verify_tsc_freq_timer);
+
+ return clocksource_register(&clocksource_tsc);
+ }
+
+ return 0;
+}
+
+module_init(init_tsc_clocksource);
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index 00e0118e717c..8355d8d87d18 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -31,7 +31,6 @@
*/
#include <linux/capability.h>
-#include <linux/config.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 8831303a473f..1e7ac1c44ddc 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -13,6 +13,12 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
OUTPUT_ARCH(i386)
ENTRY(phys_startup_32)
jiffies = jiffies_64;
+
+PHDRS {
+ text PT_LOAD FLAGS(5); /* R_E */
+ data PT_LOAD FLAGS(7); /* RWE */
+ note PT_NOTE FLAGS(4); /* R__ */
+}
SECTIONS
{
. = __KERNEL_START;
@@ -26,7 +32,7 @@ SECTIONS
KPROBES_TEXT
*(.fixup)
*(.gnu.warning)
- } = 0x9090
+ } :text = 0x9090
_etext = .; /* End of text section */
@@ -37,11 +43,18 @@ SECTIONS
RODATA
+ . = ALIGN(4);
+ __tracedata_start = .;
+ .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+ *(.tracedata)
+ }
+ __tracedata_end = .;
+
/* writeable */
.data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
*(.data)
CONSTRUCTORS
- }
+ } :data
. = ALIGN(4096);
__nosave_begin = .;
@@ -64,6 +77,15 @@ SECTIONS
.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) }
_edata = .; /* End of data section */
+#ifdef CONFIG_STACK_UNWIND
+ . = ALIGN(4);
+ .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
+ __start_unwind = .;
+ *(.eh_frame)
+ __end_unwind = .;
+ }
+#endif
+
. = ALIGN(THREAD_SIZE); /* init_task */
.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
*(.data.init_task)
@@ -168,4 +190,6 @@ SECTIONS
STABS_DEBUG
DWARF_DEBUG
+
+ NOTES
}
diff --git a/arch/i386/kernel/vsyscall-sysenter.S b/arch/i386/kernel/vsyscall-sysenter.S
index 3b62baa6a371..1a36d26e15eb 100644
--- a/arch/i386/kernel/vsyscall-sysenter.S
+++ b/arch/i386/kernel/vsyscall-sysenter.S
@@ -42,10 +42,10 @@ __kernel_vsyscall:
/* 7: align return point with nop's to make disassembly easier */
.space 7,0x90
- /* 14: System call restart point is here! (SYSENTER_RETURN - 2) */
+ /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
jmp .Lenter_kernel
/* 16: System call normal return point is here! */
- .globl SYSENTER_RETURN /* Symbol used by entry.S. */
+ .globl SYSENTER_RETURN /* Symbol used by sysenter.c */
SYSENTER_RETURN:
pop %ebp
.Lpop_ebp:
diff --git a/arch/i386/kernel/vsyscall.lds.S b/arch/i386/kernel/vsyscall.lds.S
index 98699ca6e52d..f66cd11adb72 100644
--- a/arch/i386/kernel/vsyscall.lds.S
+++ b/arch/i386/kernel/vsyscall.lds.S
@@ -7,9 +7,10 @@
SECTIONS
{
- . = VSYSCALL_BASE + SIZEOF_HEADERS;
+ . = VDSO_PRELINK + SIZEOF_HEADERS;
.hash : { *(.hash) } :text
+ .gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
.dynstr : { *(.dynstr) }
.gnu.version : { *(.gnu.version) }
@@ -20,7 +21,7 @@ SECTIONS
For the layouts to match, we need to skip more than enough
space for the dynamic symbol table et al. If this amount
is insufficient, ld -shared will barf. Just increase it here. */
- . = VSYSCALL_BASE + 0x400;
+ . = VDSO_PRELINK + 0x400;
.text : { *(.text) } :text =0x90909090
.note : { *(.note.*) } :text :note
OpenPOWER on IntegriCloud