From c41b93fb8551148a93d3bba870365e8489317f02 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 8 Feb 2011 23:41:35 +0100 Subject: ACPI / PM: Drop acpi_restore_state_mem() The function acpi_restore_state_mem() has never been and most likely never will be used, so remove it. Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/sleep.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 68d1537b8c81..c27a483094b1 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -110,14 +110,6 @@ int acpi_save_state_mem(void) return 0; } -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - - /** * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation * -- cgit v1.2.1 From f1a2003e22f6b50ea21f7f4b38b38c5ebc9c8017 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 8 Feb 2011 23:42:22 +0100 Subject: ACPI / PM: Merge do_suspend_lowlevel() into acpi_save_state_mem() The function do_suspend_lowlevel() is specific to x86 and defined in assembly code, so it should be called from the x86 low-level suspend code rather than from acpi_suspend_enter(). Merge do_suspend_lowlevel() into the x86's acpi_save_state_mem() and change the name of the latter to acpi_suspend_lowlevel(), so that the function's purpose is better reflected by its name. Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/sleep.c | 5 +++-- arch/x86/kernel/acpi/sleep.h | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index c27a483094b1..5f1b747f6ef1 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -29,14 +29,14 @@ static char temp_stack[4096]; #endif /** - * acpi_save_state_mem - save kernel state + * acpi_suspend_lowlevel - save kernel state * * Create an identity mapped page table and copy the wakeup routine to * low memory. * * Note that this is too late to change acpi_wakeup_address. */ -int acpi_save_state_mem(void) +int acpi_suspend_lowlevel(void) { struct wakeup_header *header; @@ -107,6 +107,7 @@ int acpi_save_state_mem(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + do_suspend_lowlevel(); return 0; } diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index adbcbaa6f1df..31ce13f20297 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -14,3 +14,5 @@ extern char swsusp_pg_dir[PAGE_SIZE]; extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); + +extern void do_suspend_lowlevel(void); -- cgit v1.2.1 From fc66c5210ec2539e800e87d7b3a985323c7be96e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Sat, 19 Mar 2011 18:20:05 +0100 Subject: perf, x86: Fix Intel fixed counters base initialization The following patch solves the problems introduced by Robert's commit 41bf498 and reported by Arun Sharma. This commit gets rid of the base + index notation for reading and writing PMU msrs. The problem is that for fixed counters, the new calculation for the base did not take into account the fixed counter indexes, thus all fixed counters were read/written from fixed counter 0. Although all fixed counters share the same config MSR, they each have their own counter register. Without: $ task -e unhalted_core_cycles -e instructions_retired -e baclears noploop 1 noploop for 1 seconds 242202299 unhalted_core_cycles (0.00% scaling, ena=1000790892, run=1000790892) 2389685946 instructions_retired (0.00% scaling, ena=1000790892, run=1000790892) 49473 baclears (0.00% scaling, ena=1000790892, run=1000790892) With: $ task -e unhalted_core_cycles -e instructions_retired -e baclears noploop 1 noploop for 1 seconds 2392703238 unhalted_core_cycles (0.00% scaling, ena=1000840809, run=1000840809) 2389793744 instructions_retired (0.00% scaling, ena=1000840809, run=1000840809) 47863 baclears (0.00% scaling, ena=1000840809, run=1000840809) Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ming.m.lin@intel.com Cc: robert.richter@amd.com Cc: asharma@fb.com Cc: perfmon2-devel@lists.sf.net LKML-Reference: <20110319172005.GB4978@quad> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e8dbe179587f..ec46eea0c4ed 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -912,7 +912,7 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->event_base = 0; } else if (hwc->idx >= X86_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); -- cgit v1.2.1 From 885b976fada5bc6595a9fd3e67e3cb1a3d11f50b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 21 Feb 2011 13:54:41 +0800 Subject: ACPI, APEI, Add ERST record ID cache APEI ERST firmware interface and implementation has no multiple users in mind. For example, if there is four records in storage with ID: 1, 2, 3 and 4, if two ERST readers enumerate the records via GET_NEXT_RECORD_ID as follow, reader 1 reader 2 1 2 3 4 -1 -1 where -1 signals there is no more record ID. Reader 1 has no chance to check record 2 and 4, while reader 2 has no chance to check record 1 and 3. And any other GET_NEXT_RECORD_ID will return -1, that is, other readers will has no chance to check any record even they are not cleared by anyone. This makes raw GET_NEXT_RECORD_ID not suitable for used by multiple users. To solve the issue, an in-memory ERST record ID cache is designed and implemented. When enumerating record ID, the ID returned by GET_NEXT_RECORD_ID is added into cache in addition to be returned to caller. So other readers can check the cache to get all record ID available. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Signed-off-by: Len Brown --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 42 ++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 8209472b27a5..83930deec3c6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m) ssize_t apei_read_mce(struct mce *m, u64 *record_id) { struct cper_mce_record rcd; - ssize_t len; - - len = erst_read_next(&rcd.hdr, sizeof(rcd)); - if (len <= 0) - return len; - /* Can not skip other records in storage via ERST unless clear them */ - else if (len != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { - if (printk_ratelimit()) - pr_warning( - "MCE-APEI: Can not skip the unknown record in ERST"); - return -EIO; - } - + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; memcpy(m, &rcd.mce, sizeof(*m)); - *record_id = rcd.hdr.record_id; + rc = sizeof(*m); +out: + erst_get_record_id_end(); - return sizeof(*m); + return rc; } /* Check whether there is record in ERST */ -- cgit v1.2.1 From cbb84c4cc1ad0ab8faaffd899ccc9b14a88c91be Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 22 Mar 2011 15:24:54 +0600 Subject: x86, mpparse: Move check_slot into CONFIG_X86_IO_APIC context When CONFIG_X86_MPPARSE=y and CONFIG_X86_IO_APIC=n, then we get the following warning: arch/x86/kernel/mpparse.c:723: warning: 'check_slot' defined but not used So, put check_slot into CONFIG_X86_IO_APIC context. Its only called from CONFIG_X86_IO_APIC=y context. Signed-off-by: Rakib Mullick LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 6f789a887c06..5a532ce646bf 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -714,10 +714,6 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) *nr_m_spare += 1; } } -#else /* CONFIG_X86_IO_APIC */ -static -inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} -#endif /* CONFIG_X86_IO_APIC */ static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) @@ -731,6 +727,10 @@ check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) return ret; } +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ static int __init replace_intsrc_all(struct mpc_table *mpc, unsigned long mpc_new_phys, -- cgit v1.2.1 From 375906f8765e131a4a159b1ffebf78c15db7b3bf Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:14 -0400 Subject: x86: mark associated mm when running a task in 32 bit compatibility mode This patch simply follows the same practice as for setting the TIF_IA32 flag. In particular, an mm is marked as holding 32-bit tasks when a 32-bit binary is exec'ed. Both ELF and a.out formats are updated. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- arch/x86/kernel/process_64.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bd387e8f73b4..6c9dd922ac0d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -501,6 +501,10 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + /* Ensure the corresponding mm is not marked. */ + if (current->mm) + current->mm->context.ia32_compat = 0; + /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that @@ -516,6 +520,10 @@ void set_personality_ia32(void) set_thread_flag(TIF_IA32); current->personality |= force_personality32; + /* Mark the associated mm as containing 32-bit tasks. */ + if (current->mm) + current->mm->context.ia32_compat = 1; + /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; } -- cgit v1.2.1 From f3c6ea1b06c71b43f751b36bd99345369fe911af Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 Mar 2011 22:15:54 +0100 Subject: x86: Use syscore_ops instead of sysdev classes and sysdevs Some subsystems in the x86 tree need to carry out suspend/resume and shutdown operations with one CPU on-line and interrupts disabled and they define sysdev classes and sysdevs or sysdev drivers for this purpose. This leads to unnecessarily complicated code and excessive memory usage, so switch them to using struct syscore_ops objects for this purpose instead. Generally, there are three categories of subsystems that use sysdevs for implementing PM operations: (1) subsystems whose suspend/resume callbacks ignore their arguments entirely (the majority), (2) subsystems whose suspend/resume callbacks use their struct sys_device argument, but don't really need to do that, because they can be implemented differently in an arguably simpler way (io_apic.c), and (3) subsystems whose suspend/resume callbacks use their struct sys_device argument, but the value of that argument is always the same and could be ignored (microcode_core.c). In all of these cases the subsystems in question may be readily converted to using struct syscore_ops objects for power management and shutdown. Signed-off-by: Rafael J. Wysocki Reviewed-by: Thomas Gleixner Acked-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 26 +++-------- arch/x86/kernel/apic/apic.c | 33 ++++---------- arch/x86/kernel/apic/io_apic.c | 97 +++++++++++++++++++--------------------- arch/x86/kernel/cpu/mcheck/mce.c | 21 +++++---- arch/x86/kernel/cpu/mtrr/main.c | 10 ++--- arch/x86/kernel/i8237.c | 30 +++---------- arch/x86/kernel/i8259.c | 33 +++++--------- arch/x86/kernel/microcode_core.c | 34 ++++++-------- arch/x86/kernel/pci-gart_64.c | 32 +++---------- 9 files changed, 116 insertions(+), 200 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6e11c8134158..246d727b65b7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -1260,7 +1260,7 @@ static void disable_iommus(void) * disable suspend until real resume implemented */ -static int amd_iommu_resume(struct sys_device *dev) +static void amd_iommu_resume(void) { struct amd_iommu *iommu; @@ -1276,11 +1276,9 @@ static int amd_iommu_resume(struct sys_device *dev) */ amd_iommu_flush_all_devices(); amd_iommu_flush_all_domains(); - - return 0; } -static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) +static int amd_iommu_suspend(void) { /* disable IOMMUs to go out of the way for BIOS */ disable_iommus(); @@ -1288,17 +1286,11 @@ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static struct sysdev_class amd_iommu_sysdev_class = { - .name = "amd_iommu", +static struct syscore_ops amd_iommu_syscore_ops = { .suspend = amd_iommu_suspend, .resume = amd_iommu_resume, }; -static struct sys_device device_amd_iommu = { - .id = 0, - .cls = &amd_iommu_sysdev_class, -}; - /* * This is the core init function for AMD IOMMU hardware in the system. * This function is called from the generic x86 DMA layer initialization @@ -1415,14 +1407,6 @@ static int __init amd_iommu_init(void) goto free; } - ret = sysdev_class_register(&amd_iommu_sysdev_class); - if (ret) - goto free; - - ret = sysdev_register(&device_amd_iommu); - if (ret) - goto free; - ret = amd_iommu_init_devices(); if (ret) goto free; @@ -1441,6 +1425,8 @@ static int __init amd_iommu_init(void) amd_iommu_init_notifier(); + register_syscore_ops(&amd_iommu_syscore_ops); + if (iommu_pass_through) goto out; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 966673f44141..fabf01eff771 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -2046,7 +2046,7 @@ static struct { unsigned int apic_thmr; } apic_pm_state; -static int lapic_suspend(struct sys_device *dev, pm_message_t state) +static int lapic_suspend(void) { unsigned long flags; int maxlvt; @@ -2084,23 +2084,21 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) return 0; } -static int lapic_resume(struct sys_device *dev) +static void lapic_resume(void) { unsigned int l, h; unsigned long flags; - int maxlvt; - int ret = 0; + int maxlvt, ret; struct IO_APIC_route_entry **ioapic_entries = NULL; if (!apic_pm_state.active) - return 0; + return; local_irq_save(flags); if (intr_remapping_enabled) { ioapic_entries = alloc_ioapic_entries(); if (!ioapic_entries) { WARN(1, "Alloc ioapic_entries in lapic resume failed."); - ret = -ENOMEM; goto restore; } @@ -2162,8 +2160,6 @@ static int lapic_resume(struct sys_device *dev) } restore: local_irq_restore(flags); - - return ret; } /* @@ -2171,17 +2167,11 @@ restore: * are needed on every CPU up until machine_halt/restart/poweroff. */ -static struct sysdev_class lapic_sysclass = { - .name = "lapic", +static struct syscore_ops lapic_syscore_ops = { .resume = lapic_resume, .suspend = lapic_suspend, }; -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; @@ -2189,16 +2179,11 @@ static void __cpuinit apic_pm_activate(void) static int __init init_lapic_sysfs(void) { - int error; - - if (!cpu_has_apic) - return 0; /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + if (cpu_has_apic) + register_syscore_ops(&lapic_syscore_ops); - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; + return 0; } /* local apic needs to resume before other devices access its registers. */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 180ca240e03c..68df09bba92e 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include @@ -2918,89 +2918,84 @@ static int __init io_apic_bug_finalize(void) late_initcall(io_apic_bug_finalize); -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; +static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS]; -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +static void suspend_ioapic(int ioapic_id) { - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; + struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id]; int i; - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) - *entry = ioapic_read_entry(dev->id, i); + if (!saved_data) + return; + + for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++) + saved_data[i] = ioapic_read_entry(ioapic_id, i); +} + +static int ioapic_suspend(void) +{ + int ioapic_id; + + for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++) + suspend_ioapic(ioapic_id); return 0; } -static int ioapic_resume(struct sys_device *dev) +static void resume_ioapic(int ioapic_id) { - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; + struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id]; unsigned long flags; union IO_APIC_reg_00 reg_00; int i; - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; + if (!saved_data) + return; raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].apicid; - io_apic_write(dev->id, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic_id, 0); + if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) { + reg_00.bits.ID = mp_ioapics[ioapic_id].apicid; + io_apic_write(ioapic_id, 0, reg_00.raw); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); + for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++) + ioapic_write_entry(ioapic_id, i, saved_data[i]); +} - return 0; +static void ioapic_resume(void) +{ + int ioapic_id; + + for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--) + resume_ioapic(ioapic_id); } -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", +static struct syscore_ops ioapic_syscore_ops = { .suspend = ioapic_suspend, .resume = ioapic_resume, }; -static int __init ioapic_init_sysfs(void) +static int __init ioapic_init_ops(void) { - struct sys_device * dev; - int i, size, error; + int i; - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; + for (i = 0; i < nr_ioapics; i++) { + unsigned int size; - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] + size = nr_ioapic_registers[i] * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } + ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL); + if (!ioapic_saved_data[i]) + pr_err("IOAPIC %d: suspend/resume impossible!\n", i); } + register_syscore_ops(&ioapic_syscore_ops); + return 0; } -device_initcall(ioapic_init_sysfs); +device_initcall(ioapic_init_ops); /* * Dynamic irq allocate and deallocation diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ab1122998dba..5a05ef63eb4a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1749,14 +1750,14 @@ static int mce_disable_error_reporting(void) return 0; } -static int mce_suspend(struct sys_device *dev, pm_message_t state) +static int mce_suspend(void) { return mce_disable_error_reporting(); } -static int mce_shutdown(struct sys_device *dev) +static void mce_shutdown(void) { - return mce_disable_error_reporting(); + mce_disable_error_reporting(); } /* @@ -1764,14 +1765,18 @@ static int mce_shutdown(struct sys_device *dev) * Only one CPU is active at this time, the others get re-added later using * CPU hotplug: */ -static int mce_resume(struct sys_device *dev) +static void mce_resume(void) { __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info)); - - return 0; } +static struct syscore_ops mce_syscore_ops = { + .suspend = mce_suspend, + .shutdown = mce_shutdown, + .resume = mce_resume, +}; + static void mce_cpu_restart(void *data) { del_timer_sync(&__get_cpu_var(mce_timer)); @@ -1808,9 +1813,6 @@ static void mce_enable_ce(void *all) } static struct sysdev_class mce_sysclass = { - .suspend = mce_suspend, - .shutdown = mce_shutdown, - .resume = mce_resume, .name = "machinecheck", }; @@ -2139,6 +2141,7 @@ static __init int mcheck_init_device(void) return err; } + register_syscore_ops(&mce_syscore_ops); register_hotcpu_notifier(&mce_cpu_notifier); misc_register(&mce_log_device); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index bebabec5b448..307dfbbf4a8e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -630,7 +631,7 @@ struct mtrr_value { static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; -static int mtrr_save(struct sys_device *sysdev, pm_message_t state) +static int mtrr_save(void) { int i; @@ -642,7 +643,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state) return 0; } -static int mtrr_restore(struct sys_device *sysdev) +static void mtrr_restore(void) { int i; @@ -653,12 +654,11 @@ static int mtrr_restore(struct sys_device *sysdev) mtrr_value[i].ltype); } } - return 0; } -static struct sysdev_driver mtrr_sysdev_driver = { +static struct syscore_ops mtrr_syscore_ops = { .suspend = mtrr_save, .resume = mtrr_restore, }; @@ -839,7 +839,7 @@ static int __init mtrr_init_finialize(void) * TBD: is there any system with such CPU which supports * suspend/resume? If no, we should remove the code. */ - sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); + register_syscore_ops(&mtrr_syscore_ops); return 0; } diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c index b42ca694dc68..8eeaa81de066 100644 --- a/arch/x86/kernel/i8237.c +++ b/arch/x86/kernel/i8237.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include @@ -21,7 +21,7 @@ * in asm/dma.h. */ -static int i8237A_resume(struct sys_device *dev) +static void i8237A_resume(void) { unsigned long flags; int i; @@ -41,31 +41,15 @@ static int i8237A_resume(struct sys_device *dev) enable_dma(4); release_dma_lock(flags); - - return 0; } -static int i8237A_suspend(struct sys_device *dev, pm_message_t state) -{ - return 0; -} - -static struct sysdev_class i8237_sysdev_class = { - .name = "i8237", - .suspend = i8237A_suspend, +static struct syscore_ops i8237_syscore_ops = { .resume = i8237A_resume, }; -static struct sys_device device_i8237A = { - .id = 0, - .cls = &i8237_sysdev_class, -}; - -static int __init i8237A_init_sysfs(void) +static int __init i8237A_init_ops(void) { - int error = sysdev_class_register(&i8237_sysdev_class); - if (!error) - error = sysdev_register(&device_i8237A); - return error; + register_syscore_ops(&i8237_syscore_ops); + return 0; } -device_initcall(i8237A_init_sysfs); +device_initcall(i8237A_init_ops); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index d9ca749c123b..65b8f5c2eebf 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -245,20 +245,19 @@ static void save_ELCR(char *trigger) trigger[1] = inb(0x4d1) & 0xDE; } -static int i8259A_resume(struct sys_device *dev) +static void i8259A_resume(void) { init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); - return 0; } -static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +static int i8259A_suspend(void) { save_ELCR(irq_trigger); return 0; } -static int i8259A_shutdown(struct sys_device *dev) +static void i8259A_shutdown(void) { /* Put the i8259A into a quiescent state that * the kernel initialization code can get it @@ -266,21 +265,14 @@ static int i8259A_shutdown(struct sys_device *dev) */ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ - return 0; } -static struct sysdev_class i8259_sysdev_class = { - .name = "i8259", +static struct syscore_ops i8259_syscore_ops = { .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; -static struct sys_device device_i8259A = { - .id = 0, - .cls = &i8259_sysdev_class, -}; - static void mask_8259A(void) { unsigned long flags; @@ -399,17 +391,12 @@ struct legacy_pic default_legacy_pic = { struct legacy_pic *legacy_pic = &default_legacy_pic; -static int __init i8259A_init_sysfs(void) +static int __init i8259A_init_ops(void) { - int error; - - if (legacy_pic != &default_legacy_pic) - return 0; + if (legacy_pic == &default_legacy_pic) + register_syscore_ops(&i8259_syscore_ops); - error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; + return 0; } -device_initcall(i8259A_init_sysfs); +device_initcall(i8259A_init_ops); diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 87af68e0e1e1..5ed0ab549eb8 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -82,6 +82,7 @@ #include #include #include +#include #include #include @@ -438,33 +439,25 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) return 0; } -static int mc_sysdev_resume(struct sys_device *dev) +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, +}; + +/** + * mc_bp_resume - Update boot CPU microcode during resume. + */ +static void mc_bp_resume(void) { - int cpu = dev->id; + int cpu = smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (!cpu_online(cpu)) - return 0; - - /* - * All non-bootup cpus are still disabled, - * so only CPU 0 will apply ucode here. - * - * Moreover, there can be no concurrent - * updates from any other places at this point. - */ - WARN_ON(cpu != 0); - if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); - - return 0; } -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, +static struct syscore_ops mc_syscore_ops = { + .resume = mc_bp_resume, }; static __cpuinit int @@ -542,6 +535,7 @@ static int __init microcode_init(void) if (error) return error; + register_syscore_ops(&mc_syscore_ops); register_hotcpu_notifier(&mc_cpu_notifier); pr_info("Microcode Update Driver: v" MICROCODE_VERSION diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c01ffa5b9b87..82ada01625b9 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include @@ -589,7 +589,7 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc) aperture_alloc = aper_alloc; } -static void gart_fixup_northbridges(struct sys_device *dev) +static void gart_fixup_northbridges(void) { int i; @@ -613,33 +613,20 @@ static void gart_fixup_northbridges(struct sys_device *dev) } } -static int gart_resume(struct sys_device *dev) +static void gart_resume(void) { pr_info("PCI-DMA: Resuming GART IOMMU\n"); - gart_fixup_northbridges(dev); + gart_fixup_northbridges(); enable_gart_translations(); - - return 0; } -static int gart_suspend(struct sys_device *dev, pm_message_t state) -{ - return 0; -} - -static struct sysdev_class gart_sysdev_class = { - .name = "gart", - .suspend = gart_suspend, +static struct syscore_ops gart_syscore_ops = { .resume = gart_resume, }; -static struct sys_device device_gart = { - .cls = &gart_sysdev_class, -}; - /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. @@ -650,7 +637,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - int i, error; + int i; pr_info("PCI-DMA: Disabling AGP.\n"); @@ -685,12 +672,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) agp_gatt_table = gatt; - error = sysdev_class_register(&gart_sysdev_class); - if (!error) - error = sysdev_register(&device_gart); - if (error) - panic("Could not register gart_sysdev -- " - "would corrupt data on next suspend"); + register_syscore_ops(&gart_syscore_ops); flush_gart(); -- cgit v1.2.1 From 93a72052be81823fa1584b9be037d51924f9efa4 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 23 Mar 2011 16:43:29 -0700 Subject: crash_dump: export is_kdump_kernel to modules, consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn The Xen PV drivers in a crashed HVM guest can not connect to the dom0 backend drivers because both frontend and backend drivers are still in connected state. To run the connection reset function only in case of a crashdump, the is_kdump_kernel() function needs to be available for the PV driver modules. Consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn into kernel/crash_dump.c Also export elfcorehdr_addr to make is_kdump_kernel() usable for modules. Leave 'elfcorehdr' as early_param(). This changes powerpc from __setup() to early_param(). It adds an address range check from x86 also on ia64 and powerpc. [akpm@linux-foundation.org: additional #includes] [akpm@linux-foundation.org: remove elfcorehdr_addr export] [akpm@linux-foundation.org: fix for Tejun's mm/nobootmem.c changes] Signed-off-by: Olaf Hering Cc: Russell King Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/crash_dump_32.c | 3 --- arch/x86/kernel/crash_dump_64.c | 3 --- arch/x86/kernel/e820.c | 1 + arch/x86/kernel/setup.c | 22 ---------------------- 4 files changed, 1 insertion(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index d5cd13945d5a..642f75a68cd5 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -14,9 +14,6 @@ static void *kdump_buf_page; -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 994828899e09..afa64adb75ee 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,9 +10,6 @@ #include #include -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index cdf5bfd9d4d5..3e2ef8425316 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 32bd87cbf982..5a0484a95ad6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -619,28 +619,6 @@ void __init reserve_standard_io_resources(void) } -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. - */ - -#ifdef CONFIG_CRASH_DUMP -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - static __init void reserve_ibft_region(void) { unsigned long addr, size = 0; -- cgit v1.2.1 From 71f9e59800e5ad4e6b683348424c9fe54306cd43 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 24 Mar 2011 11:42:30 +0900 Subject: x86, dumpstack: Use %pB format specifier for stack trace Improve noreturn function entries in call traces: Before: Call Trace: [] panic+0x8c/0x18d [] deep01+0x0/0x38 [test_panic] <--- bad [] proc_file_write+0x73/0x8d [] proc_reg_write+0x8d/0xac [] vfs_write+0xa1/0xc5 [] sys_write+0x45/0x6c [] system_call_fastpath+0x16/0x1b After: Call Trace: [] panic+0x8c/0x18d [] panic_write+0x20/0x20 [test_panic] <--- good [] proc_file_write+0x73/0x8d [] proc_reg_write+0x8d/0xac [] vfs_write+0xa1/0xc5 [] sys_write+0x45/0x6c [] system_call_fastpath+0x16/0x1b Signed-off-by: Namhyung Kim Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <1300934550-21394-2-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 999e2793590b..24d0479025f9 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -27,7 +27,7 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { - printk(" [<%p>] %s%pS\n", (void *) address, + printk(" [<%p>] %s%pB\n", (void *) address, reliable ? "" : "? ", (void *) address); } -- cgit v1.2.1 From 242214f9c1eeaae40eca11e3b4d37bfce960a7cd Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Thu, 24 Mar 2011 23:36:25 +0300 Subject: perf, x86: P4 PMU - Read proper MSR register to catch unflagged overflows The read of a proper MSR register was missed and instead of counter the configration register was tested (it has ARCH_P4_UNFLAGGED_BIT always cleared) leading to unknown NMI hitting the system. As result the user may obtain "Dazed and confused, but trying to continue" message. Fix it by reading a proper MSR register. When an NMI happens on a P4, the perf nmi handler checks the configuration register to see if the overflow bit is set or not before taking appropriate action. Unfortunately, various P4 machines had a broken overflow bit, so a backup mechanism was implemented. This mechanism checked to see if the counter rolled over or not. A previous commit that implemented this backup mechanism was broken. Instead of reading the counter register, it used the configuration register to determine if the counter rolled over or not. Reading that bit would give incorrect results. This would lead to 'Dazed and confused' messages for the end user when using the perf tool (or if the nmi watchdog is running). The fix is to read the counter register before determining if the counter rolled over or not. Signed-off-by: Don Zickus Signed-off-by: Cyrill Gorcunov Cc: Lin Ming LKML-Reference: <4D8BAB49.3080701@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 3769ac822f96..d3d7b59841e5 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -777,6 +777,7 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) * the counter has reached zero value and continued counting before * real NMI signal was received: */ + rdmsrl(hwc->event_base, v); if (!(v & ARCH_P4_UNFLAGGED_BIT)) return 1; -- cgit v1.2.1 From 00a30b254b88d2d4f5af00835a9b7f70326def9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2011 22:53:10 +0100 Subject: x86: DT: Fix return condition in irq_create_of_mapping() The xlate() function returns 0 or a negative error code. Returning the error code blindly will be seen as an huge irq number by the calling function because irq_create_of_mapping() returns an unsigned value. Return 0 (NO_IRQ) as required. Signed-off-by: Thomas Gleixner Cc: Sebastian Andrzej Siewior --- arch/x86/kernel/devicetree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 7a8cebc9ff29..9c91badb6ca9 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -65,7 +65,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, return 0; ret = ih->xlate(ih, intspec, intsize, &virq, &type); if (ret) - return ret; + return 0; if (type == IRQ_TYPE_NONE) return virq; /* set the mask if it is different from current */ -- cgit v1.2.1 From 07611dbda5ccbd9a628f29686d62bafdd007db7b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 24 Mar 2011 21:41:57 +0100 Subject: x86: DT: Cleanup namespace and call irq_set_irq_type() unconditional That call escaped the name space cleanup. Fix it up. We really want to call there. The chip might have changed since the irq was setup initially. So let the core code and the chip decide what to do. The status is just an unreliable snapshot. Signed-off-by: Thomas Gleixner Cc: Sebastian Andrzej Siewior --- arch/x86/kernel/devicetree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 9c91badb6ca9..706a9fb46a58 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -68,9 +68,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, return 0; if (type == IRQ_TYPE_NONE) return virq; - /* set the mask if it is different from current */ - if (type == (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK)) - set_irq_type(virq, type); + irq_set_irq_type(virq, type); return virq; } EXPORT_SYMBOL_GPL(irq_create_of_mapping); -- cgit v1.2.1 From 45daae575e08bbf7405c5a3633e956fa364d1b4f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Mar 2011 10:24:23 +0100 Subject: perf, x86: Complain louder about BIOSen corrupting CPU/PMU state and continue Eric Dumazet reported that hardware PMU events do not work on his system, due to the BIOS corrupting PMU state: Performance Events: PEBS fmt0+, Core2 events, Broken BIOS detected, using software events only. [Firmware Bug]: the BIOS has corrupted hw-PMU resources (MSR 186 is 43003c) Linus suggested that we continue in the face of such BIOS-induced CPU state corruption: http://lkml.org/lkml/2011/3/24/608 Such BIOSes will have to be fixed - Linux developers rely on a working and fully capable PMU and the BIOS interfering with the CPU's PMU state is simply not acceptable. So this patch changes perf to continue when it detects such BIOS interaction, some hardware events may be unreliable due to the BIOS writing and re-writing them - there's not much the kernel can do about that but to detect the corruption and report it. Reported-and-tested-by: Eric Dumazet Suggested-by: Linus Torvalds Acked-by: Peter Zijlstra Cc: Thomas Gleixner Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ec46eea0c4ed..eb00677ee2ae 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -500,12 +500,17 @@ static bool check_hw_exists(void) return true; bios_fail: - printk(KERN_CONT "Broken BIOS detected, using software events only.\n"); + /* + * We still allow the PMU driver to operate: + */ + printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); - return false; + + return true; msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + return false; } -- cgit v1.2.1 From 21431c2900a0b669080b5bfaae2a7d9d9c026e9b Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 15 Mar 2010 07:28:00 -0500 Subject: kgdb,x86_64: fix compile warning found with sparse Fix sparse warning: arch/x86/kernel/kgdb.c:123:9: warning: switch with no cases Reported-by: Namhyung Kim Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index a4130005028a..3c2fb0f25abd 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -121,8 +121,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, dbg_reg_def[regno].size); - switch (regno) { #ifdef CONFIG_X86_32 + switch (regno) { case GDB_SS: if (!user_mode_vm(regs)) *(unsigned long *)mem = __KERNEL_DS; @@ -135,8 +135,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) case GDB_FS: *(unsigned long *)mem = 0xFFFF; break; -#endif } +#endif return dbg_reg_def[regno].name; } -- cgit v1.2.1 From ca444564a947034557a85357b3911d067cac4b8f Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 25 Mar 2011 15:20:14 +0100 Subject: x86: Stop including in two asm header files Stop including in x86 header files which don't need it. This will let the compiler complain when this header is not included by source files when it should, so that contributors can fix the problem before building on other architectures starts to fail. Credits go to Geert for the idea. Signed-off-by: Jean Delvare Cc: James E.J. Bottomley Cc: Geert Uytterhoeven Cc: Stephen Rothwell LKML-Reference: <20110325152014.297890ec@endymion.delvare> [ this also fixes an upstream build bug in drivers/media/rc/ite-cir.c ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/hw_nmi.c | 1 + arch/x86/kernel/apic/x2apic_uv_x.c | 1 + arch/x86/kernel/irq.c | 1 + arch/x86/kernel/reboot.c | 1 + 4 files changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c4e557a1ebb6..5260fe91bcb6 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(void) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 3c289281394c..d2cf39bc5ecf 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 948a31eae75f..1cb0b9fc78dc 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d3ce37edb54d..08c44b08bf5b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.1 From 4ac5fc6a3e4d90120f292526bcaa5ee182a7411b Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 29 Mar 2011 16:34:32 +0800 Subject: x86, microcode: Unregister syscore_ops after microcode unloaded Currently, microcode doesn't unregister syscore_ops after it's unloaded. So if we modprobe then rmmod microcode, the stale microcode syscore_ops info will stay on syscore_ops_list. Later when we're trying to reboot/halt/shutdown the machine, kernel will panic on syscore_shutdown(). With the patch applied, I can reboot/halt/shutdown my machine successfully. Signed-off-by: Xiaotian Feng Cc: Tigran Aivazian Cc: Rafael J. Wysocki LKML-Reference: <1301387672-23661-1-git-send-email-dfeng@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 5ed0ab549eb8..f9242800bc84 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -550,6 +550,7 @@ static void __exit microcode_exit(void) microcode_dev_exit(); unregister_hotcpu_notifier(&mc_cpu_notifier); + unregister_syscore_ops(&mc_syscore_ops); get_online_cpus(); mutex_lock(µcode_mutex); -- cgit v1.2.1 From 86cc8dfc211695193a060a240ac9c9287606e5d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Mar 2011 00:09:01 +0200 Subject: x86: apb_timer: Fixup genirq fallout The lonely user of the internal interface was not in the coccinelle script. Reported-by: Randy Dunlap Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apb_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 1293c709ee85..cd1ffed4ee22 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -316,7 +316,7 @@ static void apbt_setup_irq(struct apbt_dev *adev) irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT); irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); /* APB timer irqs are set up as mp_irqs, timer is edge type */ - __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge"); + __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge"); if (system_state == SYSTEM_BOOTING) { if (request_irq(adev->irq, apbt_interrupt_handler, -- cgit v1.2.1 From 84ac7cdbdd0f04df6b96153f7a79127fd6e45467 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 29 Mar 2011 15:38:12 -0700 Subject: x86, mtrr, pat: Fix one cpu getting out of sync during resume On laptops with core i5/i7, there were reports that after resume graphics workloads were performing poorly on a specific AP, while the other cpu's were ok. This was observed on a 32bit kernel specifically. Debug showed that the PAT init was not happening on that AP during resume and hence it contributing to the poor workload performance on that cpu. On this system, resume flow looked like this: 1. BP starts the resume sequence and we reinit BP's MTRR's/PAT early on using mtrr_bp_restore() 2. Resume sequence brings all AP's online 3. Resume sequence now kicks off the MTRR reinit on all the AP's. 4. For some reason, between point 2 and 3, we moved from BP to one of the AP's. My guess is that printk() during resume sequence is contributing to this. We don't see similar behavior with the 64bit kernel but there is no guarantee that at this point the remaining resume sequence (after AP's bringup) has to happen on BP. 5. set_mtrr() was assuming that we are still on BP and skipped the MTRR/PAT init on that cpu (because of 1 above) 6. But we were on an AP and this led to not reprogramming PAT on this cpu leading to bad performance. Fix this by doing unconditional mtrr_if->set_all() in set_mtrr() during MTRR/PAT init. This might be unnecessary if we are still running on BP. But it is of no harm and will guarantee that after resume, all the cpu's will be in sync with respect to the MTRR/PAT registers. Signed-off-by: Suresh Siddha LKML-Reference: <1301438292-28370-1-git-send-email-eric@anholt.net> Signed-off-by: Eric Anholt Tested-by: Keith Packard Cc: stable@kernel.org [v2.6.32+] Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 307dfbbf4a8e..929739a653d1 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -293,14 +293,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ /* * HACK! - * We use this same function to initialize the mtrrs on boot. - * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. - * If we're doing that @reg is set to something special... + * + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * this cpu we still do mtrr_if->set_all(). During boot/resume, this + * is unnecessary if at this point we are still on the cpu that started + * the boot/resume sequence. But there is no guarantee that we are still + * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be + * sure that we are in sync with everyone else. */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); - else if (!mtrr_aps_delayed_init) + else mtrr_if->set_all(); /* Wait for the others */ -- cgit v1.2.1 From cb6c8520f6f6bba7b7e1a6de3360a8edfd8243b6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 30 Mar 2011 20:34:47 +0200 Subject: x86, amd-nb: Rename CPU PCI id define for F4 With increasing number of PCI function ids, add the PCI function id in the define name instead of its symbolic name in the BKDG for more clarity. This renames function 4 define. Signed-off-by: Borislav Petkov Cc: Jesse Barnes LKML-Reference: <20110330183447.GA3668@aftab> Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_nb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 6801959a8b2a..4c39baa8facc 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -21,7 +21,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { EXPORT_SYMBOL(amd_nb_misc_ids); static struct pci_device_id amd_nb_link_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_LINK) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, {} }; -- cgit v1.2.1 From 818987e9a19c52240ba9b1c20f28f047eef76072 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 31 Mar 2011 09:32:02 -0500 Subject: x86, UV: Fix kdump reboot After a crash dump on an SGI Altix UV system the crash kernel fails to cause a reboot. EFI mode is disabled in the kdump kernel, so only the reboot_type of BOOT_ACPI works. Signed-off-by: Cliff Wickman Cc: rja@sgi.com LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d2cf39bc5ecf..33b10a0fc095 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ #include #include #include +#include DEFINE_PER_CPU(int, x2apic_extra_bits); @@ -811,4 +813,11 @@ void __init uv_system_init(void) /* register Legacy VGA I/O redirection handler */ pci_register_set_vga_state(uv_set_vga_state); + + /* + * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as + * EFI is not enabled in the kdump kernel. + */ + if (is_kdump_kernel()) + reboot_type = BOOT_ACPI; } -- cgit v1.2.1 From a4dd99250dc49031e6a92a895dbcc230a4832083 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 1 Apr 2011 07:15:14 -0700 Subject: rcu: create new rcu_access_index() and use in mce The MCE subsystem needs to sample an RCU-protected index outside of any protection for that index. If this was a pointer, we would use rcu_access_pointer(), but there is no corresponding rcu_access_index(). This commit therefore creates an rcu_access_index() and applies it to MCE. Signed-off-by: Paul E. McKenney Tested-by: Zdenek Kabelac --- arch/x86/kernel/cpu/mcheck/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5a05ef63eb4a..3385ea26f684 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1626,7 +1626,7 @@ out: static unsigned int mce_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_wait, wait); - if (rcu_dereference_check_mce(mcelog.next)) + if (rcu_access_index(mcelog.next)) return POLLIN | POLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return POLLIN | POLLRDNORM; -- cgit v1.2.1 From 4da9484bdece39ab0b098fa711e095e3e9fc8684 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 6 Apr 2011 13:10:02 -0700 Subject: x86, hibernate: Initialize mmu_cr4_features during boot Restore the initialization of mmu_cr4_features during boot, which was removed without comment in checkin e5f15b45ddf3afa2bbbb10c7ea34fb32b6de0a0e x86: Cleanup highmap after brk is concluded thereby breaking resume from hibernate. This restores previous functionality in approximately the same place, and corrects the reading of %cr4 on pre-CPUID hardware (%cr4 exists if and only if CPUID is supported.) However, part of the problem is that the hibernate suspend/resume sequence should manage the save/restore of %cr4 explicitly. Signed-off-by: H. Peter Anvin Cc: Rafael J. Wysocki Cc: Stefano Stabellini Cc: Yinghai Lu LKML-Reference: <201104020154.57136.rjw@sisk.pl> --- arch/x86/kernel/setup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5a0484a95ad6..4be9b398470e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -976,6 +976,11 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); + if (boot_cpu_data.cpuid_level >= 0) { + /* A CPU has %cr4 if and only if it has CPUID */ + mmu_cr4_features = read_cr4(); + } + #ifdef CONFIG_X86_32 /* sync back kernel address range */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, -- cgit v1.2.1 From 7d6b46707f2491a94f4bd3b4329d2d7f809e9368 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 15 Apr 2011 20:39:01 +0900 Subject: x86, NUMA: Fix fakenuma boot failure Currently, numa=fake boot parameter is broken. If it's used, kernel may panic due to devide by zero error depending on CPU configuration Call Trace: [] find_busiest_group+0x38c/0xd30 [] ? local_clock+0x6f/0x80 [] load_balance+0xa3/0x600 [] idle_balance+0xf3/0x180 [] schedule+0x722/0x7d0 [] ? wait_for_common+0x128/0x190 [] schedule_timeout+0x265/0x320 [] ? lock_release_holdtime+0x35/0x1a0 [] ? wait_for_common+0x128/0x190 [] ? __lock_release+0x9c/0x1d0 [] ? _raw_spin_unlock_irq+0x30/0x40 [] ? _raw_spin_unlock_irq+0x30/0x40 [] wait_for_common+0x130/0x190 [] ? try_to_wake_up+0x510/0x510 [] wait_for_completion+0x1d/0x20 [] kthread_create_on_node+0xac/0x150 [] ? process_scheduled_works+0x40/0x40 [] ? wait_for_common+0x4f/0x190 [] __alloc_workqueue_key+0x1a3/0x590 [] cpuset_init_smp+0x6b/0x7b [] kernel_init+0xc3/0x182 [] kernel_thread_helper+0x4/0x10 [] ? retint_restore_args+0x13/0x13 [] ? start_kernel+0x400/0x400 [] ? gs_change+0x13/0x13 The divede by zero is caused by the following line, group->cpu_power==0: kernel/sched_fair.c::update_sg_lb_stats() /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; This regression was caused by commit e23bba6044 ("x86-64, NUMA: Unify emulated distance mapping") because it changes cpu -> node mapping in the process of dropping fake_physnodes(). old) all cpus are assinged node 0 now) cpus are assigned round robin (the logic is implemented by numa_init_array()) Note: The change in behavior only happens if the system doesn't have neither ACPI SRAT table nor AMD northbridge NUMA information. Round robin assignment doesn't work because init_numa_sched_groups_power() assumes all logical cpus in the same physical cpu share the same node (then it only accounts for group_first_cpu()), and the simple round robin breaks the above assumption. Thus, this patch implements a reassignment of node-ids if buggy firmware or numa emulation makes wrong cpu node map. Tt enforce all logical cpus in the same physical cpu share the same node. Signed-off-by: KOSAKI Motohiro Acked-by: Tejun Heo Cc: Yinghai Lu Cc: Brian Gerst Cc: Cyrill Gorcunov Cc: Shaohui Zheng Cc: David Rientjes Cc: H. Peter Anvin Link: http://lkml.kernel.org/r/20110415203928.1303.A69D9226@jp.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2871d3c71b6..8ed8908cc9f7 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -312,6 +312,26 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } +static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2) +{ + int node1 = early_cpu_to_node(cpu1); + int node2 = early_cpu_to_node(cpu2); + + /* + * Our CPU scheduler assumes all logical cpus in the same physical cpu + * share the same node. But, buggy ACPI or NUMA emulation might assign + * them to different node. Fix it. + */ + if (node1 != node2) { + pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n", + cpu1, node1, cpu2, node2, node2); + + numa_remove_cpu(cpu1); + numa_set_node(cpu1, node2); + numa_add_cpu(cpu1); + } +} + static void __cpuinit link_thread_siblings(int cpu1, int cpu2) { cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); @@ -320,6 +340,7 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2) cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); + check_cpu_siblings_on_same_node(cpu1, cpu2); } @@ -361,10 +382,12 @@ void __cpuinit set_cpu_sibling_map(int cpu) per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); + check_cpu_siblings_on_same_node(cpu, i); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(i)); + check_cpu_siblings_on_same_node(cpu, i); /* * Does this new cpu bringup a new core? */ -- cgit v1.2.1 From 5bbc097d890409d8eff4e3f1d26f11a9d6b7c07e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 15 Apr 2011 14:47:40 +0200 Subject: x86, amd: Disable GartTlbWlkErr when BIOS forgets it This patch disables GartTlbWlk errors on AMD Fam10h CPUs if the BIOS forgets to do is (or is just too old). Letting these errors enabled can cause a sync-flood on the CPU causing a reboot. The AMD BKDG recommends disabling GART TLB Wlk Error completely. This patch is the fix for https://bugzilla.kernel.org/show_bug.cgi?id=33012 on my machine. Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/20110415131152.GJ18463@8bytes.org Tested-by: Alexandre Demers Cc: Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3ecece0217ef..3532d3bf8105 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -615,6 +615,25 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* As a rule processors have APIC timer running in deep C states */ if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400)) set_cpu_cap(c, X86_FEATURE_ARAT); + + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. + */ + if (c->x86 == 0x10) { + /* + * BIOS should disable GartTlbWlk Errors themself. If + * it doesn't do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + u64 mask; + + rdmsrl(MSR_AMD64_MCx_MASK(4), mask); + mask |= (1 << 10); + wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + } } #ifdef CONFIG_X86_32 -- cgit v1.2.1 From c34151a742d84ae65db2088ea30495063f697fbe Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 Apr 2011 15:45:45 +0200 Subject: x86, gart: Set DISTLBWALKPRB bit always The DISTLBWALKPRB bit must be set for the GART because the gatt table is mapped UC. But the current code does not set the bit at boot when the BIOS setup the aperture correctly. Fix that by setting this bit when enabling the GART instead of the other places. Cc: Cc: Borislav Petkov Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/1303134346-5805-4-git-send-email-joerg.roedel@amd.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/aperture_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 86d1ad4962a7..73fb469908c6 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -499,7 +499,7 @@ out: * Don't enable translation yet but enable GART IO and CPU * accesses and set DISTLBWALKPRB since GART table memory is UC. */ - u32 ctl = DISTLBWALKPRB | aper_order << 1; + u32 ctl = aper_order << 1; bus = amd_nb_bus_dev_ranges[i].bus; dev_base = amd_nb_bus_dev_ranges[i].dev_base; -- cgit v1.2.1 From 665d3e2af83c8fbd149534db8f57d82fa6fa6753 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 Apr 2011 15:45:46 +0200 Subject: x86, gart: Make sure GART does not map physmem above 1TB The GART can only map physical memory below 1TB. Make sure the gart driver in the kernel does not try to map memory above 1TB. Cc: Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/1303134346-5805-5-git-send-email-joerg.roedel@amd.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/pci-gart_64.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 82ada01625b9..b117efd24f71 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -81,6 +81,9 @@ static u32 gart_unmapped_entry; #define AGPEXTERN #endif +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + /* backdoor interface to AGP driver */ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; @@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir, unsigned long align_mask) { unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); - unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); + unsigned long iommu_page; int i; + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return bad_dma_addr; + + iommu_page = alloc_iommu(dev, npages, align_mask); if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) return phys_mem; -- cgit v1.2.1 From 83112e688f5f05dea1e63787db9a6c16b2887a1d Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Sat, 16 Apr 2011 02:27:53 +0200 Subject: perf, x86: Fix pre-defined cache-misses event for AMD family 15h cpus With AMD cpu family 15h a unit mask was introduced for the Data Cache Miss event (0x041/L1-dcache-load-misses). We need to enable bit 0 (first data cache miss or streaming store to a 64 B cache line) of this mask to proper count data cache misses. Now we set this bit for all families and models. In case a PMU does not implement a unit mask for event 0x041 the bit is ignored. Signed-off-by: Andre Przywara Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1302913676-14352-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 461f62bbd774..4e1613845b9f 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -8,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(L1D) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ -- cgit v1.2.1 From 855357a21744e488cbee23a47d2b124035160a87 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Sat, 16 Apr 2011 02:27:54 +0200 Subject: perf, x86: Fix AMD family 15h FPU event constraints Depending on the unit mask settings some FPU events may be scheduled only on cpu counter #3. This patch fixes this. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1302913676-14352-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 4e1613845b9f..cf4e369cea67 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -427,7 +427,9 @@ static __initconst const struct x86_pmu amd_pmu = { * * Exceptions: * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) * 0x00B FP PERF_CTL[3] * 0x00D FP PERF_CTL[3] * 0x023 DE PERF_CTL[2:0] @@ -448,6 +450,8 @@ static __initconst const struct x86_pmu amd_pmu = { * 0x0DF LS PERF_CTL[5:0] * 0x1D6 EX PERF_CTL[5:0] * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used */ static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); @@ -460,18 +464,28 @@ static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); static struct event_constraint * amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) { - unsigned int event_code = amd_get_event_code(&event->hw); + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); switch (event_code & AMD_EVENT_TYPE_MASK) { case AMD_EVENT_FP: switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; case 0x003: case 0x00B: case 0x00D: return &amd_f15_PMC3; - default: - return &amd_f15_PMC53; } + return &amd_f15_PMC53; case AMD_EVENT_LS: case AMD_EVENT_DC: case AMD_EVENT_EX_LS: -- cgit v1.2.1 From 19234c0819da0e043a02710488dfd9b242b42eba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Apr 2011 00:36:11 +0200 Subject: PM: Add missing syscore_suspend() and syscore_resume() calls Device suspend/resume infrastructure is used not only by the suspend and hibernate code in kernel/power, but also by APM, Xen and the kexec jump feature. However, commit 40dc166cb5dddbd36aa4ad11c03915ea (PM / Core: Introduce struct syscore_ops for core subsystems PM) failed to add syscore_suspend() and syscore_resume() calls to that code, which generally leads to breakage when the features in question are used. To fix this problem, add the missing syscore_suspend() and syscore_resume() calls to arch/x86/kernel/apm_32.c, kernel/kexec.c and drivers/xen/manage.c. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Ian Campbell --- arch/x86/kernel/apm_32.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 0b4be431c620..adee12e0da1f 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -228,6 +228,7 @@ #include #include #include +#include #include #include @@ -1238,6 +1239,7 @@ static int suspend(int vetoable) local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); + syscore_suspend(); local_irq_enable(); @@ -1255,6 +1257,7 @@ static int suspend(int vetoable) apm_error("suspend", err); err = (err == APM_SUCCESS) ? 0 : -EIO; + syscore_resume(); sysdev_resume(); local_irq_enable(); @@ -1280,6 +1283,7 @@ static void standby(void) local_irq_disable(); sysdev_suspend(PMSG_SUSPEND); + syscore_suspend(); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1287,6 +1291,7 @@ static void standby(void) apm_error("standby", err); local_irq_disable(); + syscore_resume(); sysdev_resume(); local_irq_enable(); -- cgit v1.2.1 From 37f8527dbfd05af0f670aa02370d0c4cca7fbda6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 20 Apr 2011 19:19:10 -0700 Subject: Revert "x86, NUMA: Fix fakenuma boot failure" Andreas Herrmann reported that 7d6b46707f24 ("x86, NUMA: Fix fakenuma boot failure") causes certain physical NUMA topologies (for example AMD Magny-Cours) to move sibling cpus to a single node when in reality they are in separate domains. This may result in some nodes being completely void of cpus, which doesn't accurately represent the correct topology. The system will boot, but will have suboptimal NUMA performance. This commit was intended as a fix for NUMA emulation, but should not cause a regression for real NUMA machines as a side effect. ( There will be a separate fix for the numa-debug code, which will not affect physical topologies. ) Reported-by: Andreas Herrmann Signed-off-by: David Rientjes Acked-by: KOSAKI Motohiro Cc: Tejun Heo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1104201918110.12634@chino.kir.corp.google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8ed8908cc9f7..c2871d3c71b6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -312,26 +312,6 @@ void __cpuinit smp_store_cpu_info(int id) identify_secondary_cpu(c); } -static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2) -{ - int node1 = early_cpu_to_node(cpu1); - int node2 = early_cpu_to_node(cpu2); - - /* - * Our CPU scheduler assumes all logical cpus in the same physical cpu - * share the same node. But, buggy ACPI or NUMA emulation might assign - * them to different node. Fix it. - */ - if (node1 != node2) { - pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n", - cpu1, node1, cpu2, node2, node2); - - numa_remove_cpu(cpu1); - numa_set_node(cpu1, node2); - numa_add_cpu(cpu1); - } -} - static void __cpuinit link_thread_siblings(int cpu1, int cpu2) { cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); @@ -340,7 +320,6 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2) cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2)); cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1)); - check_cpu_siblings_on_same_node(cpu1, cpu2); } @@ -382,12 +361,10 @@ void __cpuinit set_cpu_sibling_map(int cpu) per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpumask_set_cpu(i, cpu_llc_shared_mask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(i)); - check_cpu_siblings_on_same_node(cpu, i); } if (c->phys_proc_id == cpu_data(i).phys_proc_id) { cpumask_set_cpu(i, cpu_core_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(i)); - check_cpu_siblings_on_same_node(cpu, i); /* * Does this new cpu bringup a new core? */ -- cgit v1.2.1 From b2508e828d71baacd9a743dd48dcbf85d96affdd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 21 Apr 2011 16:48:35 -0700 Subject: perf: Support Xeon E7's via the Westmere PMU driver There's a new model number public, 47, for Xeon E7 (aka Westmere EX). Signed-off-by: Andi Kleen Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/1303429715-10202-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8fc2b2cee1da..586cced12d1f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1425,6 +1425,7 @@ static __init int intel_pmu_init(void) case 37: /* 32 nm nehalem, "Clarkdale" */ case 44: /* 32 nm nehalem, "Gulftown" */ + case 47: /* 32 nm Xeon E7 */ memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, -- cgit v1.2.1 From b52c55c6a25e4515b5e075a989ff346fc251ed09 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Apr 2011 08:44:38 +0200 Subject: x86, perf event: Turn off unstructured raw event access to offcore registers Andi Kleen pointed out that the Intel offcore support patches were merged without user-space tool support to the functionality: | | The offcore_msr perf kernel code was merged into 2.6.39-rc*, but the | user space bits were not. This made it impossible to set the extra mask | and actually do the OFFCORE profiling | Andi submitted a preliminary patch for user-space support, as an extension to perf's raw event syntax: | | Some raw events -- like the Intel OFFCORE events -- support additional | parameters. These can be appended after a ':'. | | For example on a multi socket Intel Nehalem: | | perf stat -e r1b7:20ff -a sleep 1 | | Profile the OFFCORE_RESPONSE.ANY_REQUEST with event mask REMOTE_DRAM_0 | that measures any access to DRAM on another socket. | But this kind of usability is absolutely unacceptable - users should not be expected to type in magic, CPU and model specific incantations to get access to useful hardware functionality. The proper solution is to expose useful offcore functionality via generalized events - that way users do not have to care which specific CPU model they are using, they can use the conceptual event and not some model specific quirky hexa number. We already have such generalization in place for CPU cache events, and it's all very extensible. "Offcore" events measure general DRAM access patters along various parameters. They are particularly useful in NUMA systems. We want to support them via generalized DRAM events: either as the fourth level of cache (after the last-level cache), or as a separate generalization category. That way user-space support would be very obvious, memory access profiling could be done via self-explanatory commands like: perf record -e dram ./myapp perf record -e dram-remote ./myapp ... to measure DRAM accesses or more expensive cross-node NUMA DRAM accesses. These generalized events would work on all CPUs and architectures that have comparable PMU features. ( Note, these are just examples: actual implementation could have more sophistication and more parameter - as long as they center around similarly simple usecases. ) Now we do not want to revert *all* of the current offcore bits, as they are still somewhat useful for generic last-level-cache events, implemented in this commit: e994d7d23a0b: perf: Fix LLC-* events on Intel Nehalem/Westmere But we definitely do not yet want to expose the unstructured raw events to user-space, until better generalization and usability is implemented for these hardware event features. ( Note: after generalization has been implemented raw offcore events can be supported as well: there can always be an odd event that is marginally useful but not useful enough to generalize. DRAM profiling is definitely *not* such a category so generalization must be done first. ) Furthermore, PERF_TYPE_RAW access to these registers was not intended to go upstream without proper support - it was a side-effect of the above e994d7d23a0b commit, not mentioned in the changelog. As v2.6.39 is nearing release we go for the simplest approach: disable the PERF_TYPE_RAW offcore hack for now, before it escapes into a released kernel and becomes an ABI. Once proper structure is implemented for these hardware events and users are offered usable solutions we can revisit this issue. Reported-by: Andi Kleen Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1302658203-4239-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index eed3673a8656..632e5dc9c9c0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -586,8 +586,12 @@ static int x86_setup_perfctr(struct perf_event *event) return -EOPNOTSUPP; } + /* + * Do not allow config1 (extended registers) to propagate, + * there's no sane user-space generalization yet: + */ if (attr->type == PERF_TYPE_RAW) - return x86_pmu_extra_regs(event->attr.config, event); + return 0; if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); -- cgit v1.2.1 From 1ea5a6afd95a4803900c97ed63a47a883ebe7b3e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 21 Apr 2011 11:03:21 -0400 Subject: perf, x86: P4 PMU - Don't forget to clear cpuc->active_mask on overflow It's not enough to simply disable event on overflow the cpuc->active_mask should be cleared as well otherwise counter may stall in "active" even in real being already disabled (which potentially may lead to the situation that user may not use this counter further). Don pointed out that: " I also noticed this patch fixed some unknown NMIs on a P4 when I stressed the box". Tested-by: Lin Ming Signed-off-by: Cyrill Gorcunov Acked-by: Don Zickus Signed-off-by: Don Zickus Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1303398203-2918-3-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index c2520e178d32..d1f77e2934a1 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -947,7 +947,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; if (perf_event_overflow(event, 1, &data, regs)) - p4_pmu_disable_event(event); + x86_pmu_stop(event, 0); } if (handled) { -- cgit v1.2.1 From f4929bd37208540c2c6f416e9035ff1938f2dbc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Apr 2011 13:39:56 +0200 Subject: perf, x86: Update/fix Intel Nehalem cache events Change the Nehalem cache events to use retired memory instruction counters (similar to Westmere), this greatly improves the provided stats. Using: main () { int i; for (i = 0; i < 1000000000; i++) { asm("mov (%%rsp), %%rbx;" "mov %%rbx, (%%rsp);" : : : "rbx"); } } We find: $ perf stat --repeat 10 -e instructions:u -e l1-dcache-loads:u -e l1-dcache-stores:u ./loop_1b_loads+stores Performance counter stats for './loop_1b_loads+stores' (10 runs): 4,000,081,056 instructions:u # 0.000 IPC ( +- 0.000% ) 4,999,502,846 l1-dcache-loads:u ( +- 0.008% ) 1,000,034,832 l1-dcache-stores:u ( +- 0.000% ) 1.565184942 seconds time elapsed ( +- 0.005% ) The 5b is surprising - we'd expect 1b: $ perf stat --repeat 10 -e instructions:u -e r10b:u -e l1-dcache-stores:u ./loop_1b_loads+stores Performance counter stats for './loop_1b_loads+stores' (10 runs): 4,000,081,054 instructions:u # 0.000 IPC ( +- 0.000% ) 1,000,021,961 r10b:u ( +- 0.000% ) 1,000,030,951 l1-dcache-stores:u ( +- 0.000% ) 1.565055422 seconds time elapsed ( +- 0.003% ) Which this patch thus fixes. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt Cc: Stephane Eranian Cc: Lin Ming Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/n/tip-q9rtru7b7840tws75xzboapv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 586cced12d1f..43fa20b13817 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -391,12 +391,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids { [ C(L1D) ] = { [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ }, [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ }, [ C(OP_PREFETCH) ] = { [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ -- cgit v1.2.1 From 18a073a3acd3a47fbb5e23333df7fad28d576345 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Apr 2011 13:24:33 +0200 Subject: perf, x86: Fix BTS condition Currently the x86 backend incorrectly assumes that any BRANCH_INSN with sample_period==1 is a BTS request. This is not true when we do frequency driven profiling such as 'perf record -e branches'. Solves this error: $ perf record -e branches ./array Error: sys_perf_event_open() syscall returned with 95 (Operation not supported). Signed-off-by: Peter Zijlstra Reported-by: Ingo Molnar Cc: "Metzger, Markus T" Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-rd2y4ct71hjawzz6fpvsy9hg@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- arch/x86/kernel/cpu/perf_event_intel.c | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 632e5dc9c9c0..fac0654021b8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -613,8 +613,8 @@ static int x86_setup_perfctr(struct perf_event *event) /* * Branch tracing: */ - if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && - (hwc->sample_period == 1)) { + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { /* BTS is not supported by this architecture. */ if (!x86_pmu.bts_active) return -EOPNOTSUPP; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 43fa20b13817..9194b0698d63 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -998,6 +998,9 @@ intel_bts_constraints(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; unsigned int hw_event, bts_event; + if (event->attr.freq) + return NULL; + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); -- cgit v1.2.1 From ec75a71634dabe439db91c1ef51d5099f4493808 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 11:51:41 +0200 Subject: perf events, x86: Work around the Nehalem AAJ80 erratum On Nehalem CPUs the retired branch-misses event can be completely bogus, when there are no branch-misses occuring. When there are a lot of branch misses then the count is pretty accurate. Still, this leaves us with an event that over-counts a lot. Detect this erratum and work it around by using BR_MISP_EXEC.ANY events. These will also count speculated branches but still it's a lot more precise in practice than the architectural event. Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-yyfg0bxo9jsqxd6a0ovfny27@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9194b0698d63..9ae4a2aa7398 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -25,7 +25,7 @@ struct intel_percore { /* * Intel PerfMon, used on Core and later. */ -static const u64 intel_perfmon_event_map[] = +static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, @@ -1308,7 +1308,7 @@ static void intel_clovertown_quirks(void) * AJ106 could possibly be worked around by not allowing LBR * usage from PEBS, including the fixup. * AJ68 could possibly be worked around by always programming - * a pebs_event_reset[0] value and coping with the lost events. + * a pebs_event_reset[0] value and coping with the lost events. * * But taken together it might just make sense to not enable PEBS on * these chips. @@ -1412,6 +1412,18 @@ static __init int intel_pmu_init(void) x86_pmu.percore_constraints = intel_nehalem_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; + + if (ebx & 0x40) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + + pr_cont("erratum AAJ80 worked around, "); + } pr_cont("Nehalem events, "); break; -- cgit v1.2.1 From 2bce5daca28346f19c190dbdb5542c9fe3e8c6e6 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 27 Apr 2011 06:32:33 -0400 Subject: perf, x86, nmi: Move LVT un-masking into irq handlers It was noticed that P4 machines were generating double NMIs for each perf event. These extra NMIs lead to 'Dazed and confused' messages on the screen. I tracked this down to a P4 quirk that said the overflow bit had to be cleared before re-enabling the apic LVT mask. My first attempt was to move the un-masking inside the perf nmi handler from before the chipset NMI handler to after. This broke Nehalem boxes that seem to like the unmasking before the counters themselves are re-enabled. In order to keep this change simple for 2.6.39, I decided to just simply move the apic LVT un-masking to the beginning of all the chipset NMI handlers, with the exception of Pentium4's to fix the double NMI issue. Later on we can move the un-masking to later in the handlers to save a number of 'extra' NMIs on those particular chipsets. I tested this change on a P4 machine, an AMD machine, a Nehalem box, and a core2quad box. 'perf top' worked correctly along with various other small 'perf record' runs. Anything high stress breaks all the machines but that is a different problem. Thanks to various people for testing different versions of this patch. Reported-and-tested-by: Shaun Ruffell Signed-off-by: Don Zickus Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1303900353-10242-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar CC: Cyrill Gorcunov --- arch/x86/kernel/cpu/perf_event.c | 12 ++++++++++-- arch/x86/kernel/cpu/perf_event_intel.c | 10 ++++++++++ arch/x86/kernel/cpu/perf_event_p4.c | 17 +++++++++++++---- 3 files changed, 33 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fac0654021b8..e638689279d3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1288,6 +1288,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This generic handler doesn't seem to have any issues where the + * unmasking occurs so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) { /* @@ -1374,8 +1384,6 @@ perf_event_nmi_handler(struct notifier_block *self, return NOTIFY_DONE; } - apic_write(APIC_LVTPC, APIC_DM_NMI); - handled = x86_pmu.handle_irq(args->regs); if (!handled) return NOTIFY_DONE; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9ae4a2aa7398..e61539b07d2c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -933,6 +933,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); + /* + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This handler doesn't seem to have any issues with the unmasking + * so it was left at the top. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index d1f77e2934a1..e93fcd55fae1 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -950,11 +950,20 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) x86_pmu_stop(event, 0); } - if (handled) { - /* p4 quirk: unmask it again */ - apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + if (handled) inc_irq_stat(apic_perf_irqs); - } + + /* + * When dealing with the unmasking of the LVTPC on P4 perf hw, it has + * been observed that the OVF bit flag has to be cleared first _before_ + * the LVTPC can be unmasked. + * + * The reason is the NMI line will continue to be asserted while the OVF + * bit is set. This causes a second NMI to generate if the LVTPC is + * unmasked before the OVF bit is cleared, leading to unknown NMI + * messages. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); return handled; } -- cgit v1.2.1 From 20443598d9bdfe3563f901e27fd482a3f5d3d231 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 27 Apr 2011 16:30:52 +0200 Subject: x86: devicetree: Configure IOAPIC pin only once We use io_apic_setup_irq_pin() in order to configure pin's interrupt number polarity and type. This is done on every irq_create_of_mapping() which happens for instance during pci enable calls. Level typed interrupts are masked by default, edge are unmasked. On the first ->xlate() call the level interrupt is configured and masked. The driver calls request_irq() and the line is unmasked. Lets assume the interrupt line is shared with another device and we call pci_enable_device() for this device. The ->xlate() configures the pin again and it is masked. request_irq() does not unmask the line because it _is_ already unmasked according to its internal state. So the interrupt will never be unmasked again. This patch is based on an earlier work by Torben Hohn and solves the problem by configuring the pin only once. Since all devices must agree on the same type and polarity there is no point in configuring the pin more than once. [ tglx: Split out the ce4100 part into a separate patch ] Cc: Torben Hohn Signed-off-by: Sebastian Andrzej Siewior Link: http://lkml.kernel.org/r/%3C20110427143052.GA15211%40linutronix.de%3E Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 10 +++++----- arch/x86/kernel/devicetree.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 68df09bba92e..45fd33d1fd3a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -128,8 +128,8 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -static int io_apic_setup_irq_pin_once(unsigned int irq, int node, - struct io_apic_irq_attr *attr); +static int io_apic_setup_irq_pin(unsigned int irq, int node, + struct io_apic_irq_attr *attr); /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */ void mp_save_irq(struct mpc_intsrc *m) @@ -3570,7 +3570,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ -int +static int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) { struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node); @@ -3585,8 +3585,8 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr) return ret; } -static int io_apic_setup_irq_pin_once(unsigned int irq, int node, - struct io_apic_irq_attr *attr) +int io_apic_setup_irq_pin_once(unsigned int irq, int node, + struct io_apic_irq_attr *attr) { unsigned int id = attr->ioapic, pin = attr->ioapic_pin; int ret; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 706a9fb46a58..e90f08458e6b 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -391,7 +391,7 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); - return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr); + return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr); } static void __init ioapic_add_ofnode(struct device_node *np) -- cgit v1.2.1 From e20a2d205c05cef6b5783df339a7d54adeb50962 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 29 Apr 2011 17:47:43 -0400 Subject: x86, AMD: Fix APIC timer erratum 400 affecting K8 Rev.A-E processors Older AMD K8 processors (Revisions A-E) are affected by erratum 400 (APIC timer interrupts don't occur in C states greater than C1). This, for example, means that X86_FEATURE_ARAT flag should not be set for these parts. This addresses regression introduced by commit b87cf80af3ba4b4c008b4face3c68d604e1715c6 ("x86, AMD: Set ARAT feature on AMD processors") where the system may become unresponsive until external interrupt (such as keyboard input) occurs. This results, for example, in time not being reported correctly, lack of progress on the system and other lockups. Reported-by: Joerg-Volker Peetz Tested-by: Joerg-Volker Peetz Acked-by: Borislav Petkov Signed-off-by: Boris Ostrovsky Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1304113663-6586-1-git-send-email-ostr@amd64.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3532d3bf8105..bb9eb29a52dd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -698,7 +698,7 @@ cpu_dev_register(amd_cpu_dev); */ const int amd_erratum_400[] = - AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0x0f, 0x4, 0x2, 0xff, 0xf), AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); EXPORT_SYMBOL_GPL(amd_erratum_400); -- cgit v1.2.1 From 7806a49ab625ebeb1709e5e87299b64932b807a7 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 2 May 2011 14:33:24 -0700 Subject: x86, reboot: Fix relocations in reboot_32.S The use of base for %ebx in this file is arbitrary, *except* that we also use it to compute the real-mode segment. Therefore, make it so that r_base really is the true address to which %ebx points. This resolves kernel bugzilla 33302. Reported-and-tested-by: Alexey Zaytsev Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/n/tip-08os5wi3yq1no0y4i5m4z7he@git.kernel.org --- arch/x86/kernel/reboot_32.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S index 29092b38d816..1d5c46df0d78 100644 --- a/arch/x86/kernel/reboot_32.S +++ b/arch/x86/kernel/reboot_32.S @@ -21,26 +21,26 @@ r_base = . /* Get our own relocated address */ call 1f 1: popl %ebx - subl $1b, %ebx + subl $(1b - r_base), %ebx /* Compute the equivalent real-mode segment */ movl %ebx, %ecx shrl $4, %ecx /* Patch post-real-mode segment jump */ - movw dispatch_table(%ebx,%eax,2),%ax - movw %ax, 101f(%ebx) - movw %cx, 102f(%ebx) + movw (dispatch_table - r_base)(%ebx,%eax,2),%ax + movw %ax, (101f - r_base)(%ebx) + movw %cx, (102f - r_base)(%ebx) /* Set up the IDT for real mode. */ - lidtl machine_real_restart_idt(%ebx) + lidtl (machine_real_restart_idt - r_base)(%ebx) /* * Set up a GDT from which we can load segment descriptors for real * mode. The GDT is not used in real mode; it is just needed here to * prepare the descriptors. */ - lgdtl machine_real_restart_gdt(%ebx) + lgdtl (machine_real_restart_gdt - r_base)(%ebx) /* * Load the data segment registers with 16-bit compatible values -- cgit v1.2.1