From eba9fe93a2959ec7f195c47c9db6ce7b5114ce1f Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Tue, 18 Mar 2008 15:24:32 -0500 Subject: [CPUFREQ] powernow-k8: improve error messages The most common error with powernow-k8 is an ACPI _PSS error caused either by failure to load the ACPI processor module or a bad parse of the _PSS object. Make the error message returned to the user in these situations more straightforward and easier to understand. -Mark Langsdorf Operating System Research Center AMD Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 46d4034d9f37..206791eb46e3 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1127,12 +1127,23 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) * an UP version, and is deprecated by AMD. */ if (num_online_cpus() != 1) { - printk(KERN_ERR PFX "MP systems not supported by PSB BIOS structure\n"); +#ifndef CONFIG_ACPI_PROCESSOR + printk(KERN_ERR PFX "ACPI Processor support is required " + "for SMP systems but is absent. Please load the " + "ACPI Processor module before starting this " + "driver.\n"); +#else + printk(KERN_ERR PFX "Your BIOS does not provide ACPI " + "_PSS objects in a way that Linux understands. " + "Please report this to the Linux ACPI maintainers" + " and complain to your BIOS vendor.\n"); +#endif kfree(data); return -ENODEV; } if (pol->cpu != 0) { - printk(KERN_ERR PFX "No _PSS objects for CPU other than CPU0\n"); + printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than " + "CPU0. Complain to your BIOS vendor.\n"); kfree(data); return -ENODEV; } -- cgit v1.2.1 From 667ad4f70110357e8f024e81741c7bd1d7906e7d Mon Sep 17 00:00:00 2001 From: maximilian attems Date: Thu, 8 May 2008 22:10:01 +0200 Subject: [CPUFREQ] Crusoe: longrun cpufreq module reports false min freq The longrun cpufreq module reports a false minimum frequency 3MHz on 300-600MHz Crusoe processor. This may be due to a calculation bug in the module. Original patch from Kaz Sasayama submitted as http://bugs.debian.org/468149 patch ported to x86 Cc: Kaz Sasayama Signed-off-by: maximilian attems Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longrun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index af4a867a097c..777a7ff075de 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -245,7 +245,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, if ((ecx > 95) || (ecx == 0) || (eax < ebx)) return -EIO; - edx = (eax - ebx) / (100 - ecx); + edx = ((eax - ebx) * 100) / (100 - ecx); *low_freq = edx * 1000; /* back to kHz */ dprintk("low frequency is %u kHz\n", *low_freq); -- cgit v1.2.1 From b6db80ee1331e7beaeb91b4b3d946dd16c72e388 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 18 May 2008 19:27:48 +0200 Subject: x86: fix setup of cyc2ns in tsc_64.c When the TSC is calibrated against the PIT due to the nonavailability of PMTIMER/HPET or due to SMI interference then the setup of the per CPU cyc2ns variables is skipped. This is unlikely to happen but it would definitely render sched_clock() unusable. This was introduced with commit 53d517cdbaac704352b3d0c10fecb99e0b54572e x86: scale cyc_2_nsec according to CPU frequency Update the per CPU cyc2ns variables in all exit pathes of tsc_calibrate. Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- arch/x86/kernel/tsc_64.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index fcc16e58609e..1784b8077a12 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -227,14 +227,14 @@ void __init tsc_calibrate(void) /* hpet or pmtimer available ? */ if (!hpet && !pm1 && !pm2) { printk(KERN_INFO "TSC calibrated against PIT\n"); - return; + goto out; } /* Check, whether the sampling was disturbed by an SMI */ if (tsc1 == ULONG_MAX || tsc2 == ULONG_MAX) { printk(KERN_WARNING "TSC calibration disturbed by SMI, " "using PIT calibration result\n"); - return; + goto out; } tsc2 = (tsc2 - tsc1) * 1000000L; @@ -255,6 +255,7 @@ void __init tsc_calibrate(void) tsc_khz = tsc2 / tsc1; +out: for_each_possible_cpu(cpu) set_cyc2ns_scale(tsc_khz, cpu); } -- cgit v1.2.1 From 9ccc906c97e34fd91dc6aaf5b69b52d824386910 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 May 2008 12:31:00 +0200 Subject: x86: distangle user disabled TSC from unstable tsc_enabled is set to 0 from the command line switch "notsc" and from the mark_tsc_unstable code. Seperate those functionalities and replace tsc_enable with tsc_disable. This makes also the native_sched_clock() decision when to use TSC understandable. Preparatory patch to solve the sched_clock() issue on 32 bit. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_32.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index e4790728b224..b087d691f165 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -14,7 +14,7 @@ #include "mach_timer.h" -static int tsc_enabled; +static int tsc_disabled; /* * On some systems the TSC frequency does not @@ -28,8 +28,8 @@ EXPORT_SYMBOL_GPL(tsc_khz); static int __init tsc_setup(char *str) { printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC completely.\n"); - mark_tsc_unstable("user disabled TSC"); + "cannot disable TSC completely.\n"); + tsc_disabled = 1; return 1; } #else @@ -120,7 +120,7 @@ unsigned long long native_sched_clock(void) * very important for it to be as fast as the platform * can achive it. ) */ - if (unlikely(!tsc_enabled && !tsc_unstable)) + if (unlikely(tsc_disabled)) /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); @@ -322,7 +322,6 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; - tsc_enabled = 0; printk("Marking TSC unstable due to: %s.\n", reason); /* Can be called before registration */ if (clocksource_tsc.mult) @@ -336,7 +335,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d) { printk(KERN_NOTICE "%s detected: marking TSC unstable.\n", - d->ident); + d->ident); tsc_unstable = 1; return 0; } @@ -403,8 +402,11 @@ void __init tsc_init(void) { int cpu; - if (!cpu_has_tsc) + if (!cpu_has_tsc || tsc_disabled) { + /* Disable the TSC in case of !cpu_has_tsc */ + tsc_disabled = 1; return; + } cpu_khz = calculate_cpu_khz(); tsc_khz = cpu_khz; @@ -441,8 +443,6 @@ void __init tsc_init(void) if (check_tsc_unstable()) { clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; - } else - tsc_enabled = 1; - + } clocksource_register(&clocksource_tsc); } -- cgit v1.2.1 From 74dc51a3de06aa516e3b9fdc4017b2aeb38bf44b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 18 May 2008 22:17:59 +0200 Subject: x86: disable TSC for sched_clock() when calibration failed When the TSC calibration fails then TSC is still used in sched_clock(). Disable it completely in that case. Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- arch/x86/kernel/tsc_32.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index b087d691f165..068759db63dd 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -413,6 +413,11 @@ void __init tsc_init(void) if (!cpu_khz) { mark_tsc_unstable("could not calculate TSC khz"); + /* + * We need to disable the TSC completely in this case + * to prevent sched_clock() from using it. + */ + tsc_disabled = 1; return; } -- cgit v1.2.1 From 2584a82deed7196f48066f1b1a7fad4ec5bea961 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Tue, 20 May 2008 18:18:12 -0400 Subject: x86: don't read maxlvt before checking if APIC is mapped A check for unmapped apic was added before reading maxlvt but the early read of maxlvt wasn't removed. Signed-off-by: Chuck Ebbert Cc: Andi Kleen Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- arch/x86/kernel/apic_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 5910020c3f24..0633cfd0dc29 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -534,7 +534,7 @@ int setup_profiling_timer(unsigned int multiplier) */ void clear_local_APIC(void) { - int maxlvt = lapic_get_maxlvt(); + int maxlvt; u32 v; /* APIC hasn't been mapped yet */ -- cgit v1.2.1 From 2ddfd20e7c55421435cbf95a5ed3dd6e423cf934 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 22 May 2008 10:37:48 +0200 Subject: namespacecheck: automated fixes Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvmclock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 4bc1be5d5472..08a30986d472 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -53,7 +53,7 @@ static cycle_t kvm_clock_read(void); * have elapsed since the hypervisor wrote the data. So we try to account for * that with system time */ -unsigned long kvm_get_wallclock(void) +static unsigned long kvm_get_wallclock(void) { u32 wc_sec, wc_nsec; u64 delta; @@ -86,7 +86,7 @@ unsigned long kvm_get_wallclock(void) return ts.tv_sec + 1; } -int kvm_set_wallclock(unsigned long now) +static int kvm_set_wallclock(unsigned long now) { return 0; } -- cgit v1.2.1 From db9f600b96c16bb3c7f094e294fbdd370226ad86 Mon Sep 17 00:00:00 2001 From: Miquel van Smoorenburg Date: Wed, 28 May 2008 10:31:25 +0200 Subject: x86: pci-dma.c: use __GFP_NO_OOM instead of __GFP_NORETRY On Wed, 2008-05-28 at 04:47 +0200, Andi Kleen wrote: > > So... why not just remove the setting of __GFP_NORETRY? Why is it > > wrong to oom-kill things in this case? > > When the 16MB zone overflows (which can be common in some workloads) > calling the OOM killer is pretty useless because it has barely any > real user data [only exception would be the "only 16MB" case Alan > mentioned]. Killing random processes in this case is bad. > > I think for 16MB __GFP_NORETRY is ok because there should be > nothing freeable in there so looping is useless. Only exception would be the > "only 16MB total" case again but I'm not sure 2.6 supports that at all > on x86. > > On the other hand d_a_c() does more allocations than just 16MB, especially > on 64bit and the other zones need different strategies. Okay, so how about this then ? Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index c5ef1af8e79d..069e843f0b93 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -397,9 +397,6 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dev->dma_mask == NULL) return NULL; - /* Don't invoke OOM killer */ - gfp |= __GFP_NORETRY; - #ifdef CONFIG_X86_64 /* Why <=? Even when the mask is smaller than 4GB it is often larger than 16MB and in this case we have a chance of @@ -410,7 +407,9 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, #endif again: - page = dma_alloc_pages(dev, gfp, get_order(size)); + /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ + page = dma_alloc_pages(dev, + (gfp & GFP_DMA) ? gfp | __GFP_NORETRY : gfp, get_order(size)); if (page == NULL) return NULL; -- cgit v1.2.1 From f529626a86d61897862aa1bbbb4537773209238e Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 29 May 2008 00:30:21 -0700 Subject: suspend-vs-iommu: prevent suspend if we could not resume iommu/gart support misses suspend/resume code, which can do bad stuff, including memory corruption on resume. Prevent system suspend in case we would be unable to resume. Signed-off-by: Pavel Machek Tested-by: Patrick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c07455d1695f..aa8ec928caa8 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -548,6 +549,28 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) return aper_base; } +static int gart_resume(struct sys_device *dev) +{ + return 0; +} + +static int gart_suspend(struct sys_device *dev, pm_message_t state) +{ + return -EINVAL; +} + +static struct sysdev_class gart_sysdev_class = { + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, + +}; + +static struct sys_device device_gart = { + .id = 0, + .cls = &gart_sysdev_class, +}; + /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. @@ -558,7 +581,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - int i; + int i, error; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -606,6 +629,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info) pci_write_config_dword(dev, 0x90, ctl); } + + error = sysdev_class_register(&gart_sysdev_class); + if (!error) + error = sysdev_register(&device_gart); + if (error) + panic("Could not register gart_sysdev -- would corrupt data on next suspend"); flush_gart(); printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", -- cgit v1.2.1 From fb3bbd6a663fe972611676381adc4c60ddfe61ac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 22 May 2008 18:22:30 -0700 Subject: x86: fix APIC warning on 32bit v2 for http://bugzilla.kernel.org/show_bug.cgi?id=10613 BIOS bug, APIC version is 0 for CPU#0! fixing up to 0x10. (tell your hw vendor) v2: fix 64 bit compilation Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: "Rafael J. Wysocki" Cc: Gabriel C Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c49ebcc6c41e..33c5216fd3e1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -242,12 +242,19 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) static void __cpuinit acpi_register_lapic(int id, u8 enabled) { + unsigned int ver = 0; + if (!enabled) { ++disabled_cpus; return; } - generic_processor_info(id, 0); +#ifdef CONFIG_X86_32 + if (boot_cpu_physical_apicid != -1U) + ver = apic_version[boot_cpu_physical_apicid]; +#endif + + generic_processor_info(id, ver); } static int __init @@ -767,8 +774,13 @@ static void __init acpi_register_lapic_address(unsigned long address) mp_lapic_addr = address; set_fixmap_nocache(FIX_APIC_BASE, address); - if (boot_cpu_physical_apicid == -1U) + if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); +#ifdef CONFIG_X86_32 + apic_version[boot_cpu_physical_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); +#endif + } } static int __init early_acpi_parse_madt_lapic_addr_ovr(void) -- cgit v1.2.1 From deef325086c3897393b8f7d6bccd03405244fe18 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 15:44:38 +0200 Subject: x86: disable preemption in native_smp_prepare_cpus Priit Laes reported the following warning: Call Trace: [] warn_on_slowpath+0x51/0x63 [] sys_ioctl+0x2d/0x5d [] _spin_lock+0xe/0x24 [] task_rq_lock+0x3d/0x73 [] set_cpu_sibling_map+0x336/0x350 [] read_apic_id+0x30/0x62 [] verify_local_APIC+0x90/0x138 [] native_smp_prepare_cpus+0x1f9/0x305 [] kernel_init+0x59/0x2d9 [] _spin_unlock_irq+0x11/0x2b [] child_rip+0xa/0x12 [] kernel_init+0x0/0x2d9 [] child_rip+0x0/0x12 fix this by generally disabling preemption in native_smp_prepare_cpus(). Reported-and-bisected-by: Priit Laes Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/smpboot.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 38988491c622..56078d61c793 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1190,6 +1190,7 @@ static void __init smp_cpu_index_default(void) */ void __init native_smp_prepare_cpus(unsigned int max_cpus) { + preempt_disable(); nmi_watchdog_default(); smp_cpu_index_default(); current_cpu_data = boot_cpu_data; @@ -1206,7 +1207,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); disable_smp(); - return; + goto out; } preempt_disable(); @@ -1246,6 +1247,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); setup_boot_clock(); +out: + preempt_enable(); } /* * Early setup to make printk work. -- cgit v1.2.1 From e8a496ac8cd00cabbdaa373db4818a9ad19a1c5a Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 23 May 2008 16:26:37 -0700 Subject: x86: fix broken math-emu with lazy allocation of fpu area Fix the math emulation that got broken with the recent lazy allocation of FPU area. init_fpu() need to be added for the math-emulation path aswell for the FPU area allocation. math emulation enabled kernel booted fine with this, in the presence of "no387 nofxsr" boot param. Signed-off-by: Suresh Siddha Cc: hpa@zytor.com Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i387.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index e03cc952f233..eb9ddd8efb82 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -56,6 +56,11 @@ void __cpuinit mxcsr_feature_mask_init(void) void __init init_thread_xstate(void) { + if (!HAVE_HWFP) { + xstate_size = sizeof(struct i387_soft_struct); + return; + } + if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -94,7 +99,7 @@ void __cpuinit fpu_init(void) int init_fpu(struct task_struct *tsk) { if (tsk_used_math(tsk)) { - if (tsk == current) + if (HAVE_HWFP && tsk == current) unlazy_fpu(tsk); return 0; } @@ -109,6 +114,15 @@ int init_fpu(struct task_struct *tsk) return -ENOMEM; } +#ifdef CONFIG_X86_32 + if (!HAVE_HWFP) { + memset(tsk->thread.xstate, 0, xstate_size); + finit(); + set_stopped_child_used_math(tsk); + return 0; + } +#endif + if (cpu_has_fxsr) { struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; @@ -330,13 +344,13 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - if (!HAVE_HWFP) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); - ret = init_fpu(target); if (ret) return ret; + if (!HAVE_HWFP) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + if (!cpu_has_fxsr) { return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fsave, 0, @@ -360,15 +374,15 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, struct user_i387_ia32_struct env; int ret; - if (!HAVE_HWFP) - return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); - ret = init_fpu(target); if (ret) return ret; set_stopped_child_used_math(target); + if (!HAVE_HWFP) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + if (!cpu_has_fxsr) { return user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fsave, 0, -1); @@ -474,18 +488,18 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) int restore_i387_ia32(struct _fpstate_ia32 __user *buf) { int err; + struct task_struct *tsk = current; - if (HAVE_HWFP) { - struct task_struct *tsk = current; - + if (HAVE_HWFP) clear_fpu(tsk); - if (!used_math()) { - err = init_fpu(tsk); - if (err) - return err; - } + if (!used_math()) { + err = init_fpu(tsk); + if (err) + return err; + } + if (HAVE_HWFP) { if (cpu_has_fxsr) err = restore_i387_fxsave(buf); else -- cgit v1.2.1 From cd76374e9de4501acc74f833dc6cb5e7a5dca115 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 29 May 2008 00:30:21 -0700 Subject: suspend-vs-iommu: prevent suspend if we could not resume iommu/gart support misses suspend/resume code, which can do bad stuff, including memory corruption on resume. Prevent system suspend in case we would be unable to resume. Signed-off-by: Pavel Machek Tested-by: Patrick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index c07455d1695f..aa8ec928caa8 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -548,6 +549,28 @@ static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) return aper_base; } +static int gart_resume(struct sys_device *dev) +{ + return 0; +} + +static int gart_suspend(struct sys_device *dev, pm_message_t state) +{ + return -EINVAL; +} + +static struct sysdev_class gart_sysdev_class = { + .name = "gart", + .suspend = gart_suspend, + .resume = gart_resume, + +}; + +static struct sys_device device_gart = { + .id = 0, + .cls = &gart_sysdev_class, +}; + /* * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. @@ -558,7 +581,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - int i; + int i, error; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -606,6 +629,12 @@ static __init int init_k8_gatt(struct agp_kern_info *info) pci_write_config_dword(dev, 0x90, ctl); } + + error = sysdev_class_register(&gart_sysdev_class); + if (!error) + error = sysdev_register(&device_gart); + if (error) + panic("Could not register gart_sysdev -- would corrupt data on next suspend"); flush_gart(); printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", -- cgit v1.2.1 From 870568b39064cab2dd971fe57969916036982862 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 2 Jun 2008 15:57:27 -0700 Subject: x86, fpu: fix CONFIG_PREEMPT=y corruption of application's FPU stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jürgen Mell reported an FPU state corruption bug under CONFIG_PREEMPT, and bisected it to commit v2.6.19-1363-gacc2076, "i386: add sleazy FPU optimization". Add tsk_used_math() checks to prevent calling math_state_restore() which can sleep in the case of !tsk_used_math(). This prevents making a blocking call in __switch_to(). Apparently "fpu_counter > 5" check is not enough, as in some signal handling and fork/exec scenarios, fpu_counter > 5 and !tsk_used_math() is possible. It's a side effect though. This is the failing scenario: process 'A' in save_i387_ia32() just after clear_used_math() Got an interrupt and pre-empted out. At the next context switch to process 'A' again, kernel tries to restore the math state proactively and sees a fpu_counter > 0 and !tsk_used_math() This results in init_fpu() during the __switch_to()'s math_state_restore() And resulting in fpu corruption which will be saved/restored (save_i387_fxsave and restore_i387_fxsave) during the remaining part of the signal handling after the context switch. Bisected-by: Jürgen Mell Signed-off-by: Suresh Siddha Tested-by: Jürgen Mell Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- arch/x86/kernel/process_32.c | 5 ++++- arch/x86/kernel/process_64.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476dfbb60d..6d5483356e74 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -649,8 +649,11 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now + * + * tsk_used_math() checks prevent calling math_state_restore(), + * which can sleep in the case of !tsk_used_math() */ - if (next_p->fpu_counter > 5) + if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); /* diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f39988b..ac54ff56df80 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -658,8 +658,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the * chances of needing FPU soon are obviously high now + * + * tsk_used_math() checks prevent calling math_state_restore(), + * which can sleep in the case of !tsk_used_math() */ - if (next_p->fpu_counter>5) + if (tsk_used_math(next_p) && next_p->fpu_counter > 5) math_state_restore(); return prev_p; } -- cgit v1.2.1 From b7f09ae583c49d28b2796d2fa5893dcf822e3a10 Mon Sep 17 00:00:00 2001 From: Miquel van Smoorenburg Date: Thu, 5 Jun 2008 18:14:44 +0200 Subject: x86, pci-dma.c: don't always add __GFP_NORETRY to gfp Currently arch/x86/kernel/pci-dma.c always adds __GFP_NORETRY to the allocation flags, because it wants to be reasonably sure not to deadlock when calling alloc_pages(). But really that should only be done in two cases: - when allocating memory in the lower 16 MB DMA zone. If there's no free memory there, waiting or OOM killing is of no use - when optimistically trying an allocation in the DMA32 zone when dma_mask < DMA_32BIT_MASK hoping that the allocation happens to fall within the limits of the dma_mask Also blindly adding __GFP_NORETRY to the the gfp variable might not be a good idea since we then also use it when calling dma_ops->alloc_coherent(). Clearing it might also not be a good idea, dma_alloc_coherent()'s caller might have set it on purpose. The gfp variable should not be clobbered. [ mingo@elte.hu: converted to delta patch ontop of previous version. ] Signed-off-by: Miquel van Smoorenburg Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 069e843f0b93..dc00a1331ace 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -378,6 +378,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, struct page *page; unsigned long dma_mask = 0; dma_addr_t bus; + int noretry = 0; /* ignore region specifiers */ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); @@ -397,19 +398,25 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dev->dma_mask == NULL) return NULL; + /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ + if (gfp & __GFP_DMA) + noretry = 1; + #ifdef CONFIG_X86_64 /* Why <=? Even when the mask is smaller than 4GB it is often larger than 16MB and in this case we have a chance of finding fitting memory in the next higher zone first. If not retry with true GFP_DMA. -AK */ - if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) + if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) { gfp |= GFP_DMA32; + if (dma_mask < DMA_32BIT_MASK) + noretry = 1; + } #endif again: - /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ page = dma_alloc_pages(dev, - (gfp & GFP_DMA) ? gfp | __GFP_NORETRY : gfp, get_order(size)); + noretry ? gfp | __GFP_NORETRY : gfp, get_order(size)); if (page == NULL) return NULL; -- cgit v1.2.1 From 3703f39965a197ebd91743fc38d0f640606b8da3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 4 Jun 2008 18:13:37 +0200 Subject: geode: fix modular build -tip testing found this build bug: MODPOST 331 modules ERROR: "geode_mfgpt_toggle_event" [drivers/watchdog/geodewdt.ko] undefined! ERROR: "geode_mfgpt_alloc_timer" [drivers/watchdog/geodewdt.ko] undefined! make[1]: *** [__modpost] Error 1 make: *** [modules] Error 2 with this config: http://redhat.com/~mingo/misc/config-Wed_Jun__4_18_01_59_CEST_2008.bad export those symbols. Signed-off-by: Ingo Molnar --- arch/x86/kernel/mfgpt_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 3cad17fe026b..07c0f828f488 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -155,6 +155,7 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) wrmsr(msr, value, dummy); return 0; } +EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) { @@ -222,6 +223,7 @@ int geode_mfgpt_alloc_timer(int timer, int domain) /* No timers available - too bad */ return -1; } +EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer); #ifdef CONFIG_GEODE_MFGPT_TIMER -- cgit v1.2.1 From 86b2b70e156203149c3861455feec54bc4906e6d Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Mon, 2 Jun 2008 17:21:06 -0400 Subject: x86: fix asm warning in head_32.S On Mon, May 19, 2008 at 04:10:02PM -0700, Linus Torvalds wrote: > It also causes these warnings on 32-bit PAE: > > AS arch/x86/kernel/head_32.o > arch/x86/kernel/head_32.S: Assembler messages: > arch/x86/kernel/head_32.S:225: Warning: left operand is a bignum; integer 0 assumed > arch/x86/kernel/head_32.S:609: Warning: left operand is a bignum; integer 0 assumed > > and I do not see why (the end result seems to be identical). Fix head_32.S gcc bignum warnings when CONFIG_PAE=y. arch/x86/kernel/head_32.S: Assembler messages: arch/x86/kernel/head_32.S:225: Warning: left operand is a bignum; integer 0 assumed arch/x86/kernel/head_32.S:609: Warning: left operand is a bignum; integer 0 assumed The assembler was stumbling over the 64-bit constant 0x100000000 in the KPMDS #define. Testing: a cmp(1) on head_32.o before and after shows the binary is unchanged. Signed-off-by: Joe Korty Cc: Theodore Tso Cc: Gabriel C Cc: Keith Packard Cc: "Pallipadi Venkatesh" Cc: Eric Anholt Cc: "Siddha Suresh B" Cc: bugme-daemon@bugzilla.kernel.org Cc: airlied@linux.ie Cc: "Barnes Jesse" Cc: Jeremy Fitzhardinge Cc: Andrew Morton Cc: "Rafael J. Wysocki" Cc: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b2cc73768a9d..f7357cc0162c 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -189,7 +189,7 @@ default_entry: * this stage. */ -#define KPMDS ((0x100000000-__PAGE_OFFSET) >> 30) /* Number of kernel PMDs */ +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ xorl %ebx,%ebx /* %ebx is kept at zero */ -- cgit v1.2.1 From 0b6a39f7ebcb1c82587ce35b401c513eed41ac5c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 9 Jun 2008 13:29:43 +0200 Subject: Revert "x86: fix ioapic bug again" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 6e908947b4995bc0e551a8257c586d5c3e428201. Németh Márton reported: | there is a problem in 2.6.26-rc3 which was not there in case of | 2.6.25: the CPU wakes up ~90,000 times per sec instead of ~60 per sec. | | I also "git bisected" the problem, the result is: | | 6e908947b4995bc0e551a8257c586d5c3e428201 is first bad commit | commit 6e908947b4995bc0e551a8257c586d5c3e428201 | Author: Ingo Molnar | Date: Fri Mar 21 14:32:36 2008 +0100 | | x86: fix ioapic bug again the original problem is fixed by Maciej W. Rozycki in the tip/x86/apic branch (confirmed by Márton), but those changes are too intrusive for v2.6.26 so we'll go for the less intrusive (repeated) revert now. Reported-and-bisected-by: Németh Márton Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 12 ++---------- arch/x86/kernel/nmi_32.c | 9 ++------- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index a40d54fc1fdd..4dc8600d9d20 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2130,14 +2130,10 @@ static inline void __init check_timer(void) { int apic1, pin1, apic2, pin2; int vector; - unsigned int ver; unsigned long flags; local_irq_save(flags); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - /* * get/set the timer IRQ vector: */ @@ -2150,15 +2146,11 @@ static inline void __init check_timer(void) * mode for the 8259A whenever interrupts are routed * through I/O APICs. Also IRQ0 has to be enabled in * the 8259A which implies the virtual wire has to be - * disabled in the local APIC. Finally timer interrupts - * need to be acknowledged manually in the 8259A for - * timer_interrupt() and for the i82489DX when using - * the NMI watchdog. + * disabled in the local APIC. */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); - timer_ack = !cpu_has_tsc; - timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); + timer_ack = 1; if (timer_over_8254 > 0) enable_8259A_irq(0); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 11b14bbaa61e..84160f74eeb0 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -26,7 +26,6 @@ #include #include -#include #include "mach_traps.h" @@ -82,7 +81,7 @@ int __init check_nmi_watchdog(void) prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); if (!prev_nmi_count) - goto error; + return -1; printk(KERN_INFO "Testing NMI watchdog ... "); @@ -119,7 +118,7 @@ int __init check_nmi_watchdog(void) if (!atomic_read(&nmi_active)) { kfree(prev_nmi_count); atomic_set(&nmi_active, -1); - goto error; + return -1; } printk("OK.\n"); @@ -130,10 +129,6 @@ int __init check_nmi_watchdog(void) kfree(prev_nmi_count); return 0; -error: - timer_ack = !cpu_has_tsc; - - return -1; } static int __init setup_nmi_watchdog(char *str) -- cgit v1.2.1 From e32e58a96de4ac35a03349db2ab69f263ded958f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Jun 2008 10:14:08 +0200 Subject: x86: fix lockdep warning during suspend-to-ram Andrew Morton wrote: > I've been seeing the below for a long time during suspend-to-ram on the Vaio. > > > PM: Syncing filesystems ... done. > PM: Preparing system for mem sleep > Freezing user space processes ... <4>------------[ cut here ]------------ > WARNING: at kernel/lockdep.c:2658 check_flags+0x4c/0x127() > Modules linked in: i915 drm ipw2200 sonypi ipv6 autofs4 hidp l2cap bluetooth sunrpc nf_conntrack_netbios_ns ipt_REJECT nf_conntrack_ipv4 xt_state nf_conntrack xt_tcpudp iptable_filter ip_tables x_tables acpi_cpufreq nvram ohci1394 ieee1394 ehci_hcd uhci_hcd sg joydev snd_hda_intel snd_seq_dummy sr_mod snd_seq_oss cdrom snd_seq_midi_event snd_seq snd_seq_device snd_pcm_oss snd_mixer_oss ieee80211 pcspkr ieee80211_crypt snd_pcm i2c_i801 snd_timer i2c_core ide_pci_generic piix snd soundcore snd_page_alloc button ext3 jbd ide_disk ide_core [last unloaded: ipw2200] > Pid: 3250, comm: zsh Not tainted 2.6.26-rc5 #1 > [] warn_on_slowpath+0x41/0x6d > [] ? native_sched_clock+0x82/0x96 > [] ? mark_held_locks+0x41/0x5c > [] ? _spin_unlock_irqrestore+0x36/0x58 > [] ? trace_hardirqs_on+0xe6/0x10d > [] ? __lock_acquire+0xae3/0xb2b > [] ? schedule+0x39b/0x3b4 > [] check_flags+0x4c/0x127 > [] lock_acquire+0x3a/0x86 > [] _spin_lock+0x26/0x53 > [] ? refrigerator+0x13/0xc3 > [] refrigerator+0x13/0xc3 > [] get_signal_to_deliver+0x3c/0x31e > [] do_notify_resume+0x91/0x6ee > [] ? lock_release_holdtime+0x50/0x56 > [] ? _spin_unlock_irqrestore+0x36/0x58 > [] ? read_chan+0x0/0x58c > [] ? trace_hardirqs_on+0xe6/0x10d > [] ? _spin_unlock_irqrestore+0x42/0x58 > [] ? tty_ldisc_deref+0x5c/0x63 > [] ? tty_read+0x66/0x98 > [] ? audit_syscall_exit+0x2aa/0x2c5 > [] ? do_syscall_trace+0x6b/0x16f > [] work_notifysig+0x13/0x1b > ======================= > ---[ end trace 25b49fe59a25afa5 ]--- > possible reason: unannotated irqs-off. > irq event stamp: 58919 > hardirqs last enabled at (58919): [] syscall_exit_work+0x11/0x26 Joy - I so love entry.S Best I can make of it: syscall_exit_work resume_userspace DISABLE_INTERRUPTS (no TRACE_IRQS_OFF) work_pending work_notifysig do_notify_resume() do_signal() get_signal_to_deliver() try_to_freeze() refrigerator() task_lock() -> check_flags() -> BANG The normal path is: syscall_exit_work resume_userspace DISABLE_INTERRUPTS restore_all TRACE_IRQS_IRET iret No idea why that would not warn.. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2a609dc3271c..c778e4fa55a2 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -248,6 +248,7 @@ ENTRY(resume_userspace) DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret + TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done on # int/exception return? -- cgit v1.2.1 From 4461145ef1be92851c230f858f6b6f457c99670f Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Thu, 12 Jun 2008 08:55:59 +0200 Subject: x86, lockdep: fix "WARNING: at kernel/lockdep.c:2658 check_flags+0x4c/0x128()" Alessandro Suardi reported: > Recently upgraded my FC6 desktop to Fedora 9; with the > latest nautilus RPM updates my VNC session went nuts > with nautilus pegging the CPU for everything that breathed. > > I now reverted to an earlier nautilus package, but during > the peak CPU period my kernel spat this: > > [314185.623294] ------------[ cut here ]------------ > [314185.623414] WARNING: at kernel/lockdep.c:2658 check_flags+0x4c/0x128() > [314185.623514] Modules linked in: iptable_filter ip_tables x_tables > sunrpc ipv6 fuse snd_via82xx snd_ac97_codec ac97_bus snd_mpu401_uart > snd_rawmidi via686a hwmon parport_pc sg parport uhci_hcd ehci_hcd > [314185.623924] Pid: 12314, comm: nautilus Not tainted 2.6.26-rc5-git2 #4 > [314185.624021] [] warn_on_slowpath+0x41/0x7b > [314185.624021] [] ? do_page_fault+0x2c1/0x5fd > [314185.624021] [] ? up_read+0x16/0x28 > [314185.624021] [] ? do_page_fault+0x2c1/0x5fd > [314185.624021] [] ? __lock_acquire+0xbb4/0xbc3 > [314185.624021] [] check_flags+0x4c/0x128 > [314185.624021] [] lock_acquire+0x31/0x7d > [314185.624021] [] __atomic_notifier_call_chain+0x30/0x80 > [314185.624021] [] ? __atomic_notifier_call_chain+0x0/0x80 > [314185.624021] [] atomic_notifier_call_chain+0xc/0xe > [314185.624021] [] notify_die+0x2d/0x2f > [314185.624021] [] do_int3+0x1f/0x4d > [314185.624021] [] int3+0x27/0x2c > [314185.624021] ======================= > [314185.624021] ---[ end trace 1923f65a2d7bb246 ]--- > [314185.624021] possible reason: unannotated irqs-off. > [314185.624021] irq event stamp: 488879 > [314185.624021] hardirqs last enabled at (488879): [] > restore_nocheck+0x12/0x15 > [314185.624021] hardirqs last disabled at (488878): [] > work_resched+0x19/0x30 > [314185.624021] softirqs last enabled at (488876): [] > __do_softirq+0xa6/0xac > [314185.624021] softirqs last disabled at (488865): [] > do_softirq+0x57/0xa6 > > I didn't seem to find it with some googling, so here it is. > > I was incidentally ltracing that process to try and find out > what was gulping down that much CPU (sorry, no idea > whether ltrace and the WARNING happened at the same > time or which came first) and: Yeah, this is extremely likely to be the source of the warning. The warning should be harmless, however. > Box is my trusty noname K7-800, 512MB RAM; if there's > anything else useful I might be able to provide, just ask. It would be interesting to see where the int3 comes from. Too bad, lockdep doesn't provide the register dump. The stacktrace also doesn't go further than the int3(), I wonder if this int3 came from userspace? The ltrace readme says "software breakpoints, like gdb", so I guess this is the case. Yep, seems like it. This looks relevant: | commit fb1dac909d94ff807cd833d340c6827c3a957159 | Author: Peter Zijlstra | Date: Wed Jan 16 09:51:59 2008 +0100 | | lockdep: more hardirq annotations for notify_die() I'm attaching a similarly-looking patch for this case (DO_VM86_ERROR), though I suspect it might be missing for the other cases (DO_ERROR/DO_ERROR_INFO) as well. Reported-by: Alessandro Suardi Signed-off-by: Vegard Nossum Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index bde6f63e15d5..08d752de4eee 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -544,6 +544,7 @@ vm86_trap: #define DO_ERROR(trapnr, signr, str, name) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ + trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ -- cgit v1.2.1 From 1da2e3d679a8ea2d9e82040359a706da0bd3bef6 Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Thu, 12 Jun 2008 15:21:54 -0700 Subject: provide rtc_cmos platform device Recently (around 2.6.25) I've noticed that RTC no longer works for me. It turned out this is because I use pnpacpi=off kernel option to work around the parport_pc bugs. I always did so, but RTC used to work fine in the past, and now it have regressed. The patch fixes the problem by creating the platform device for the RTC when PNP is disabled. This may also help running the PNP-enabled kernel on an older PCs. Signed-off-by: Stas Sergeev Cc: David Brownell Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Bjorn Helgaas Cc: Adam Belay Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/rtc.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 9615eee9b775..05191bbc68b8 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include #include @@ -197,3 +199,35 @@ unsigned long long native_read_tsc(void) } EXPORT_SYMBOL(native_read_tsc); + +static struct resource rtc_resources[] = { + [0] = { + .start = RTC_PORT(0), + .end = RTC_PORT(1), + .flags = IORESOURCE_IO, + }, + [1] = { + .start = RTC_IRQ, + .end = RTC_IRQ, + .flags = IORESOURCE_IRQ, + } +}; + +static struct platform_device rtc_device = { + .name = "rtc_cmos", + .id = -1, + .resource = rtc_resources, + .num_resources = ARRAY_SIZE(rtc_resources), +}; + +static __init int add_rtc_cmos(void) +{ +#ifdef CONFIG_PNP + if (!pnp_platform_devices) + platform_device_register(&rtc_device); +#else + platform_device_register(&rtc_device); +#endif /* CONFIG_PNP */ + return 0; +} +device_initcall(add_rtc_cmos); -- cgit v1.2.1 From 75118a82e21cafb4a82b53bb85d1c7689787e046 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 13 Jun 2008 15:47:12 -0700 Subject: x86: fix NULL pointer deref in __switch_to Patrick McHardy reported a crash: > > I get this oops once a day, its apparently triggered by something > > run by cron, but the process is a different one each time. > > > > Kernel is -git from yesterday shortly before the -rc6 release > > (last commit is the usb-2.6 merge, the x86 patches are missing), > > .config is attached. > > > > I'll retry with current -git, but the patches that have gone in > > since I last updated don't look related. > > > > [62060.043009] BUG: unable to handle kernel NULL pointer dereference at > > 000001ff > > [62060.043009] IP: [] __switch_to+0x2f/0x118 > > [62060.043009] *pde = 00000000 > > [62060.043009] Oops: 0002 [#1] PREEMPT Vegard Nossum analyzed it: > This decodes to > > 0: 0f ae 00 fxsave (%eax) > > so it's related to the floating-point context. This is the exact > location of the crash: > > $ addr2line -e arch/x86/kernel/process_32.o -i ab0 > include/asm/i387.h:232 > include/asm/i387.h:262 > arch/x86/kernel/process_32.c:595 > > ...so it looks like prev_task->thread.xstate->fxsave has become NULL. > Or maybe it never had any other value. Somehow (as described below) TS_USEDFPU is set but the fpu is not allocated or freed. Another possible FPU pre-emption issue with the sleazy FPU optimization which was benign before but not so anymore, with the dynamic FPU allocation patch. New task is getting exec'd and it is prempted at the below point. flush_thread() { ... /* * Forget coprocessor state.. */ clear_fpu(tsk); <----- Preemption point clear_used_math(); ... } Now when it context switches in again, as the used_math() is still set and fpu_counter can be > 5, we will do a math_state_restore() which sets the task's TS_USEDFPU. After it continues from the above preemption point it does clear_used_math() and much later free_thread_xstate(). Now, at the next context switch, it is quite possible that xstate is null, used_math() is not set and TS_USEDFPU is still set. This will trigger unlazy_fpu() causing kernel oops. Fix this by clearing tsk's fpu_counter before clearing task's fpu. Reported-by: Patrick McHardy Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6d5483356e74..e2db9ac5c61c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -333,6 +333,7 @@ void flush_thread(void) /* * Forget coprocessor state.. */ + tsk->fpu_counter = 0; clear_fpu(tsk); clear_used_math(); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ac54ff56df80..c6eb5c91e5f6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -294,6 +294,7 @@ void flush_thread(void) /* * Forget coprocessor state.. */ + tsk->fpu_counter = 0; clear_fpu(tsk); clear_used_math(); } -- cgit v1.2.1 From df17b1d990fc214f033c5588e58216ec941591e0 Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Sun, 15 Jun 2008 02:19:56 +0200 Subject: x86, 32-bit: fix boot failure on TSC-less processors Booting 2.6.26-rc6 on my 486 DX/4 fails with a "BUG: Int 6" (invalid opcode) and a kernel halt immediately after the kernel has been uncompressed. The BUG shows EIP pointing to an rdtsc instruction in native_read_tsc(), invoked from native_sched_clock(). (This error occurs so early that not even the serial console can capture it.) A bisection showed that this bug first occurs in 2.6.26-rc3-git7, via commit 9ccc906c97e34fd91dc6aaf5b69b52d824386910: >x86: distangle user disabled TSC from unstable > >tsc_enabled is set to 0 from the command line switch "notsc" and from >the mark_tsc_unstable code. Seperate those functionalities and replace >tsc_enable with tsc_disable. This makes also the native_sched_clock() >decision when to use TSC understandable. > >Preparatory patch to solve the sched_clock() issue on 32 bit. > >Signed-off-by: Thomas Gleixner The core reason for this bug is that native_sched_clock() gets called before tsc_init(). Before the commit above, tsc_32.c used a "tsc_enabled" variable which defaulted to 0 == disabled, and which only got enabled late in tsc_init(). Thus early calls to native_sched_clock() would skip the TSC and use jiffies instead. After the commit above, tsc_32.c uses a "tsc_disabled" variable which defaults to 0, meaning that the TSC is Ok to use. Early calls to native_sched_clock() now erroneously try to use the TSC on !cpu_has_tsc processors, leading to invalid opcode exceptions. My proposed fix is to initialise tsc_disabled to a "soft disabled" state distinct from the hard disabled state set up by the "notsc" kernel option. This fixes the native_sched_clock() problem. It also allows tsc_init() to be simplified: instead of setting tsc_disabled = 1 on every error return, we just set tsc_disabled = 0 once when all checks have succeeded. I've verified that this lets my 486 boot again. I've also verified that a Core2 machine still uses the TSC as clocksource after the patch. Signed-off-by: Mikael Pettersson Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_32.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 068759db63dd..65b70637ad97 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -14,7 +14,10 @@ #include "mach_timer.h" -static int tsc_disabled; +/* native_sched_clock() is called before tsc_init(), so + we must start with the TSC soft disabled to prevent + erroneous rdtsc usage on !cpu_has_tsc processors */ +static int tsc_disabled = -1; /* * On some systems the TSC frequency does not @@ -402,25 +405,20 @@ void __init tsc_init(void) { int cpu; - if (!cpu_has_tsc || tsc_disabled) { - /* Disable the TSC in case of !cpu_has_tsc */ - tsc_disabled = 1; + if (!cpu_has_tsc || tsc_disabled > 0) return; - } cpu_khz = calculate_cpu_khz(); tsc_khz = cpu_khz; if (!cpu_khz) { mark_tsc_unstable("could not calculate TSC khz"); - /* - * We need to disable the TSC completely in this case - * to prevent sched_clock() from using it. - */ - tsc_disabled = 1; return; } + /* now allow native_sched_clock() to use rdtsc */ + tsc_disabled = 0; + printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); -- cgit v1.2.1 From d3942cff620bea073fc4e3c8ed878eb1e84615ce Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Sun, 8 Jun 2008 16:16:07 +0200 Subject: x86: use BOOTMEM_EXCLUSIVE on 32-bit This patch uses the BOOTMEM_EXCLUSIVE for crashkernel reservation also for i386 and prints a error message on failure. The patch is still for 2.6.26 since it is only bug fixing. The unification of reserve_crashkernel() between i386 and x86_64 should be done for 2.6.27. Signed-off-by: Bernhard Walle Signed-off-by: Ingo Molnar Cc: --- arch/x86/kernel/setup_32.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 2c5f8b213e86..5a2f8e063887 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -532,10 +532,16 @@ static void __init reserve_crashkernel(void) (unsigned long)(crash_size >> 20), (unsigned long)(crash_base >> 20), (unsigned long)(total_mem >> 20)); + + if (reserve_bootmem(crash_base, crash_size, + BOOTMEM_EXCLUSIVE) < 0) { + printk(KERN_INFO "crashkernel reservation " + "failed - memory is in use\n"); + return; + } + crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; - reserve_bootmem(crash_base, crash_size, - BOOTMEM_DEFAULT); } else printk(KERN_INFO "crashkernel reservation failed - " "you have to specify a base address\n"); -- cgit v1.2.1 From ffe6e1da86d21d7855495b5a772c93f050258f6e Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Wed, 18 Jun 2008 11:34:38 -0600 Subject: x86, geode: add a VSA2 ID for General Software General Software writes their own VSA2 module for their version of the Geode BIOS, which returns a different ID then the standard VSA2. This was causing the framebuffer driver to break for most GSW boards. Signed-off-by: Jordan Crouse Cc: tglx@linutronix.de Cc: linux-geode@lists.infradead.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/geode_32.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index e8edd63ab000..9b08e852fd1a 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -166,6 +166,8 @@ int geode_has_vsa2(void) static int has_vsa2 = -1; if (has_vsa2 == -1) { + u16 val; + /* * The VSA has virtual registers that we can query for a * signature. @@ -173,7 +175,8 @@ int geode_has_vsa2(void) outw(VSA_VR_UNLOCK, VSA_VRC_INDEX); outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX); - has_vsa2 = (inw(VSA_VRC_DATA) == VSA_SIG); + val = inw(VSA_VRC_DATA); + has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG); } return has_vsa2; -- cgit v1.2.1 From 7af192c954017499ec163bc9dbaaee2e593d7ef2 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:29 +0200 Subject: x86: Add structs and functions for paravirt clocksource This patch adds structs for the paravirt clocksource ABI used by both xen and kvm (pvclock-abi.h). It also adds some helper functions to read system time and wall clock time from a paravirtual clocksource (pvclock.[ch]). They are based on the xen code. They are enabled using CONFIG_PARAVIRT_CLOCK. Subsequent patches of this series will put the code in use. Signed-off-by: Gerd Hoffmann Acked-by: Jeremy Fitzhardinge Signed-off-by: Avi Kivity --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/pvclock.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 arch/x86/kernel/pvclock.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..77807d4769c9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c new file mode 100644 index 000000000000..05fbe9a0325a --- /dev/null +++ b/arch/x86/kernel/pvclock.c @@ -0,0 +1,141 @@ +/* paravirtual clock -- common code used by kvm/xen + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include +#include +#include + +/* + * These are perodically updated + * xen: magic shared_info page + * kvm: gpa registered via msr + * and then copied here. + */ +struct pvclock_shadow_time { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) +{ + u64 delta = native_read_tsc() - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +/* + * Reads a consistent set of time-base values from hypervisor, + * into a shadow data area. + */ +static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, + struct pvclock_vcpu_time_info *src) +{ + do { + dst->version = src->version; + rmb(); /* fetch version before data */ + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); /* test version after fetching data */ + } while ((src->version & 1) || (dst->version != src->version)); + + return dst->version; +} + +cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) +{ + struct pvclock_shadow_time shadow; + unsigned version; + cycle_t ret, offset; + + do { + version = pvclock_get_time_values(&shadow, src); + barrier(); + offset = pvclock_get_nsec_offset(&shadow); + ret = shadow.system_timestamp + offset; + barrier(); + } while (version != src->version); + + return ret; +} + +void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, + struct pvclock_vcpu_time_info *vcpu_time, + struct timespec *ts) +{ + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = wall_clock->version; + rmb(); /* fetch version before time */ + now.tv_sec = wall_clock->sec; + now.tv_nsec = wall_clock->nsec; + rmb(); /* fetch time before checking version */ + } while ((wall_clock->version & 1) || (version != wall_clock->version)); + + delta = pvclock_clocksource_read(vcpu_time); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} -- cgit v1.2.1 From f6e16d5ad463d15f285666f588cfe49495c692d9 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Tue, 3 Jun 2008 16:17:32 +0200 Subject: x86: KVM guest: Use the paravirt clocksource structs and functions This patch updates the kvm host code to use the pvclock structs and functions, thereby making it compatible with Xen. The patch also fixes an initialization bug: on SMP systems the per-cpu has two different locations early at boot and after CPU bringup. kvmclock must take that in account when registering the physical address within the host. Signed-off-by: Gerd Hoffmann Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 89 +++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 56 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 08a30986d472..87edf1ceb1df 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -36,18 +37,9 @@ static int parse_no_kvmclock(char *arg) early_param("no-kvmclock", parse_no_kvmclock); /* The hypervisor will put information about time periodically here */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); -#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field +static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); +static struct pvclock_wall_clock wall_clock; -static inline u64 kvm_get_delta(u64 last_tsc) -{ - int cpu = smp_processor_id(); - u64 delta = native_read_tsc() - last_tsc; - return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; -} - -static struct kvm_wall_clock wall_clock; -static cycle_t kvm_clock_read(void); /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for @@ -55,64 +47,37 @@ static cycle_t kvm_clock_read(void); */ static unsigned long kvm_get_wallclock(void) { - u32 wc_sec, wc_nsec; - u64 delta; + struct pvclock_vcpu_time_info *vcpu_time; struct timespec ts; - int version, nsec; int low, high; low = (int)__pa(&wall_clock); high = ((u64)__pa(&wall_clock) >> 32); + native_write_msr(MSR_KVM_WALL_CLOCK, low, high); - delta = kvm_clock_read(); + vcpu_time = &get_cpu_var(hv_clock); + pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); + put_cpu_var(hv_clock); - native_write_msr(MSR_KVM_WALL_CLOCK, low, high); - do { - version = wall_clock.wc_version; - rmb(); - wc_sec = wall_clock.wc_sec; - wc_nsec = wall_clock.wc_nsec; - rmb(); - } while ((wall_clock.wc_version != version) || (version & 1)); - - delta = kvm_clock_read() - delta; - delta += wc_nsec; - nsec = do_div(delta, NSEC_PER_SEC); - set_normalized_timespec(&ts, wc_sec + delta, nsec); - /* - * Of all mechanisms of time adjustment I've tested, this one - * was the champion! - */ - return ts.tv_sec + 1; + return ts.tv_sec; } static int kvm_set_wallclock(unsigned long now) { - return 0; + return -1; } -/* - * This is our read_clock function. The host puts an tsc timestamp each time - * it updates a new time. Without the tsc adjustment, we can have a situation - * in which a vcpu starts to run earlier (smaller system_time), but probes - * time later (compared to another vcpu), leading to backwards time - */ static cycle_t kvm_clock_read(void) { - u64 last_tsc, now; - int cpu; + struct pvclock_vcpu_time_info *src; + cycle_t ret; - preempt_disable(); - cpu = smp_processor_id(); - - last_tsc = get_clock(cpu, tsc_timestamp); - now = get_clock(cpu, system_time); - - now += kvm_get_delta(last_tsc); - preempt_enable(); - - return now; + src = &get_cpu_var(hv_clock); + ret = pvclock_clocksource_read(src); + put_cpu_var(hv_clock); + return ret; } + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, @@ -123,13 +88,14 @@ static struct clocksource kvm_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static int kvm_register_clock(void) +static int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high; low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); - + printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", + cpu, high, low, txt); return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); } @@ -140,12 +106,20 @@ static void kvm_setup_secondary_clock(void) * Now that the first cpu already had this clocksource initialized, * we shouldn't fail. */ - WARN_ON(kvm_register_clock()); + WARN_ON(kvm_register_clock("secondary cpu clock")); /* ok, done with our trickery, call native */ setup_secondary_APIC_clock(); } #endif +#ifdef CONFIG_SMP +void __init kvm_smp_prepare_boot_cpu(void) +{ + WARN_ON(kvm_register_clock("primary cpu clock")); + native_smp_prepare_boot_cpu(); +} +#endif + /* * After the clock is registered, the host will keep writing to the * registered memory location. If the guest happens to shutdown, this memory @@ -174,13 +148,16 @@ void __init kvmclock_init(void) return; if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { - if (kvm_register_clock()) + if (kvm_register_clock("boot clock")) return; pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; #ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; +#endif +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; #endif machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC -- cgit v1.2.1 From fcb43042ef55d2f46b0efa5d7746967cef38f056 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Tue, 24 Jun 2008 16:06:23 +0800 Subject: x86: fix cpu hotplug crash Vegard Nossum reported crashes during cpu hotplug tests: http://marc.info/?l=linux-kernel&m=121413950227884&w=4 In function _cpu_up, the panic happens when calling __raw_notifier_call_chain at the second time. Kernel doesn't panic when calling it at the first time. If just say because of nr_cpu_ids, that's not right. By checking the source code, I found that function do_boot_cpu is the culprit. Consider below call chain: _cpu_up=>__cpu_up=>smp_ops.cpu_up=>native_cpu_up=>do_boot_cpu. So do_boot_cpu is called in the end. In do_boot_cpu, if boot_error==true, cpu_clear(cpu, cpu_possible_map) is executed. So later on, when _cpu_up calls __raw_notifier_call_chain at the second time to report CPU_UP_CANCELED, because this cpu is already cleared from cpu_possible_map, get_cpu_sysdev returns NULL. Many resources are related to cpu_possible_map, so it's better not to change it. Below patch against 2.6.26-rc7 fixes it by removing the bit clearing in cpu_possible_map. Signed-off-by: Zhang Yanmin Tested-by: Vegard Nossum Acked-by: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 56078d61c793..3e1cecedde42 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -996,7 +996,6 @@ do_rest: #endif cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ - cpu_clear(cpu, cpu_possible_map); cpu_clear(cpu, cpu_present_map); per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; } -- cgit v1.2.1 From 11dbc963a8f6128595d0f6ecf138dc369e144997 Mon Sep 17 00:00:00 2001 From: TAKADA Yoshihito Date: Mon, 30 Jun 2008 13:44:45 +0900 Subject: ptrace GET/SET FPXREGS broken When I update kernel 2.6.25 from 2.6.24, gdb does not work. On 2.6.25, ptrace(PTRACE_GETFPXREGS, ...) returns ENODEV. But 2.6.24 kernel's ptrace() returns EIO. It is issue of compatibility. I attached test program as pt.c and patch for fix it. #include #include #include #include #include #include #include struct user_fxsr_struct { unsigned short cwd; unsigned short swd; unsigned short twd; unsigned short fop; long fip; long fcs; long foo; long fos; long mxcsr; long reserved; long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ long padding[56]; }; int main(void) { pid_t pid; pid = fork(); switch(pid){ case -1:/* error */ break; case 0:/* child */ child(); break; default: parent(pid); break; } return 0; } int child(void) { ptrace(PTRACE_TRACEME); kill(getpid(), SIGSTOP); sleep(10); return 0; } int parent(pid_t pid) { int ret; struct user_fxsr_struct fpxregs; ret = ptrace(PTRACE_GETFPXREGS, pid, 0, &fpxregs); if(ret < 0){ printf("%d: %s.\n", errno, strerror(errno)); } kill(pid, SIGCONT); wait(pid); return 0; } /* in the kerel, at kernel/i387.c get_fpxregs() */ Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index eb9ddd8efb82..95e80e5033c3 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -162,7 +162,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, int ret; if (!cpu_has_fxsr) - return -ENODEV; + return -EIO; ret = init_fpu(target); if (ret) @@ -179,7 +179,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, int ret; if (!cpu_has_fxsr) - return -ENODEV; + return -EIO; ret = init_fpu(target); if (ret) -- cgit v1.2.1 From 216705d2720dedf630b55d641737f430ead0c228 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Jul 2008 22:48:03 +0100 Subject: x86: fix Intel Mac booting with EFI Fedora reports that mem_init()'s zap_low_mappings(), extended to SMP in 61165d7a035f6571c7576e7f51e7230157724c8d x86: fix app crashes after SMP resume causes 32-bit Intel Mac machines to reboot very early when booting with EFI. The EFI code appears to manage low mappings for itself when needed; but like many before it, confuses PSE with PAE. So it has only been mapping half the space it needed when PSE but not PAE. This remained unnoticed until we moved the SMP zap_low_mappings() before efi_enter_virtual_mode(). Presumably could have been noticed years ago if anyone ran a UP kernel on such machines? Reported-by: Peter Jones Signed-off-by: Hugh Dickins Cc: Peter Jones Cc: Glauber Costa Cc: Andrew Morton Cc: Linus Torvalds Signed-off-by: Ingo Molnar Tested-by: Peter Jones --- arch/x86/kernel/efi_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index 5d23d85624d4..4b63c8e1f13b 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -49,13 +49,13 @@ void efi_call_phys_prelog(void) local_irq_save(efi_rt_eflags); /* - * If I don't have PSE, I should just duplicate two entries in page - * directory. If I have PSE, I just need to duplicate one entry in + * If I don't have PAE, I should just duplicate two entries in page + * directory. If I have PAE, I just need to duplicate one entry in * page directory. */ cr4 = read_cr4(); - if (cr4 & X86_CR4_PSE) { + if (cr4 & X86_CR4_PAE) { efi_bak_pg_dir_pointer[0].pgd = swapper_pg_dir[pgd_index(0)].pgd; swapper_pg_dir[0].pgd = @@ -93,7 +93,7 @@ void efi_call_phys_epilog(void) cr4 = read_cr4(); - if (cr4 & X86_CR4_PSE) { + if (cr4 & X86_CR4_PAE) { swapper_pg_dir[pgd_index(0)].pgd = efi_bak_pg_dir_pointer[0].pgd; } else { -- cgit v1.2.1 From 4b4f7280d7fd1feeff134c2cf2db32fd583b6c29 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 24 Jun 2008 23:03:48 +0200 Subject: x86 ACPI: normalize segment descriptor register on resume Some Dell laptops enter resume with apparent garbage in the segment descriptor registers (almost certainly the result of a botched transition from protected to real mode.) The only way to clean that up is to enter protected mode ourselves and clean out the descriptor registers. This fixes resume on Dell XPS M1210 and Dell D620. Reference: http://bugzilla.kernel.org/show_bug.cgi?id=10927 Signed-off-by: H. Peter Anvin Cc: Andrew Morton Cc: Pavel Machek Cc: pm list Cc: Len Brown Signed-off-by: Ingo Molnar Tested-by: Kirill A. Shutemov Signed-off-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/realmode/wakeup.S | 38 +++++++++++++++++++++++++++++++++- arch/x86/kernel/acpi/realmode/wakeup.h | 5 +++++ arch/x86/kernel/acpi/sleep.c | 16 +++++++++++++- 3 files changed, 57 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index f9b77fb37e5b..3355973b12ac 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -5,6 +5,7 @@ #include #include #include +#include .code16 .section ".header", "a" @@ -24,6 +25,11 @@ pmode_gdt: .quad 0 realmode_flags: .long 0 real_magic: .long 0 trampoline_segment: .word 0 +_pad1: .byte 0 +wakeup_jmp: .byte 0xea /* ljmpw */ +wakeup_jmp_off: .word 3f +wakeup_jmp_seg: .word 0 +wakeup_gdt: .quad 0, 0, 0 signature: .long 0x51ee1111 .text @@ -34,11 +40,34 @@ _start: cli cld + /* Apparently some dimwit BIOS programmers don't know how to + program a PM to RM transition, and we might end up here with + junk in the data segment descriptor registers. The only way + to repair that is to go into PM and fix it ourselves... */ + movw $16, %cx + lgdtl %cs:wakeup_gdt + movl %cr0, %eax + orb $X86_CR0_PE, %al + movl %eax, %cr0 + jmp 1f +1: ljmpw $8, $2f +2: + movw %cx, %ds + movw %cx, %es + movw %cx, %ss + movw %cx, %fs + movw %cx, %gs + + andb $~X86_CR0_PE, %al + movl %eax, %cr0 + jmp wakeup_jmp +3: /* Set up segments */ movw %cs, %ax movw %ax, %ds movw %ax, %es movw %ax, %ss + lidtl wakeup_idt movl $wakeup_stack_end, %esp @@ -98,7 +127,14 @@ bogus_real_magic: jmp 1b .data - .balign 4 + .balign 8 + + /* This is the standard real-mode IDT */ +wakeup_idt: + .word 0xffff /* limit */ + .long 0 /* address */ + .word 0 + .globl HEAP, heap_end HEAP: .long wakeup_heap diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h index ef8166fe8020..69d38d0b2b64 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.h +++ b/arch/x86/kernel/acpi/realmode/wakeup.h @@ -24,6 +24,11 @@ struct wakeup_header { u32 realmode_flags; u32 real_magic; u16 trampoline_segment; /* segment with trampoline code, 64-bit only */ + u8 _pad1; + u8 wakeup_jmp; + u16 wakeup_jmp_off; + u16 wakeup_jmp_seg; + u64 wakeup_gdt[3]; u32 signature; /* To check we have correct structure */ } __attribute__((__packed__)); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index afc25ee9964b..36af01f029ed 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -50,6 +50,20 @@ int acpi_save_state_mem(void) header->video_mode = saved_video_mode; + header->wakeup_jmp_seg = acpi_wakeup_address >> 4; + /* GDT[0]: GDT self-pointer */ + header->wakeup_gdt[0] = + (u64)(sizeof(header->wakeup_gdt) - 1) + + ((u64)(acpi_wakeup_address + + ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) + << 16); + /* GDT[1]: real-mode-like code segment */ + header->wakeup_gdt[1] = (0x009bULL << 40) + + ((u64)acpi_wakeup_address << 16) + 0xffff; + /* GDT[2]: real-mode-like data segment */ + header->wakeup_gdt[2] = (0x0093ULL << 40) + + ((u64)acpi_wakeup_address << 16) + 0xffff; + #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); @@ -111,7 +125,7 @@ void __init acpi_reserve_bootmem(void) return; } - acpi_wakeup_address = acpi_realmode; + acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); } -- cgit v1.2.1 From 64e83b5a919a65eb35b63dd7e07c188379ff8ce6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 5 Jul 2008 00:05:30 +0200 Subject: x86 ACPI: fix resume from suspend to RAM on uniprocessor x86-64 Since the trampoline code is now used for ACPI resume from suspend to RAM, the trampoline page tables have to be fixed up during boot not only on SMP systems, but also on UP systems that use the trampoline. Reference: http://bugzilla.kernel.org/show_bug.cgi?id=10923 Reported-by: Dionisus Torimens Signed-off-by: Rafael J. Wysocki Cc: Andi Kleen Cc: Andrew Morton Cc: pm list Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 10a1955bb1d1..b817974ef942 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -128,7 +128,7 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_TRAMPOLINE addq %rbp, trampoline_level4_pgt + 0(%rip) addq %rbp, trampoline_level4_pgt + (511*8)(%rip) #endif -- cgit v1.2.1