summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel/cpu/mcheck/mce_intel.c
diff options
context:
space:
mode:
authorChen, Gong <gong.chen@linux.intel.com>2014-03-27 21:24:36 -0400
committerTony Luck <tony.luck@intel.com>2014-03-28 13:40:16 -0700
commit27f6c573e0f77f7d1cc907c1494c99a61e48b7d8 (patch)
tree4af791c41a8dc43ae48d963c50a585c6a1c77041 /arch/x86/kernel/cpu/mcheck/mce_intel.c
parentb098d6726bbfb94c06d6e1097466187afddae61f (diff)
downloadtalos-obmc-linux-27f6c573e0f77f7d1cc907c1494c99a61e48b7d8.tar.gz
talos-obmc-linux-27f6c573e0f77f7d1cc907c1494c99a61e48b7d8.zip
x86, CMCI: Add proper detection of end of CMCI storms
When CMCI storm persists for a long time(at least beyond predefined threshold. It's 30 seconds for now), we can watch CMCI storm is detected immediately after it subsides. ... Dec 10 22:04:29 kernel: CMCI storm detected: switching to poll mode Dec 10 22:04:59 kernel: CMCI storm subsided: switching to interrupt mode Dec 10 22:04:59 kernel: CMCI storm detected: switching to poll mode Dec 10 22:05:29 kernel: CMCI storm subsided: switching to interrupt mode ... The problem is that our logic that determines that the storm has ended is incorrect. We announce the end, re-enable interrupts and realize that the storm is still going on, so we switch back to polling mode. Rinse, repeat. When a storm happens we disable signaling of errors via CMCI and begin polling machine check banks instead. If we find any logged errors, then we need to set a per-cpu flag so that our per-cpu tests that check whether the storm is ongoing will see that errors are still being logged independently of whether mce_notify_irq() says that the error has been fully processed. cmci_clear() is not the right tool to disable a bank. It disables the interrupt for the bank as desired, but it also clears the bit for this bank in "mce_banks_owned" so we will skip the bank when polling (so we fail to see that the storm continues because we stop looking). New cmci_storm_disable_banks() just disables the interrupt while allowing polling to continue. Reported-by: William Dauchy <wdauchy@gmail.com> Signed-off-by: Chen, Gong <gong.chen@linux.intel.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck/mce_intel.c')
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c19
1 files changed, 18 insertions, 1 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index fb6156fee6f7..3bdb95ae8c43 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -9,6 +9,7 @@
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/sched.h>
+#include <linux/cpumask.h>
#include <asm/apic.h>
#include <asm/processor.h>
#include <asm/msr.h>
@@ -137,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
}
}
+static void cmci_storm_disable_banks(void)
+{
+ unsigned long flags, *owned;
+ int bank;
+ u64 val;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ owned = __get_cpu_var(mce_banks_owned);
+ for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ }
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
static bool cmci_storm_detect(void)
{
unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -158,7 +175,7 @@ static bool cmci_storm_detect(void)
if (cnt <= CMCI_STORM_THRESHOLD)
return false;
- cmci_clear();
+ cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
mce_timer_kick(CMCI_POLL_INTERVAL);
OpenPOWER on IntegriCloud