From 3f2f0680d1161df96a0e8fea16930f1bd487a9cf Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 13 Jan 2015 15:08:51 +0100 Subject: x86/MCE/intel: Cleanup CMCI storm logic Initially, this started with yet another report about a race condition in the CMCI storm adaptive period length thing. Yes, we have to admit, it is fragile and error prone. So let's simplify it. The simpler logic is: now, after we enter storm mode, we go straight to polling with CMCI_STORM_INTERVAL, i.e. once a second. We remain in storm mode as long as we see errors being logged while polling. Theoretically, if we see an uninterrupted error stream, we will remain in storm mode indefinitely and keep polling the MSRs. However, when the storm is actually a burst of errors, once we have logged them all, we back out of it after ~5 mins of polling and no more errors logged. If we encounter an error during those 5 minutes, we reset the polling interval to 5 mins. Making machine_check_poll() return a bool and denoting whether it has seen an error or not lets us simplify a bunch of code and move the storm handling private to mce_intel.c. Some minor cleanups while at it. 
Reported-by: Calvin Owens Tested-by: Tony Luck Link: http://lkml.kernel.org/r/1417746575-23299-1-git-send-email-calvinowens@fb.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 51b26e895933..13eeea518233 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -183,11 +183,11 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); enum mcp_flags { - MCP_TIMESTAMP = (1 << 0), /* log time stamp */ - MCP_UC = (1 << 1), /* log uncorrected errors */ - MCP_DONTLOG = (1 << 2), /* only clear, don't log */ + MCP_TIMESTAMP = BIT(0), /* log time stamp */ + MCP_UC = BIT(1), /* log uncorrected errors */ + MCP_DONTLOG = BIT(2), /* only clear, don't log */ }; -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b); +bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b); int mce_notify_irq(void); void mce_notify_process(void); -- cgit v1.2.1 From bf80bbd7dcf525e41e0673fbaa8cd21d2344b460 Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 23 Mar 2015 10:42:52 -0500 Subject: x86/mce: Add an AMD severities-grading function Add a severities function that caters to AMD processors. This allows us to do some vendor-specific work within the function if necessary. Also, introduce a vendor flag bitfield for vendor-specific settings. The severities code uses this to define error scope based on the presence of the flags field. This is based off of work by Boris Petkov. Testing details: Fam10h, Model 9h (Greyhound) Fam15h: Models 0h-0fh (Orochi), 30h-3fh (Kaveri) and 60h-6fh (Carrizo), Fam16h Model 00h-0fh (Kabini) Boris: Intel SNB AMD K8 (JH-E0) Signed-off-by: Aravind Gopalakrishnan Acked-by: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. 
Peter Anvin Cc: Chen Yucong Cc: Andy Lutomirski Cc: linux-edac@vger.kernel.org Link: http://lkml.kernel.org/r/1427125373-2918-2-git-send-email-Aravind.Gopalakrishnan@amd.com [ Fixup build, clean up comments. ] Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index fd38a23e729f..b574fbf62d39 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -116,6 +116,12 @@ struct mca_config { u32 rip_msr; }; +struct mce_vendor_flags { + __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */ + __reserved_0 : 63; +}; +extern struct mce_vendor_flags mce_flags; + extern struct mca_config mca_cfg; extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); -- cgit v1.2.1 From 43eaa2a1ad70d72876cdbb2eb5450a2665e4770f Mon Sep 17 00:00:00 2001 From: Aravind Gopalakrishnan Date: Mon, 23 Mar 2015 10:42:53 -0500 Subject: x86/mce: Define mce_severity function pointer Rename mce_severity() to mce_severity_intel() and assign the mce_severity function pointer to mce_severity_amd() during init on AMD. This way, we can avoid a test to call mce_severity_amd every time we get into mce_severity(). And it's cleaner to do it this way. Signed-off-by: Aravind Gopalakrishnan Suggested-by: Tony Luck Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. 
Peter Anvin Cc: Chen Yucong Cc: Andy Lutomirski Cc: linux-edac Link: http://lkml.kernel.org/r/1427125373-2918-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b574fbf62d39..1f5a86d518db 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -134,9 +134,11 @@ extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); +void mcheck_vendor_init_severity(void); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_vendor_init_severity(void) {} #endif #ifdef CONFIG_X86_ANCIENT_MCE -- cgit v1.2.1