From 20d51a426fe9a0d0a63cc3a7488f621c8bac37e1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 12 Aug 2015 18:29:33 +0200 Subject: x86/mce: Reuse one of the u16 padding fields in 'struct mce' ... to save the error severity of the MCE and whether the reported address of the error is usable. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1439396985-12812-2-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/uapi/asm/mce.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index a0eab85ce7b8..76880ede9a35 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -15,7 +15,8 @@ struct mce { __u64 time; /* wall time_t when error was detected */ __u8 cpuvendor; /* cpu vendor as encoded in system.h */ __u8 inject_flags; /* software inject flags */ - __u16 pad; + __u8 severity; + __u8 usable_addr; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ -- cgit v1.2.1 From 648ed94038c030245a06e4be59744fd5cdc18c40 Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Wed, 12 Aug 2015 18:29:34 +0200 Subject: x86/mce: Provide a lockless memory pool to save error records printk() is not safe to use in MCE context. Add a lockless memory allocator pool to save error records in MCE context. Those records will be issued later, in a printk-safe context. The idea is inspired by the APEI/GHES driver. We're very conservative and allocate only two pages for it but since we're going to use those pages throughout the system's lifetime, we allocate them statically to avoid early boot time allocation woes. Signed-off-by: Chen, Gong [ Rewrite. 
] Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1439396985-12812-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/mcheck/Makefile | 2 +- arch/x86/kernel/cpu/mcheck/mce-genpool.c | 99 +++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce-internal.h | 12 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 8 ++- 5 files changed, 120 insertions(+), 2 deletions(-) create mode 100644 arch/x86/kernel/cpu/mcheck/mce-genpool.c (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b3a1a5d77d92..06dbb5da90c6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -955,6 +955,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS config X86_MCE bool "Machine Check / overheating reporting" + select GENERIC_ALLOCATOR default y ---help--- Machine Check support allows the processor to notify the diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index bb34b03af252..a3311c886194 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,4 +1,4 @@ -obj-y = mce.o mce-severity.o +obj-y = mce.o mce-severity.o mce-genpool.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c new file mode 100644 index 000000000000..0a850100c594 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c @@ -0,0 +1,99 @@ +/* + * MCE event pool management in MCE context + * + * Copyright (C) 2015 Intel Corp. + * Author: Chen, Gong + * + * This file is licensed under GPLv2. + */ +#include +#include +#include +#include +#include "mce-internal.h" + +/* + * printk() is not safe in MCE context. This is a lock-less memory allocator + * used to save error information organized in a lock-less list. 
+ * + * This memory pool is only to be used to save MCE records in MCE context. + * MCE events are rare, so a fixed size memory pool should be enough. Use + * 2 pages to save MCE events for now (~80 MCE records at most). + */ +#define MCE_POOLSZ (2 * PAGE_SIZE) + +static struct gen_pool *mce_evt_pool; +static LLIST_HEAD(mce_event_llist); +static char gen_pool_buf[MCE_POOLSZ]; + +void mce_gen_pool_process(void) +{ + struct llist_node *head; + struct mce_evt_llist *node; + struct mce *mce; + + head = llist_del_all(&mce_event_llist); + if (!head) + return; + + head = llist_reverse_order(head); + llist_for_each_entry(node, head, llnode) { + mce = &node->mce; + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); + } +} + +bool mce_gen_pool_empty(void) +{ + return llist_empty(&mce_event_llist); +} + +int mce_gen_pool_add(struct mce *mce) +{ + struct mce_evt_llist *node; + + if (!mce_evt_pool) + return -EINVAL; + + node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node)); + if (!node) { + pr_warn_ratelimited("MCE records pool full!\n"); + return -ENOMEM; + } + + memcpy(&node->mce, mce, sizeof(*mce)); + llist_add(&node->llnode, &mce_event_llist); + + return 0; +} + +static int mce_gen_pool_create(void) +{ + struct gen_pool *tmpp; + int ret = -ENOMEM; + + tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1); + if (!tmpp) + goto out; + + ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1); + if (ret) { + gen_pool_destroy(tmpp); + goto out; + } + + mce_evt_pool = tmpp; + +out: + return ret; +} + +int mce_gen_pool_init(void) +{ + /* Just init mce_gen_pool once. 
*/ + if (mce_evt_pool) + return 0; + + return mce_gen_pool_create(); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fe32074b865b..ea8b62264c14 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -13,6 +13,8 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +extern struct atomic_notifier_head x86_mce_decoder_chain; + #define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ @@ -24,6 +26,16 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; +struct mce_evt_llist { + struct llist_node llnode; + struct mce mce; +}; + +void mce_gen_pool_process(void); +bool mce_gen_pool_empty(void); +int mce_gen_pool_add(struct mce *mce); +int mce_gen_pool_init(void); + extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index df919ff103c3..a41c014e5cde 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -118,7 +118,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); * CPU/chipset specific EDAC code can register a notifier call here to print * MCE errors in a human-readable form. 
*/ -static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) @@ -1731,6 +1731,12 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) return; } + if (mce_gen_pool_init()) { + mca_cfg.disabled = true; + pr_emerg("Couldn't allocate MCE records pool!\n"); + return; + } + machine_check_vector = do_machine_check; __mcheck_cpu_init_generic(); -- cgit v1.2.1 From 061120aed7081b9a4393fbe07b558192f40ad911 Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Wed, 12 Aug 2015 18:29:35 +0200 Subject: x86/mce: Don't use percpu workqueues An MCE is a rare event. Therefore, there's no need to have per-CPU instances of both normal and IRQ workqueues. Make them both global. Signed-off-by: Chen, Gong [ Fold in subsequent patch from Rui/Boris/Tony for early boot logging. ] Signed-off-by: Tony Luck [ Massage commit message. ] Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439396985-12812-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a41c014e5cde..456f8d7b8fd3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -110,7 +110,8 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { */ mce_banks_t mce_banks_ce_disabled; -static DEFINE_PER_CPU(struct work_struct, mce_work); +static struct work_struct mce_work; +static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); @@ -526,11 +527,9 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { if (!mce_ring_empty()) - schedule_work(this_cpu_ptr(&mce_work)); + schedule_work(&mce_work); } -static DEFINE_PER_CPU(struct irq_work, 
mce_irq_work); - static void mce_irq_work_cb(struct irq_work *entry) { mce_notify_irq(); @@ -551,7 +550,7 @@ static void mce_report_event(struct pt_regs *regs) return; } - irq_work_queue(this_cpu_ptr(&mce_irq_work)); + irq_work_queue(&mce_irq_work); } /* @@ -1742,8 +1741,6 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_timer(); - INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work); - init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb); } /* @@ -2064,6 +2061,9 @@ int __init mcheck_init(void) mcheck_intel_therm_init(); mcheck_vendor_init_severity(); + INIT_WORK(&mce_work, mce_process_work); + init_irq_work(&mce_irq_work, mce_irq_work_cb); + return 0; } -- cgit v1.2.1 From fd4cf79fcc4b5130ced8fd8c40378d3cec2e5fa8 Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Wed, 12 Aug 2015 18:29:36 +0200 Subject: x86/mce: Remove the MCE ring for Action Optional errors Use unified genpool to save Action Optional error events and put Action Optional error handling in the same notification chain as MCE error decoding. Signed-off-by: Chen, Gong [ Fold in subsequent patch from Boris for early boot logging. ] Signed-off-by: Tony Luck [ Correct a lot. 
] Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439396985-12812-5-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 135 +++++++++++++++++---------------------- 2 files changed, 61 insertions(+), 76 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 982dfc3679ad..dfaa4de1dbb4 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -140,7 +140,7 @@ struct mce_vendor_flags { extern struct mce_vendor_flags mce_flags; extern struct mca_config mca_cfg; -extern void mce_register_decode_chain(struct notifier_block *nb); +extern void mce_register_decode_chain(struct notifier_block *nb, bool drain); extern void mce_unregister_decode_chain(struct notifier_block *nb); #include diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 456f8d7b8fd3..82603690b65c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -114,6 +114,7 @@ static struct work_struct mce_work; static struct irq_work mce_irq_work; static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); +static int mce_usable_address(struct mce *m); /* * CPU/chipset specific EDAC code can register a notifier call here to print @@ -234,11 +235,18 @@ static void drain_mcelog_buffer(void) } while (next != prev); } +static struct notifier_block mce_srao_nb; -void mce_register_decode_chain(struct notifier_block *nb) +void mce_register_decode_chain(struct notifier_block *nb, bool drain) { + /* Ensure SRAO notifier has the highest priority in the decode chain. 
*/ + if (nb != &mce_srao_nb && nb->priority == INT_MAX) + nb->priority -= 1; + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); - drain_mcelog_buffer(); + + if (drain) + drain_mcelog_buffer(); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -462,61 +470,6 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) } } -/* - * Simple lockless ring to communicate PFNs from the exception handler with the - * process context work function. This is vastly simplified because there's - * only a single reader and a single writer. - */ -#define MCE_RING_SIZE 16 /* we use one entry less */ - -struct mce_ring { - unsigned short start; - unsigned short end; - unsigned long ring[MCE_RING_SIZE]; -}; -static DEFINE_PER_CPU(struct mce_ring, mce_ring); - -/* Runs with CPU affinity in workqueue */ -static int mce_ring_empty(void) -{ - struct mce_ring *r = this_cpu_ptr(&mce_ring); - - return r->start == r->end; -} - -static int mce_ring_get(unsigned long *pfn) -{ - struct mce_ring *r; - int ret = 0; - - *pfn = 0; - get_cpu(); - r = this_cpu_ptr(&mce_ring); - if (r->start == r->end) - goto out; - *pfn = r->ring[r->start]; - r->start = (r->start + 1) % MCE_RING_SIZE; - ret = 1; -out: - put_cpu(); - return ret; -} - -/* Always runs in MCE context with preempt off */ -static int mce_ring_add(unsigned long pfn) -{ - struct mce_ring *r = this_cpu_ptr(&mce_ring); - unsigned next; - - next = (r->end + 1) % MCE_RING_SIZE; - if (next == r->start) - return -1; - r->ring[r->end] = pfn; - wmb(); - r->end = next; - return 0; -} - int mce_available(struct cpuinfo_x86 *c) { if (mca_cfg.disabled) @@ -526,7 +479,7 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { - if (!mce_ring_empty()) + if (!mce_gen_pool_empty() && keventd_up()) schedule_work(&mce_work); } @@ -553,6 +506,27 @@ static void mce_report_event(struct pt_regs *regs) irq_work_queue(&mce_irq_work); } +static int srao_decode_notifier(struct notifier_block *nb, unsigned long val, + 
void *data) +{ + struct mce *mce = (struct mce *)data; + unsigned long pfn; + + if (!mce) + return NOTIFY_DONE; + + if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) { + pfn = mce->addr >> PAGE_SHIFT; + memory_failure(pfn, MCE_VECTOR, 0); + } + + return NOTIFY_OK; +} +static struct notifier_block mce_srao_nb = { + .notifier_call = srao_decode_notifier, + .priority = INT_MAX, +}; + /* * Read ADDR and MISC registers. */ @@ -671,8 +645,11 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) { if (m.status & MCI_STATUS_ADDRV) { - mce_ring_add(m.addr >> PAGE_SHIFT); - mce_schedule_work(); + m.severity = severity; + m.usable_addr = mce_usable_address(&m); + + if (!mce_gen_pool_add(&m)) + mce_schedule_work(); } } @@ -1142,15 +1119,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_read_aux(&m, i); - /* - * Action optional error. Queue address for later processing. - * When the ring overflows we just ignore the AO error. - * RED-PEN add some logging mechanism when - * usable_address or mce_add_ring fails. - * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 - */ - if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) - mce_ring_add(m.addr >> PAGE_SHIFT); + /* assuming valid severity level != 0 */ + m.severity = severity; + m.usable_addr = mce_usable_address(&m); + mce_gen_pool_add(&m); mce_log(&m); @@ -1246,14 +1218,11 @@ int memory_failure(unsigned long pfn, int vector, int flags) /* * Action optional processing happens here (picking up * from the list of faulting pages that do_machine_check() - * placed into the "ring"). + * placed into the genpool). 
*/ static void mce_process_work(struct work_struct *dummy) { - unsigned long pfn; - - while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR, 0); + mce_gen_pool_process(); } #ifdef CONFIG_X86_MCE_INTEL @@ -2059,6 +2028,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mce_register_decode_chain(&mce_srao_nb, false); mcheck_vendor_init_severity(); INIT_WORK(&mce_work, mce_process_work); @@ -2597,5 +2567,20 @@ static int __init mcheck_debugfs_init(void) return 0; } -late_initcall(mcheck_debugfs_init); +#else +static int __init mcheck_debugfs_init(void) { return -EINVAL; } #endif + +static int __init mcheck_late_init(void) +{ + mcheck_debugfs_init(); + + /* + * Flush out everything that has been logged during early boot, now that + * everything has been initialized (workqueues, decoders, ...). + */ + mce_schedule_work(); + + return 0; +} +late_initcall(mcheck_late_init); -- cgit v1.2.1 From f29a7aff4bd60ebc3da4982f80144a4158c4c74a Mon Sep 17 00:00:00 2001 From: "Chen, Gong" Date: Wed, 12 Aug 2015 18:29:37 +0200 Subject: x86/mce: Avoid potential deadlock due to printk() in MCE context Printing in MCE context is a no-no, currently, as printk() is not NMI-safe. If some of the notifiers on the MCE chain do so, we may deadlock. In order to avoid that, delay printk() to process context where it is safe. Reported-by: Xie XiuQi Signed-off-by: Chen, Gong [ Fold in subsequent patch from Boris for early boot logging. ] Signed-off-by: Tony Luck [ Kick irq_work in mce_log() directly.
] Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439396985-12812-6-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-apei.c | 1 - arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index a1aef9533154..34c89a3e8260 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -57,7 +57,6 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) m.addr = mem_err->physical_addr; mce_log(&m); - mce_notify_irq(); } EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 82603690b65c..9568bb55bfe2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -159,7 +159,8 @@ void mce_log(struct mce *mce) /* Emit the trace record: */ trace_mce_record(mce); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (!mce_gen_pool_add(mce)) + irq_work_queue(&mce_irq_work); mce->finished = 0; wmb(); @@ -1122,7 +1123,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* assuming valid severity level != 0 */ m.severity = severity; m.usable_addr = mce_usable_address(&m); - mce_gen_pool_add(&m); mce_log(&m); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 844f56c5616d..70f567f774ed 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -246,7 +246,6 @@ static void intel_threshold_interrupt(void) return; machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); - mce_notify_irq(); } /* -- cgit v1.2.1 From eef4dfa0cb83899c782935ac5345532f47073cea Mon Sep 17 00:00:00 2001 From: Borislav 
Petkov Date: Wed, 12 Aug 2015 18:29:38 +0200 Subject: x86/mce: Kill drain_mcelog_buffer() This used to flush out MCEs logged during early boot and which were in the MCA registers from a previous system run. No need for that now, since we've moved to a genpool. Suggested-by: Tony Luck Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1439396985-12812-7-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 2 +- arch/x86/kernel/cpu/mcheck/mce.c | 44 ++-------------------------------------- 2 files changed, 3 insertions(+), 43 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index dfaa4de1dbb4..982dfc3679ad 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -140,7 +140,7 @@ struct mce_vendor_flags { extern struct mce_vendor_flags mce_flags; extern struct mca_config mca_cfg; -extern void mce_register_decode_chain(struct notifier_block *nb, bool drain); +extern void mce_register_decode_chain(struct notifier_block *nb); extern void mce_unregister_decode_chain(struct notifier_block *nb); #include diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9568bb55bfe2..32b586ee006a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -199,55 +199,15 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } -static void drain_mcelog_buffer(void) -{ - unsigned int next, i, prev = 0; - - next = ACCESS_ONCE(mcelog.next); - - do { - struct mce *m; - - /* drain what was logged during boot */ - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - unsigned retries = 1; - - m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2*retries)) - retries++; - - cpu_relax(); - - if (!m->finished && retries >= 4) { - pr_err("skipping error being logged currently!\n"); - break; - } - } - smp_rmb(); 
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); - } - - memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); -} - static struct notifier_block mce_srao_nb; -void mce_register_decode_chain(struct notifier_block *nb, bool drain) +void mce_register_decode_chain(struct notifier_block *nb) { /* Ensure SRAO notifier has the highest priority in the decode chain. */ if (nb != &mce_srao_nb && nb->priority == INT_MAX) nb->priority -= 1; atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); - - if (drain) - drain_mcelog_buffer(); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); @@ -2028,7 +1988,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); - mce_register_decode_chain(&mce_srao_nb, false); + mce_register_decode_chain(&mce_srao_nb); mcheck_vendor_init_severity(); INIT_WORK(&mce_work, mce_process_work); -- cgit v1.2.1 From 4d1d5cdc345d15e09518a2410f7fcd069465ffac Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Wed, 12 Aug 2015 18:29:39 +0200 Subject: x86/mce: Remove unused function declarations Remove unused function declarations. 
Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1439396985-12812-8-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 982dfc3679ad..38d3a1a8830f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -185,16 +185,12 @@ void cmci_clear(void); void cmci_reenable(void); void cmci_rediscover(void); void cmci_recheck(void); -void lmce_clear(void); -void lmce_enable(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } static inline void cmci_clear(void) {} static inline void cmci_reenable(void) {} static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} -static inline void lmce_clear(void) {} -static inline void lmce_enable(void) {} #endif #ifdef CONFIG_X86_MCE_AMD -- cgit v1.2.1 From 8838eb6c0bf3b6a6494a163947ab3d1700ab45d2 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Wed, 12 Aug 2015 18:29:40 +0200 Subject: x86/mce: Clear Local MCE opt-in before kexec kexec could boot a kernel that could be legacy with no knowledge of LMCE. Hence we should make sure we clear LMCE optin before kexec reboot. 
Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Aravind Gopalakrishnan Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac Link: http://lkml.kernel.org/r/1439396985-12812-9-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 4 ++++ arch/x86/kernel/cpu/mcheck/mce.c | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/mcheck/mce_intel.c | 19 ++++++++++++++++++- arch/x86/kernel/process.c | 2 ++ arch/x86/kernel/smp.c | 2 ++ 5 files changed, 56 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 38d3a1a8830f..2dbc0bf2b9f3 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -151,10 +151,12 @@ extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE int mcheck_init(void); void mcheck_cpu_init(struct cpuinfo_x86 *c); +void mcheck_cpu_clear(struct cpuinfo_x86 *c); void mcheck_vendor_init_severity(void); #else static inline int mcheck_init(void) { return 0; } static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} +static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {} static inline void mcheck_vendor_init_severity(void) {} #endif @@ -181,12 +183,14 @@ DECLARE_PER_CPU(struct device *, mce_device); #ifdef CONFIG_X86_MCE_INTEL void mce_intel_feature_init(struct cpuinfo_x86 *c); +void mce_intel_feature_clear(struct cpuinfo_x86 *c); void cmci_clear(void); void cmci_reenable(void); void cmci_rediscover(void); void cmci_recheck(void); #else static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { } +static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { } static inline void cmci_clear(void) {} static inline void cmci_reenable(void) {} static inline void cmci_rediscover(void) {} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 32b586ee006a..ee5272d77a16 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ 
b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1606,6 +1606,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) } } +static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) +{ + switch (c->x86_vendor) { + case X86_VENDOR_INTEL: + mce_intel_feature_clear(c); + break; + default: + break; + } +} + static void mce_start_timer(unsigned int cpu, struct timer_list *t) { unsigned long iv = check_interval * HZ; @@ -1672,6 +1683,25 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_timer(); } +/* + * Called for each booted CPU to clear some machine checks opt-ins + */ +void mcheck_cpu_clear(struct cpuinfo_x86 *c) +{ + if (mca_cfg.disabled) + return; + + if (!mce_available(c)) + return; + + /* + * Possibly to clear general settings generic to x86 + * __mcheck_cpu_clear_generic(c); + */ + __mcheck_cpu_clear_vendor(c); + +} + /* * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log. */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 70f567f774ed..c5c003291861 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -434,7 +434,7 @@ static void intel_init_cmci(void) cmci_recheck(); } -void intel_init_lmce(void) +static void intel_init_lmce(void) { u64 val; @@ -447,9 +447,26 @@ void intel_init_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } +static void intel_clear_lmce(void) +{ + u64 val; + + if (!lmce_supported()) + return; + + rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + val &= ~MCG_EXT_CTL_LMCE_EN; + wrmsrl(MSR_IA32_MCG_EXT_CTL, val); +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); } + +void mce_intel_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 397688beed4b..b20ef187ff41 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -29,6 +29,7 @@ 
#include #include #include +#include /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -319,6 +320,7 @@ void stop_this_cpu(void *dummy) */ set_cpu_online(smp_processor_id(), false); disable_local_APIC(); + mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); for (;;) halt(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 15aaa69bbb5e..12c8286206ce 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -30,6 +30,7 @@ #include #include #include +#include #include /* * Some notes on x86 processor bugs affecting SMP operation: @@ -243,6 +244,7 @@ static void native_stop_other_cpus(int wait) finish: local_irq_save(flags); disable_local_APIC(); + mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); local_irq_restore(flags); } -- cgit v1.2.1 From 1b48465500611a2dc5e75800c61ac352e22d41c3 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Wed, 12 Aug 2015 18:29:41 +0200 Subject: x86/mce: Reenable CMCI banks when switching back to interrupt mode Zhang Liguang reported the following issue: 1) System detects a CMCI storm on the current CPU. 2) Kernel disables the CMCI interrupt on banks owned by the current CPU and switches to poll mode 3) After the CMCI storm subsides, kernel switches back to interrupt mode 4) We expect the system to reenable the CMCI interrupt on banks owned by the current CPU mce_intel_adjust_timer |-> cmci_reenable |-> cmci_discover # owned banks are ignored here static void cmci_discover(int banks) ... for (i = 0; i < banks; i++) { ... if (test_bit(i, owned)) # owned banks are ignored here continue; So convert cmci_storm_disable_banks() to cmci_toggle_interrupt_mode() which controls whether to enable or disable CMCI interrupts with its argument. NB: We cannot clear the owned bit because the banks won't be polled, otherwise. See: 27f6c573e0f7 ("x86, CMCI: Add proper detection of end of CMCI storms") for more info. Reported-by: Zhang Liguang Signed-off-by: Xie XiuQi Signed-off-by: Borislav Petkov Cc: # v3.15+ Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: huawei.libin@huawei.com Cc: linux-edac Cc: rui.xiang@huawei.com Link: http://lkml.kernel.org/r/1439396985-12812-10-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel.c | 41 +++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index c5c003291861..1e8bb6c94f14 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -146,6 +146,27 @@ void mce_intel_hcpu_update(unsigned long cpu) per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; } +static void cmci_toggle_interrupt_mode(bool on) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + raw_spin_lock_irqsave(&cmci_discover_lock, flags); + owned = this_cpu_ptr(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + + if (on) + val |= MCI_CTL2_CMCI_EN; + else + val &= ~MCI_CTL2_CMCI_EN; + + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + unsigned long cmci_intel_adjust_timer(unsigned long interval) { if ((this_cpu_read(cmci_backoff_cnt) > 0) && @@ -175,7 +196,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval) */ if (!atomic_read(&cmci_storm_on_cpus)) { __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); - cmci_reenable(); + cmci_toggle_interrupt_mode(true); cmci_recheck(); } return CMCI_POLL_INTERVAL; @@ -186,22 +207,6 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval) } } -static void cmci_storm_disable_banks(void) -{ - unsigned long flags, *owned; - int bank; - u64 val; - - raw_spin_lock_irqsave(&cmci_discover_lock, flags); - owned = this_cpu_ptr(mce_banks_owned); - for_each_set_bit(bank, owned, MAX_NR_BANKS) { - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); - val &= 
~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - } - raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -} - static bool cmci_storm_detect(void) { unsigned int cnt = __this_cpu_read(cmci_storm_cnt); @@ -223,7 +228,7 @@ static bool cmci_storm_detect(void) if (cnt <= CMCI_STORM_THRESHOLD) return false; - cmci_storm_disable_banks(); + cmci_toggle_interrupt_mode(false); __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); r = atomic_add_return(1, &cmci_storm_on_cpus); mce_timer_kick(CMCI_STORM_INTERVAL); -- cgit v1.2.1 From 9a7783d02197f299f71b4fa2364a345c05f92b83 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 12 Aug 2015 18:29:43 +0200 Subject: x86/mce: Rename rcu_dereference_check_mce() to mce_log_get_idx_check() The "rcu_" prefix misleads for it being a proper RCU interface which is not. It basically checks whether we're preemptible or holding the chrdev_read mutex. Rename it accordingly. Signed-off-by: Borislav Petkov Acked-by: Paul E. McKenney Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1439396985-12812-12-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ee5272d77a16..b979711452a5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -52,11 +52,11 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); -#define rcu_dereference_check_mce(p) \ +#define mce_log_get_idx_check(p) \ ({ \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious rcu_dereference_check_mce() usage"); \ + "suspicious mce_log_get_idx_check() usage"); \ smp_load_acquire(&(p)); \ }) @@ -165,7 +165,7 @@ void mce_log(struct mce *mce) mce->finished = 0; wmb(); for (;;) { - entry = rcu_dereference_check_mce(mcelog.next); + entry 
= mce_log_get_idx_check(mcelog.next); for (;;) { /* @@ -1812,7 +1812,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf, goto out; } - next = rcu_dereference_check_mce(mcelog.next); + next = mce_log_get_idx_check(mcelog.next); /* Only supports full reads right now */ err = -EINVAL; -- cgit v1.2.1 From a79da38494ec23f1a7d6ee734e07e9575fd18b58 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 12 Aug 2015 18:29:44 +0200 Subject: x86/mce: Add a wrapper around mce_log() for injection Will be used by an injector module in a following patch. Additionally, add a missing module export reported by 0-DAY kernel test. Reported-by: kbuild test robot Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1439396985-12812-13-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 2 ++ arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index ea8b62264c14..547720efd923 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -79,3 +79,5 @@ static inline int apei_clear_mce(u64 record_id) return -EINVAL; } #endif + +void mce_inject_log(struct mce *m); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b979711452a5..e4e6646cac46 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -199,6 +199,14 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +void mce_inject_log(struct mce *m) +{ + mutex_lock(&mce_chrdev_read_mutex); + mce_log(m); + mutex_unlock(&mce_chrdev_read_mutex); +} +EXPORT_SYMBOL_GPL(mce_inject_log); + static struct notifier_block mce_srao_nb; void mce_register_decode_chain(struct notifier_block *nb) -- cgit v1.2.1 From 
6c36dfe949187dc2729abfad4b083758ac5c2e0e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 12 Aug 2015 18:29:45 +0200 Subject: x86/ras: Move AMD MCE injector to arch/x86/ras/ This is an x86-specific module and would benefit from being closer to the arch code. Move it there. Update copyright while at it. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1439396985-12812-14-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 2 + arch/x86/ras/Kconfig | 11 ++ arch/x86/ras/Makefile | 2 + arch/x86/ras/mce_amd_inj.c | 375 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 390 insertions(+) create mode 100644 arch/x86/ras/Kconfig create mode 100644 arch/x86/ras/Makefile create mode 100644 arch/x86/ras/mce_amd_inj.c (limited to 'arch') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 118e6debc483..0f38418719ab 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -212,6 +212,8 @@ drivers-$(CONFIG_PM) += arch/x86/power/ drivers-$(CONFIG_FB) += arch/x86/video/ +drivers-$(CONFIG_RAS) += arch/x86/ras/ + #### # boot loader support. Several targets are kept for legacy purposes diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig new file mode 100644 index 000000000000..10fea5fc821e --- /dev/null +++ b/arch/x86/ras/Kconfig @@ -0,0 +1,11 @@ +config AMD_MCE_INJ + tristate "Simple MCE injection interface for AMD processors" + depends on RAS && EDAC_DECODE_MCE && DEBUG_FS + default n + help + This is a simple debugfs interface to inject MCEs and test different + aspects of the MCE handling code. + + WARNING: Do not even assume this interface is staying stable! 
+ + diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile new file mode 100644 index 000000000000..dd2c98b84037 --- /dev/null +++ b/arch/x86/ras/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_AMD_MCE_INJ) += mce_amd_inj.o + diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c new file mode 100644 index 000000000000..17e35b5bf779 --- /dev/null +++ b/arch/x86/ras/mce_amd_inj.c @@ -0,0 +1,375 @@ +/* + * A simple MCE injection facility for testing different aspects of the RAS + * code. This driver should be built as module so that it can be loaded + * on production kernels for testing purposes. + * + * This file may be distributed under the terms of the GNU General Public + * License version 2. + * + * Copyright (c) 2010-15: Borislav Petkov + * Advanced Micro Devices Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kernel/cpu/mcheck/mce-internal.h" + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; +static struct dentry *dfs_inj; + +static u8 n_banks; + +#define MAX_FLAG_OPT_SIZE 3 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + +#define MCE_INJECT_SET(reg) \ +static int inj_##reg##_set(void *data, u64 val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + m->reg = val; \ + return 0; \ +} + +MCE_INJECT_SET(status); +MCE_INJECT_SET(misc); +MCE_INJECT_SET(addr); + +#define MCE_INJECT_GET(reg) \ +static int inj_##reg##_get(void *data, u64 *val) \ +{ \ + struct mce *m = (struct mce *)data; \ + \ + *val = m->reg; \ + return 0; \ +} + +MCE_INJECT_GET(status); +MCE_INJECT_GET(misc); +MCE_INJECT_GET(addr); + +DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); 
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); + +/* + * Caller needs to make sure this cpu doesn't disappear + * from under us, i.e.: get_cpu/put_cpu. + */ +static int toggle_hw_mce_inject(unsigned int cpu, bool enable) +{ + u32 l, h; + int err; + + err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h); + if (err) { + pr_err("%s: error reading HWCR\n", __func__); + return err; + } + + enable ? (l |= BIT(18)) : (l &= ~BIT(18)); + + err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h); + if (err) + pr_err("%s: error writing HWCR\n", __func__); + + return err; +} + +static int __set_inj(const char *buf) +{ + int i; + + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + inj_type = i; + return 0; + } + } + return -EINVAL; +} + +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); +} + +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + size_t ret; + + if (cnt > MAX_FLAG_OPT_SIZE) + cnt = MAX_FLAG_OPT_SIZE; + + ret = cnt; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += ret; + + return ret; +} + +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; + +/* + * On which CPU to inject?
+ */ +MCE_INJECT_GET(extcpu); + +static int inj_extcpu_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= nr_cpu_ids || !cpu_online(val)) { + pr_err("%s: Invalid CPU: %llu\n", __func__, val); + return -EINVAL; + } + m->extcpu = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n"); + +static void trigger_mce(void *info) +{ + asm volatile("int $18"); +} + +static void do_inject(void) +{ + u64 mcg_status = 0; + unsigned int cpu = i_mce.extcpu; + u8 b = i_mce.bank; + + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (inj_type == SW_INJ) { + mce_inject_log(&i_mce); + return; + } + + /* prep MCE global settings for the injection */ + mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; + + if (!(i_mce.status & MCI_STATUS_PCC)) + mcg_status |= MCG_STATUS_RIPV; + + get_online_cpus(); + if (!cpu_online(cpu)) + goto err; + + toggle_hw_mce_inject(cpu, true); + + wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS, + (u32)mcg_status, (u32)(mcg_status >> 32)); + + wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b), + (u32)i_mce.status, (u32)(i_mce.status >> 32)); + + wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b), + (u32)i_mce.addr, (u32)(i_mce.addr >> 32)); + + wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b), + (u32)i_mce.misc, (u32)(i_mce.misc >> 32)); + + toggle_hw_mce_inject(cpu, false); + + smp_call_function_single(cpu, trigger_mce, NULL, 0); + +err: + put_online_cpus(); + +} + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time. 
+ */ +static int inj_bank_set(void *data, u64 val) +{ + struct mce *m = (struct mce *)data; + + if (val >= n_banks) { + pr_err("Non-existent MCE bank: %llu\n", val); + return -EINVAL; + } + + m->bank = val; + do_inject(); + + return 0; +} + +MCE_INJECT_GET(bank); + +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. 
Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; + +static struct dfs_node { + char *name; + struct dentry *d; + const struct file_operations *fops; + umode_t perm; +} dfs_fls[] = { + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, +}; + +static int __init init_mce_inject(void) +{ + int i; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + n_banks = cap & MCG_BANKCNT_MASK; + + dfs_inj = debugfs_create_dir("mce-inject", NULL); + if (!dfs_inj) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { + dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, + dfs_fls[i].perm, + dfs_inj, + &i_mce, + dfs_fls[i].fops); + + if (!dfs_fls[i].d) + goto err_dfs_add; + } + + return 0; + +err_dfs_add: + while (--i >= 0) + debugfs_remove(dfs_fls[i].d); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; + + return -ENOMEM; +} + +static void __exit exit_mce_inject(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) + debugfs_remove(dfs_fls[i].d); + + memset(&dfs_fls, 0, sizeof(dfs_fls)); + + debugfs_remove(dfs_inj); + dfs_inj = NULL; +} +module_init(init_mce_inject); +module_exit(exit_mce_inject); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Borislav Petkov "); +MODULE_AUTHOR("AMD Inc."); 
+MODULE_DESCRIPTION("MCE injection facility for RAS testing"); -- cgit v1.2.1