From ca755e0a49ff1272efff0b3bfdf3f1e0b0fc5d57 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 1 Sep 2010 16:32:20 +0200 Subject: EDAC: Fix error return We should return a negative value when we cannot get the toplevel edac sysfs class. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mc_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 8aad94d10c0c..aa93ad82ee07 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -1011,7 +1011,7 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci) */ int edac_sysfs_setup_mc_kset(void) { - int err = 0; + int err = -EINVAL; struct sysdev_class *edac_class; debugf1("%s()\n", __func__); -- cgit v1.2.1 From c9f281fd96b29367363ee232021c030d025c52a8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 18 Aug 2010 18:21:42 +0200 Subject: EDAC, MCE: Add HW_ERR prefix .. so that the user knows what she's looking at there in dmesg. Also, fix a minor cosmetic output inconsistency. 
Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 9014df6f605d..564fe7c75345 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -133,7 +133,7 @@ static void amd_decode_dc_mce(u64 mc0_status) u32 ec = mc0_status & 0xffff; u32 xec = (mc0_status >> 16) & 0xf; - pr_emerg("Data Cache Error"); + pr_emerg(HW_ERR "Data Cache Error: "); if (xec == 1 && TLB_ERROR(ec)) pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); @@ -168,7 +168,7 @@ static void amd_decode_dc_mce(u64 mc0_status) return; wrong_dc_mce: - pr_warning("Corrupted DC MCE info?\n"); + pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); } static void amd_decode_ic_mce(u64 mc1_status) @@ -176,7 +176,7 @@ static void amd_decode_ic_mce(u64 mc1_status) u32 ec = mc1_status & 0xffff; u32 xec = (mc1_status >> 16) & 0xf; - pr_emerg("Instruction Cache Error"); + pr_emerg(HW_ERR "Instruction Cache Error"); if (xec == 1 && TLB_ERROR(ec)) pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); @@ -225,7 +225,7 @@ static void amd_decode_ic_mce(u64 mc1_status) return; wrong_ic_mce: - pr_warning("Corrupted IC MCE info?\n"); + pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); } static void amd_decode_bu_mce(u64 mc2_status) @@ -233,7 +233,7 @@ static void amd_decode_bu_mce(u64 mc2_status) u32 ec = mc2_status & 0xffff; u32 xec = (mc2_status >> 16) & 0xf; - pr_emerg("Bus Unit Error"); + pr_emerg(HW_ERR "Bus Unit Error"); if (xec == 0x1) pr_cont(" in the write data buffers.\n"); @@ -267,7 +267,7 @@ static void amd_decode_bu_mce(u64 mc2_status) return; wrong_bu_mce: - pr_warning("Corrupted BU MCE info?\n"); + pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); } static void amd_decode_ls_mce(u64 mc3_status) @@ -275,7 +275,7 @@ static void amd_decode_ls_mce(u64 mc3_status) u32 ec = mc3_status & 0xffff; u32 xec = (mc3_status >> 16) & 0xf; - pr_emerg("Load Store 
Error"); + pr_emerg(HW_ERR "Load Store Error"); if (xec == 0x0) { u8 rrrr = (ec >> 4) & 0xf; @@ -288,7 +288,7 @@ static void amd_decode_ls_mce(u64 mc3_status) return; wrong_ls_mce: - pr_warning("Corrupted LS MCE info?\n"); + pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); } void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) @@ -304,7 +304,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) if (TLB_ERROR(ec) && !report_gart_errors) return; - pr_emerg("Northbridge Error, node %d", node_id); + pr_emerg(HW_ERR "Northbridge Error, node %d", node_id); /* * F10h, revD can disable ErrCpu[3:0] so check that first and also the @@ -323,7 +323,7 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) pr_cont("\n"); } - pr_emerg("%s.\n", EXT_ERR_MSG(regs->nbsl)); + pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(regs->nbsl)); if (BUS_ERROR(ec) && nb_bus_decoder) nb_bus_decoder(node_id, regs); @@ -334,26 +334,26 @@ static void amd_decode_fr_mce(u64 mc5_status) { /* we have only one error signature so match all fields at once. 
*/ if ((mc5_status & 0xffff) == 0x0f0f) - pr_emerg(" FR Error: CPU Watchdog timer expire.\n"); + pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); else - pr_warning("Corrupted FR MCE info?\n"); + pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); } static inline void amd_decode_err_code(unsigned int ec) { if (TLB_ERROR(ec)) { - pr_emerg("Transaction: %s, Cache Level %s\n", + pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n", TT_MSG(ec), LL_MSG(ec)); } else if (MEM_ERROR(ec)) { - pr_emerg("Transaction: %s, Type: %s, Cache Level: %s", + pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s", RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { - pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, " + pr_emerg(HW_ERR "Transaction type: %s(%s), %s, Cache Level: %s, " "Participating Processor: %s\n", RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); } else - pr_warning("Huh? Unknown MCE error 0x%x\n", ec); + pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); } static int amd_decode_mce(struct notifier_block *nb, unsigned long val, @@ -363,7 +363,7 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, struct err_regs regs; int node, ecc; - pr_emerg("MC%d_STATUS: ", m->bank); + pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); pr_cont("%sorrected error, other errors lost: %s, " "CPU context corrupt: %s", -- cgit v1.2.1 From 0ee8efa8f4672ce35ee370291c0f21d7b87b1e3f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 Aug 2010 12:34:19 +0200 Subject: EDAC, MCE: Remove unused function parameter Remove remains from previous functionality. 
Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac_dbg.c | 2 +- drivers/edac/edac_mce_amd.c | 7 ++----- drivers/edac/edac_mce_amd.h | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index 59cf2cf6e11e..22ef3fecf569 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ b/drivers/edac/amd64_edac_dbg.c @@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, /* Process the Mapping request */ /* TODO: Add race prevention */ - amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1); + amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info); return count; } diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 564fe7c75345..765d7fbfa2d5 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -291,13 +291,10 @@ wrong_ls_mce: pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); } -void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors) +void amd_decode_nb_mce(int node_id, struct err_regs *regs) { u32 ec = ERROR_CODE(regs->nbsl); - if (!handle_errors) - return; - /* * GART TLB error reporting is disabled by default. Bail out early. 
*/ @@ -402,7 +399,7 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, regs.nbeah = (u32)(m->addr >> 32); node = amd_get_nb_id(m->extcpu); - amd_decode_nb_mce(node, &regs, 1); + amd_decode_nb_mce(node, &regs); break; case 5: diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index df23ee065f79..8920133075e9 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -64,6 +64,6 @@ struct err_regs { void amd_report_gart_errors(bool); void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)); void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)); -void amd_decode_nb_mce(int, struct err_regs *, int); +void amd_decode_nb_mce(int, struct err_regs *); #endif /* _EDAC_MCE_AMD_H */ -- cgit v1.2.1 From 6337583d7dc0dced36ab98dd63de2389c95c22d9 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 6 Sep 2010 18:13:39 +0200 Subject: EDAC, MCE: Sanitize error codes Clean up error codes names, shorten to mnemonics, add RRRR boundary checking. Signed-off-by: Borislav Petkov --- drivers/edac/edac_mce_amd.c | 63 ++++++++++++--------------------------------- drivers/edac/edac_mce_amd.h | 5 ++-- 2 files changed, 19 insertions(+), 49 deletions(-) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 765d7fbfa2d5..d0e850eea50a 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -30,62 +30,31 @@ EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); * string representation for the different MCA reported error types, see F3x48 * or MSR0000_0411. 
*/ -const char *tt_msgs[] = { /* transaction type */ - "instruction", - "data", - "generic", - "reserved" -}; + +/* transaction type */ +const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; EXPORT_SYMBOL_GPL(tt_msgs); -const char *ll_msgs[] = { /* cache level */ - "L0", - "L1", - "L2", - "L3/generic" -}; +/* cache level */ +const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; EXPORT_SYMBOL_GPL(ll_msgs); +/* memory transaction type */ const char *rrrr_msgs[] = { - "generic", - "generic read", - "generic write", - "data read", - "data write", - "inst fetch", - "prefetch", - "evict", - "snoop", - "reserved RRRR= 9", - "reserved RRRR= 10", - "reserved RRRR= 11", - "reserved RRRR= 12", - "reserved RRRR= 13", - "reserved RRRR= 14", - "reserved RRRR= 15" + "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" }; EXPORT_SYMBOL_GPL(rrrr_msgs); -const char *pp_msgs[] = { /* participating processor */ - "local node originated (SRC)", - "local node responded to request (RES)", - "local node observed as 3rd party (OBS)", - "generic" -}; +/* participating processor */ +const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; EXPORT_SYMBOL_GPL(pp_msgs); -const char *to_msgs[] = { - "no timeout", - "timed out" -}; +/* request timeout */ +const char *to_msgs[] = { "no timeout", "timed out" }; EXPORT_SYMBOL_GPL(to_msgs); -const char *ii_msgs[] = { /* memory or i/o */ - "mem access", - "reserved", - "i/o access", - "generic" -}; +/* memory or i/o */ +const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; EXPORT_SYMBOL_GPL(ii_msgs); /* @@ -336,16 +305,16 @@ static void amd_decode_fr_mce(u64 mc5_status) pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); } -static inline void amd_decode_err_code(unsigned int ec) +static inline void amd_decode_err_code(u16 ec) { if (TLB_ERROR(ec)) { pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n", TT_MSG(ec), LL_MSG(ec)); } else if (MEM_ERROR(ec)) { - pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s", + pr_emerg(HW_ERR 
"Transaction: %s, Type: %s, Cache Level: %s\n", RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); } else if (BUS_ERROR(ec)) { - pr_emerg(HW_ERR "Transaction type: %s(%s), %s, Cache Level: %s, " + pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, " "Participating Processor: %s\n", RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec)); diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 8920133075e9..2ee499d7f898 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -20,13 +20,14 @@ #define II_MSG(x) ii_msgs[II(x)] #define LL(x) (((x) >> 0) & 0x3) #define LL_MSG(x) ll_msgs[LL(x)] -#define RRRR(x) (((x) >> 4) & 0xf) -#define RRRR_MSG(x) rrrr_msgs[RRRR(x)] #define TO(x) (((x) >> 8) & 0x1) #define TO_MSG(x) to_msgs[TO(x)] #define PP(x) (((x) >> 9) & 0x3) #define PP_MSG(x) pp_msgs[PP(x)] +#define RRRR(x) (((x) >> 4) & 0xf) +#define RRRR_MSG(x) ((RRRR(x) < 9) ? rrrr_msgs[RRRR(x)] : "Wrong R4!") + #define K8_NBSH 0x4C #define K8_NBSH_VALID_BIT BIT(31) -- cgit v1.2.1 From 7cfd4a87441f5ca3018fdd1f7ad67e8a73a05dc2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 1 Sep 2010 14:45:20 +0200 Subject: EDAC, MCE: Pass complete MCE info to decoders ... instead of the MCi_STATUS info only for improved handling of certain types of errors later. 
Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 13 ++++++-- drivers/edac/amd64_edac_dbg.c | 10 ++++-- drivers/edac/edac_mce_amd.c | 74 ++++++++++++++++++++----------------------- drivers/edac/edac_mce_amd.h | 6 ++-- 4 files changed, 56 insertions(+), 47 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index e7d5d6b5dcf6..76f7cc0ee149 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2073,11 +2073,18 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci, amd64_handle_ue(mci, info); } -void amd64_decode_bus_error(int node_id, struct err_regs *regs) +void amd64_decode_bus_error(int node_id, struct mce *m, u32 nbcfg) { struct mem_ctl_info *mci = mci_lookup[node_id]; + struct err_regs regs; - __amd64_decode_bus_error(mci, regs); + regs.nbsl = (u32) m->status; + regs.nbsh = (u32)(m->status >> 32); + regs.nbeal = (u32) m->addr; + regs.nbeah = (u32)(m->addr >> 32); + regs.nbcfg = nbcfg; + + __amd64_decode_bus_error(mci, &regs); /* * Check the UE bit of the NB status high register, if set generate some @@ -2086,7 +2093,7 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs) * * FIXME: this should go somewhere else, if at all. 
*/ - if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) + if (regs.nbsh & K8_NBSH_UC_ERR && !report_gart_errors) edac_mc_handle_ue_no_info(mci, "UE bit is set"); } diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index 22ef3fecf569..f6d5695de5b6 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ b/drivers/edac/amd64_edac_dbg.c @@ -10,11 +10,14 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, size_t count) { struct amd64_pvt *pvt = mci->pvt_info; - unsigned long long value; + u64 value; int ret = 0; + struct mce m; ret = strict_strtoull(data, 16, &value); if (ret != -EINVAL) { + struct err_regs *regs = &pvt->ctl_error_info; + debugf0("received NBEA= 0x%llx\n", value); /* place the value into the virtual error packet */ @@ -22,9 +25,12 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, value >>= 32; pvt->ctl_error_info.nbeah = (u32) value; + m.addr = value; + m.status = regs->nbsl | ((u64)regs->nbsh << 32); + /* Process the Mapping request */ /* TODO: Add race prevention */ - amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info); + amd_decode_nb_mce(pvt->mc_node_id, &m, regs->nbcfg); return count; } diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index d0e850eea50a..6cfa881888bc 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -2,7 +2,7 @@ #include "edac_mce_amd.h" static bool report_gart_errors; -static void (*nb_bus_decoder)(int node_id, struct err_regs *regs); +static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); void amd_report_gart_errors(bool v) { @@ -10,13 +10,13 @@ void amd_report_gart_errors(bool v) } EXPORT_SYMBOL_GPL(amd_report_gart_errors); -void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)) +void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)) { nb_bus_decoder = f; } EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); -void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs 
*)) +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)) { if (nb_bus_decoder) { WARN_ON(nb_bus_decoder != f); @@ -97,17 +97,17 @@ const char *ext_msgs[] = { }; EXPORT_SYMBOL_GPL(ext_msgs); -static void amd_decode_dc_mce(u64 mc0_status) +static void amd_decode_dc_mce(struct mce *m) { - u32 ec = mc0_status & 0xffff; - u32 xec = (mc0_status >> 16) & 0xf; + u32 ec = m->status & 0xffff; + u32 xec = (m->status >> 16) & 0xf; pr_emerg(HW_ERR "Data Cache Error: "); if (xec == 1 && TLB_ERROR(ec)) pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); else if (xec == 0) { - if (mc0_status & (1ULL << 40)) + if (m->status & (1ULL << 40)) pr_cont(" during Data Scrub.\n"); else if (TLB_ERROR(ec)) pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); @@ -140,10 +140,10 @@ wrong_dc_mce: pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); } -static void amd_decode_ic_mce(u64 mc1_status) +static void amd_decode_ic_mce(struct mce *m) { - u32 ec = mc1_status & 0xffff; - u32 xec = (mc1_status >> 16) & 0xf; + u32 ec = m->status & 0xffff; + u32 xec = (m->status >> 16) & 0xf; pr_emerg(HW_ERR "Instruction Cache Error"); @@ -154,7 +154,7 @@ static void amd_decode_ic_mce(u64 mc1_status) pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); else if (BUS_ERROR(ec)) { if (boot_cpu_data.x86 == 0xf && - (mc1_status & (1ULL << 58))) + (m->status & BIT(58))) pr_cont(" during system linefill.\n"); else pr_cont(" during attempted NB data read.\n"); @@ -197,10 +197,10 @@ wrong_ic_mce: pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); } -static void amd_decode_bu_mce(u64 mc2_status) +static void amd_decode_bu_mce(struct mce *m) { - u32 ec = mc2_status & 0xffff; - u32 xec = (mc2_status >> 16) & 0xf; + u32 ec = m->status & 0xffff; + u32 xec = (m->status >> 16) & 0xf; pr_emerg(HW_ERR "Bus Unit Error"); @@ -239,10 +239,10 @@ wrong_bu_mce: pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); } -static void amd_decode_ls_mce(u64 mc3_status) +static void amd_decode_ls_mce(struct mce *m) { - u32 ec = mc3_status & 0xffff; - u32 xec = 
(mc3_status >> 16) & 0xf; + u32 ec = m->status & 0xffff; + u32 xec = (m->status >> 16) & 0xf; pr_emerg(HW_ERR "Load Store Error"); @@ -260,9 +260,11 @@ wrong_ls_mce: pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); } -void amd_decode_nb_mce(int node_id, struct err_regs *regs) +void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) { - u32 ec = ERROR_CODE(regs->nbsl); + u32 ec = m->status & 0xffff; + u32 nbsh = (u32)(m->status >> 32); + u32 nbsl = (u32)m->status; /* * GART TLB error reporting is disabled by default. Bail out early. @@ -278,10 +280,10 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs) */ if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model > 7)) { - if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) - pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); + if (nbsh & K8_NBSH_ERR_CPU_VAL) + pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); } else { - u8 assoc_cpus = regs->nbsh & 0xf; + u8 assoc_cpus = nbsh & 0xf; if (assoc_cpus > 0) pr_cont(", core: %d", fls(assoc_cpus) - 1); @@ -289,17 +291,17 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs) pr_cont("\n"); } - pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(regs->nbsl)); + pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); if (BUS_ERROR(ec) && nb_bus_decoder) - nb_bus_decoder(node_id, regs); + nb_bus_decoder(node_id, m, nbcfg); } EXPORT_SYMBOL_GPL(amd_decode_nb_mce); -static void amd_decode_fr_mce(u64 mc5_status) +static void amd_decode_fr_mce(struct mce *m) { /* we have only one error signature so match all fields at once. 
*/ - if ((mc5_status & 0xffff) == 0x0f0f) + if ((m->status & 0xffff) == 0x0f0f) pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); else pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); @@ -326,7 +328,6 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; - struct err_regs regs; int node, ecc; pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); @@ -346,33 +347,28 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, switch (m->bank) { case 0: - amd_decode_dc_mce(m->status); + amd_decode_dc_mce(m); break; case 1: - amd_decode_ic_mce(m->status); + amd_decode_ic_mce(m); break; case 2: - amd_decode_bu_mce(m->status); + amd_decode_bu_mce(m); break; case 3: - amd_decode_ls_mce(m->status); + amd_decode_ls_mce(m); break; case 4: - regs.nbsl = (u32) m->status; - regs.nbsh = (u32)(m->status >> 32); - regs.nbeal = (u32) m->addr; - regs.nbeah = (u32)(m->addr >> 32); - node = amd_get_nb_id(m->extcpu); - - amd_decode_nb_mce(node, &regs); + node = amd_get_nb_id(m->extcpu); + amd_decode_nb_mce(node, m, 0); break; case 5: - amd_decode_fr_mce(m->status); + amd_decode_fr_mce(m); break; default: diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 2ee499d7f898..0fba0e76c25f 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -63,8 +63,8 @@ struct err_regs { void amd_report_gart_errors(bool); -void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)); -void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)); -void amd_decode_nb_mce(int, struct err_regs *); +void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)); +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)); +void amd_decode_nb_mce(int, struct mce *, u32); #endif /* _EDAC_MCE_AMD_H */ -- cgit v1.2.1 From 30e1f7a8122145f44f45c95366e27b6bb0b08428 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 2 Sep 2010 17:26:48 +0200 Subject: EDAC: Export edac 
sysfs class to users. Move toplevel sysfs class to the stub and make it available to non-modularized code too. Add proper refcounting of its users and move the registration functionality into the reference counting routines. Signed-off-by: Borislav Petkov --- drivers/edac/edac_device_sysfs.c | 16 +++++--- drivers/edac/edac_mc_sysfs.c | 9 +++-- drivers/edac/edac_module.c | 79 +--------------------------------------- drivers/edac/edac_module.h | 1 - drivers/edac/edac_pci_sysfs.c | 10 +++-- drivers/edac/edac_stub.c | 51 ++++++++++++++++++++++++-- include/linux/edac.h | 4 ++ 7 files changed, 75 insertions(+), 95 deletions(-) diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c index 070968178a24..2941dca91aae 100644 --- a/drivers/edac/edac_device_sysfs.c +++ b/drivers/edac/edac_device_sysfs.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "edac_core.h" #include "edac_module.h" @@ -235,7 +236,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev) debugf1("%s()\n", __func__); /* get the /sys/devices/system/edac reference */ - edac_class = edac_get_edac_class(); + edac_class = edac_get_sysfs_class(); if (edac_class == NULL) { debugf1("%s() no edac_class error\n", __func__); err = -ENODEV; @@ -255,7 +256,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev) if (!try_module_get(edac_dev->owner)) { err = -ENODEV; - goto err_out; + goto err_mod_get; } /* register */ @@ -282,6 +283,9 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev) err_kobj_reg: module_put(edac_dev->owner); +err_mod_get: + edac_put_sysfs_class(); + err_out: return err; } @@ -290,12 +294,11 @@ err_out: * edac_device_unregister_sysfs_main_kobj: * the '..../edac/' kobject */ -void edac_device_unregister_sysfs_main_kobj( - struct edac_device_ctl_info *edac_dev) +void edac_device_unregister_sysfs_main_kobj(struct edac_device_ctl_info *dev) { debugf0("%s()\n", __func__); 
debugf4("%s() name of kobject is: %s\n", - __func__, kobject_name(&edac_dev->kobj)); + __func__, kobject_name(&dev->kobj)); /* * Unregister the edac device's kobject and @@ -304,7 +307,8 @@ void edac_device_unregister_sysfs_main_kobj( * a) module_put() this module * b) 'kfree' the memory */ - kobject_put(&edac_dev->kobj); + kobject_put(&dev->kobj); + edac_put_sysfs_class(); } /* edac_dev -> instance information */ diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index aa93ad82ee07..a4135860149b 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -11,6 +11,7 @@ #include #include +#include #include #include "edac_core.h" @@ -1017,7 +1018,7 @@ int edac_sysfs_setup_mc_kset(void) debugf1("%s()\n", __func__); /* get the /sys/devices/system/edac class reference */ - edac_class = edac_get_edac_class(); + edac_class = edac_get_sysfs_class(); if (edac_class == NULL) { debugf1("%s() no edac_class error=%d\n", __func__, err); goto fail_out; @@ -1028,15 +1029,16 @@ int edac_sysfs_setup_mc_kset(void) if (!mc_kset) { err = -ENOMEM; debugf1("%s() Failed to register '.../edac/mc'\n", __func__); - goto fail_out; + goto fail_kset; } debugf1("%s() Registered '.../edac/mc' kobject\n", __func__); return 0; +fail_kset: + edac_put_sysfs_class(); - /* error unwind stack */ fail_out: return err; } @@ -1049,5 +1051,6 @@ fail_out: void edac_sysfs_teardown_mc_kset(void) { kset_unregister(mc_kset); + edac_put_sysfs_class(); } diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c index 7e1374afd967..be4b075c3098 100644 --- a/drivers/edac/edac_module.c +++ b/drivers/edac/edac_module.c @@ -26,15 +26,6 @@ EXPORT_SYMBOL_GPL(edac_debug_level); /* scope is to module level only */ struct workqueue_struct *edac_workqueue; -/* - * sysfs object: /sys/devices/system/edac - * need to export to other files in this modules - */ -static struct sysdev_class edac_class = { - .name = "edac", -}; -static int edac_class_valid; - /* * 
edac_op_state_to_string() */ @@ -54,60 +45,6 @@ char *edac_op_state_to_string(int opstate) return "UNKNOWN"; } -/* - * edac_get_edac_class() - * - * return pointer to the edac class of 'edac' - */ -struct sysdev_class *edac_get_edac_class(void) -{ - struct sysdev_class *classptr = NULL; - - if (edac_class_valid) - classptr = &edac_class; - - return classptr; -} - -/* - * edac_register_sysfs_edac_name() - * - * register the 'edac' into /sys/devices/system - * - * return: - * 0 success - * !0 error - */ -static int edac_register_sysfs_edac_name(void) -{ - int err; - - /* create the /sys/devices/system/edac directory */ - err = sysdev_class_register(&edac_class); - - if (err) { - debugf1("%s() error=%d\n", __func__, err); - return err; - } - - edac_class_valid = 1; - return 0; -} - -/* - * sysdev_class_unregister() - * - * unregister the 'edac' from /sys/devices/system - */ -static void edac_unregister_sysfs_edac_name(void) -{ - /* only if currently registered, then unregister it */ - if (edac_class_valid) - sysdev_class_unregister(&edac_class); - - edac_class_valid = 0; -} - /* * edac_workqueue_setup * initialize the edac work queue for polling operations @@ -153,22 +90,12 @@ static int __init edac_init(void) */ edac_pci_clear_parity_errors(); - /* - * perform the registration of the /sys/devices/system/edac class object - */ - if (edac_register_sysfs_edac_name()) { - edac_printk(KERN_ERR, EDAC_MC, - "Error initializing 'edac' kobject\n"); - err = -ENODEV; - goto error; - } - /* * now set up the mc_kset under the edac class object */ err = edac_sysfs_setup_mc_kset(); if (err) - goto sysfs_setup_fail; + goto error; /* Setup/Initialize the workq for this core */ err = edac_workqueue_setup(); @@ -183,9 +110,6 @@ static int __init edac_init(void) workq_fail: edac_sysfs_teardown_mc_kset(); -sysfs_setup_fail: - edac_unregister_sysfs_edac_name(); - error: return err; } @@ -201,7 +125,6 @@ static void __exit edac_exit(void) /* tear down the various subsystems */ 
edac_workqueue_teardown(); edac_sysfs_teardown_mc_kset(); - edac_unregister_sysfs_edac_name(); } /* diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h index 233d4798c3aa..17aabb7b90ec 100644 --- a/drivers/edac/edac_module.h +++ b/drivers/edac/edac_module.h @@ -42,7 +42,6 @@ extern void edac_device_unregister_sysfs_main_kobj( struct edac_device_ctl_info *edac_dev); extern int edac_device_create_sysfs(struct edac_device_ctl_info *edac_dev); extern void edac_device_remove_sysfs(struct edac_device_ctl_info *edac_dev); -extern struct sysdev_class *edac_get_edac_class(void); /* edac core workqueue: single CPU mode */ extern struct workqueue_struct *edac_workqueue; diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c index c39697df9cb4..023b01cb5175 100644 --- a/drivers/edac/edac_pci_sysfs.c +++ b/drivers/edac/edac_pci_sysfs.c @@ -7,7 +7,7 @@ * */ #include -#include +#include #include #include @@ -354,7 +354,7 @@ static int edac_pci_main_kobj_setup(void) /* First time, so create the main kobject and its * controls and atributes */ - edac_class = edac_get_edac_class(); + edac_class = edac_get_sysfs_class(); if (edac_class == NULL) { debugf1("%s() no edac_class\n", __func__); err = -ENODEV; @@ -368,7 +368,7 @@ static int edac_pci_main_kobj_setup(void) if (!try_module_get(THIS_MODULE)) { debugf1("%s() try_module_get() failed\n", __func__); err = -ENODEV; - goto decrement_count_fail; + goto mod_get_fail; } edac_pci_top_main_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); @@ -403,6 +403,9 @@ kobject_init_and_add_fail: kzalloc_fail: module_put(THIS_MODULE); +mod_get_fail: + edac_put_sysfs_class(); + decrement_count_fail: /* if are on this error exit, nothing to tear down */ atomic_dec(&edac_pci_sysfs_refcount); @@ -429,6 +432,7 @@ static void edac_pci_main_kobj_teardown(void) __func__); kobject_put(edac_pci_top_main_kobj); } + edac_put_sysfs_class(); } /* diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c index 
20b428aa155e..aab970760b75 100644 --- a/drivers/edac/edac_stub.c +++ b/drivers/edac/edac_stub.c @@ -3,10 +3,13 @@ * * Author: Dave Jiang * - * 2007 (c) MontaVista Software, Inc. This file is licensed under - * the terms of the GNU General Public License version 2. This program - * is licensed "as is" without any warranty of any kind, whether express - * or implied. + * 2007 (c) MontaVista Software, Inc. + * 2010 (c) Advanced Micro Devices Inc. + * Borislav Petkov + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. * */ #include @@ -23,6 +26,8 @@ EXPORT_SYMBOL_GPL(edac_handlers); int edac_err_assert = 0; EXPORT_SYMBOL_GPL(edac_err_assert); +static atomic_t edac_class_valid = ATOMIC_INIT(0); + /* * called to determine if there is an EDAC driver interested in * knowing an event (such as NMI) occurred @@ -44,3 +49,41 @@ void edac_atomic_assert_error(void) edac_err_assert++; } EXPORT_SYMBOL_GPL(edac_atomic_assert_error); + +/* + * sysfs object: /sys/devices/system/edac + * need to export to other files + */ +struct sysdev_class edac_class = { + .name = "edac", +}; +EXPORT_SYMBOL_GPL(edac_class); + +/* return pointer to the 'edac' node in sysfs */ +struct sysdev_class *edac_get_sysfs_class(void) +{ + int err = 0; + + if (atomic_read(&edac_class_valid)) + goto out; + + /* create the /sys/devices/system/edac directory */ + err = sysdev_class_register(&edac_class); + if (err) { + printk(KERN_ERR "Error registering toplevel EDAC sysfs dir\n"); + return NULL; + } + +out: + atomic_inc(&edac_class_valid); + return &edac_class; +} +EXPORT_SYMBOL_GPL(edac_get_sysfs_class); + +void edac_put_sysfs_class(void) +{ + /* last user unregisters it */ + if (atomic_dec_and_test(&edac_class_valid)) + sysdev_class_unregister(&edac_class); +} +EXPORT_SYMBOL_GPL(edac_put_sysfs_class); diff --git a/include/linux/edac.h b/include/linux/edac.h index 
7cf92e8a4196..36c66443bdfd 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -13,6 +13,7 @@ #define _LINUX_EDAC_H_ #include +#include #define EDAC_OPSTATE_INVAL -1 #define EDAC_OPSTATE_POLL 0 @@ -22,9 +23,12 @@ extern int edac_op_state; extern int edac_err_assert; extern atomic_t edac_handlers; +extern struct sysdev_class edac_class; extern int edac_handler_set(void); extern void edac_atomic_assert_error(void); +extern struct sysdev_class *edac_get_sysfs_class(void); +extern void edac_put_sysfs_class(void); static inline void opstate_init(void) { -- cgit v1.2.1 From 9cdeb404a1870c5022915e576dbdc3cde21af5bf Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 2 Sep 2010 18:33:24 +0200 Subject: EDAC, MCE: Rework MCE injection Add sysfs injection facilities for testing of the MCE decoding code. Remove large parts of amd64_edac_dbg.c, as a result, which did only NB MCE injection anyway and the new injection code supports that functionality already. Add an injection module so that MCE decoding code in production kernels like those in RHEL and SLES can be tested. Signed-off-by: Borislav Petkov --- drivers/edac/Kconfig | 14 ++- drivers/edac/Makefile | 2 + drivers/edac/amd64_edac.h | 2 +- drivers/edac/amd64_edac_dbg.c | 213 +++--------------------------------------- drivers/edac/edac_mce_amd.c | 4 +- drivers/edac/edac_mce_amd.h | 4 +- drivers/edac/mce_amd_inj.c | 171 +++++++++++++++++++++++++++++++++ 7 files changed, 203 insertions(+), 207 deletions(-) create mode 100644 drivers/edac/mce_amd_inj.c diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 70bb350de996..3bb3a671baa0 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -39,7 +39,7 @@ config EDAC_DEBUG there're four debug levels (x=0,1,2,3 from low to high). Usually you should select 'N'. 
- config EDAC_DECODE_MCE +config EDAC_DECODE_MCE tristate "Decode MCEs in human-readable form (only on AMD for now)" depends on CPU_SUP_AMD && X86_MCE default y @@ -51,6 +51,16 @@ config EDAC_DEBUG which occur really early upon boot, before the module infrastructure has been initialized. +config EDAC_MCE_INJ + tristate "Simple MCE injection interface over /sysfs" + depends on EDAC_DECODE_MCE + default n + help + This is a simple interface to inject MCEs over /sysfs and test + the MCE decoding code in EDAC. + + This is currently AMD-only. + config EDAC_MM_EDAC tristate "Main Memory EDAC (Error Detection And Correction) reporting" help @@ -72,7 +82,7 @@ config EDAC_AMD64 Families of Memory Controllers (K8, F10h and F11h) config EDAC_AMD64_ERROR_INJECTION - bool "Sysfs Error Injection facilities" + bool "Sysfs HW Error injection facilities" depends on EDAC_AMD64 help Recent Opterons (Family 10h and later) provide for Memory Error diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index ca6b1bb24ccc..5c38ad38f3a3 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -17,6 +17,8 @@ ifdef CONFIG_PCI edac_core-objs += edac_pci.o edac_pci_sysfs.o endif +obj-$(CONFIG_EDAC_MCE_INJ) += mce_amd_inj.o + obj-$(CONFIG_EDAC_DECODE_MCE) += edac_mce_amd.o obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 613b9381e71a..67d9ceb4b839 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -486,7 +486,7 @@ extern const char *ext_msgs[32]; extern const char *htlink_msgs[8]; #ifdef CONFIG_EDAC_DEBUG -#define NUM_DBG_ATTRS 9 +#define NUM_DBG_ATTRS 5 #else #define NUM_DBG_ATTRS 0 #endif diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c index f6d5695de5b6..e3562288f4ce 100644 --- a/drivers/edac/amd64_edac_dbg.c +++ b/drivers/edac/amd64_edac_dbg.c @@ -1,173 +1,16 @@ #include "amd64_edac.h" -/* - * accept a hex value and store it into the virtual error register file, 
field: - * nbeal and nbeah. Assume virtual error values have already been set for: NBSL, - * NBSH and NBCFG. Then proceed to map the error values to a MC, CSROW and - * CHANNEL - */ -static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data, - size_t count) -{ - struct amd64_pvt *pvt = mci->pvt_info; - u64 value; - int ret = 0; - struct mce m; - - ret = strict_strtoull(data, 16, &value); - if (ret != -EINVAL) { - struct err_regs *regs = &pvt->ctl_error_info; - - debugf0("received NBEA= 0x%llx\n", value); - - /* place the value into the virtual error packet */ - pvt->ctl_error_info.nbeal = (u32) value; - value >>= 32; - pvt->ctl_error_info.nbeah = (u32) value; - - m.addr = value; - m.status = regs->nbsl | ((u64)regs->nbsh << 32); - - /* Process the Mapping request */ - /* TODO: Add race prevention */ - amd_decode_nb_mce(pvt->mc_node_id, &m, regs->nbcfg); - - return count; - } - return ret; +#define EDAC_DCT_ATTR_SHOW(reg) \ +static ssize_t amd64_##reg##_show(struct mem_ctl_info *mci, char *data) \ +{ \ + struct amd64_pvt *pvt = mci->pvt_info; \ + return sprintf(data, "0x%016llx\n", (u64)pvt->reg); \ } -/* display back what the last NBEA (MCA NB Address (MC4_ADDR)) was written */ -static ssize_t amd64_nbea_show(struct mem_ctl_info *mci, char *data) -{ - struct amd64_pvt *pvt = mci->pvt_info; - u64 value; - - value = pvt->ctl_error_info.nbeah; - value <<= 32; - value |= pvt->ctl_error_info.nbeal; - - return sprintf(data, "%llx\n", value); -} - -/* store the NBSL (MCA NB Status Low (MC4_STATUS)) value user desires */ -static ssize_t amd64_nbsl_store(struct mem_ctl_info *mci, const char *data, - size_t count) -{ - struct amd64_pvt *pvt = mci->pvt_info; - unsigned long value; - int ret = 0; - - ret = strict_strtoul(data, 16, &value); - if (ret != -EINVAL) { - debugf0("received NBSL= 0x%lx\n", value); - - pvt->ctl_error_info.nbsl = (u32) value; - - return count; - } - return ret; -} - -/* display back what the last NBSL value written */ -static ssize_t 
amd64_nbsl_show(struct mem_ctl_info *mci, char *data) -{ - struct amd64_pvt *pvt = mci->pvt_info; - u32 value; - - value = pvt->ctl_error_info.nbsl; - - return sprintf(data, "%x\n", value); -} - -/* store the NBSH (MCA NB Status High) value user desires */ -static ssize_t amd64_nbsh_store(struct mem_ctl_info *mci, const char *data, - size_t count) -{ - struct amd64_pvt *pvt = mci->pvt_info; - unsigned long value; - int ret = 0; - - ret = strict_strtoul(data, 16, &value); - if (ret != -EINVAL) { - debugf0("received NBSH= 0x%lx\n", value); - - pvt->ctl_error_info.nbsh = (u32) value; - - return count; - } - return ret; -} - -/* display back what the last NBSH value written */ -static ssize_t amd64_nbsh_show(struct mem_ctl_info *mci, char *data) -{ - struct amd64_pvt *pvt = mci->pvt_info; - u32 value; - - value = pvt->ctl_error_info.nbsh; - - return sprintf(data, "%x\n", value); -} - -/* accept and store the NBCFG (MCA NB Configuration) value user desires */ -static ssize_t amd64_nbcfg_store(struct mem_ctl_info *mci, - const char *data, size_t count) -{ - struct amd64_pvt *pvt = mci->pvt_info; - unsigned long value; - int ret = 0; - - ret = strict_strtoul(data, 16, &value); - if (ret != -EINVAL) { - debugf0("received NBCFG= 0x%lx\n", value); - - pvt->ctl_error_info.nbcfg = (u32) value; - - return count; - } - return ret; -} - -/* various show routines for the controls of a MCI */ -static ssize_t amd64_nbcfg_show(struct mem_ctl_info *mci, char *data) -{ - struct amd64_pvt *pvt = mci->pvt_info; - - return sprintf(data, "%x\n", pvt->ctl_error_info.nbcfg); -} - - -static ssize_t amd64_dhar_show(struct mem_ctl_info *mci, char *data) -{ - struct amd64_pvt *pvt = mci->pvt_info; - - return sprintf(data, "%x\n", pvt->dhar); -} - - -static ssize_t amd64_dbam_show(struct mem_ctl_info *mci, char *data) -{ - struct amd64_pvt *pvt = mci->pvt_info; - - return sprintf(data, "%x\n", pvt->dbam0); -} - - -static ssize_t amd64_topmem_show(struct mem_ctl_info *mci, char *data) -{ - struct 
amd64_pvt *pvt = mci->pvt_info; - - return sprintf(data, "%llx\n", pvt->top_mem); -} - - -static ssize_t amd64_topmem2_show(struct mem_ctl_info *mci, char *data) -{ - struct amd64_pvt *pvt = mci->pvt_info; - - return sprintf(data, "%llx\n", pvt->top_mem2); -} +EDAC_DCT_ATTR_SHOW(dhar); +EDAC_DCT_ATTR_SHOW(dbam0); +EDAC_DCT_ATTR_SHOW(top_mem); +EDAC_DCT_ATTR_SHOW(top_mem2); static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data) { @@ -186,38 +29,6 @@ static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data) */ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = { - { - .attr = { - .name = "nbea_ctl", - .mode = (S_IRUGO | S_IWUSR) - }, - .show = amd64_nbea_show, - .store = amd64_nbea_store, - }, - { - .attr = { - .name = "nbsl_ctl", - .mode = (S_IRUGO | S_IWUSR) - }, - .show = amd64_nbsl_show, - .store = amd64_nbsl_store, - }, - { - .attr = { - .name = "nbsh_ctl", - .mode = (S_IRUGO | S_IWUSR) - }, - .show = amd64_nbsh_show, - .store = amd64_nbsh_store, - }, - { - .attr = { - .name = "nbcfg_ctl", - .mode = (S_IRUGO | S_IWUSR) - }, - .show = amd64_nbcfg_show, - .store = amd64_nbcfg_store, - }, { .attr = { .name = "dhar", @@ -231,7 +42,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = { .name = "dbam", .mode = (S_IRUGO) }, - .show = amd64_dbam_show, + .show = amd64_dbam0_show, .store = NULL, }, { @@ -239,7 +50,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = { .name = "topmem", .mode = (S_IRUGO) }, - .show = amd64_topmem_show, + .show = amd64_top_mem_show, .store = NULL, }, { @@ -247,7 +58,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = { .name = "topmem2", .mode = (S_IRUGO) }, - .show = amd64_topmem2_show, + .show = amd64_top_mem2_show, .store = NULL, }, { diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 6cfa881888bc..c75c47b0f3ea 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -324,8 +324,7 @@ static inline void amd_decode_err_code(u16 ec) pr_emerg(HW_ERR "Huh? 
Unknown MCE error 0x%x\n", ec); } -static int amd_decode_mce(struct notifier_block *nb, unsigned long val, - void *data) +int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; int node, ecc; @@ -379,6 +378,7 @@ static int amd_decode_mce(struct notifier_block *nb, unsigned long val, return NOTIFY_STOP; } +EXPORT_SYMBOL_GPL(amd_decode_mce); static struct notifier_block amd_mce_dec_nb = { .notifier_call = amd_decode_mce, diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h index 0fba0e76c25f..2712a906afdf 100644 --- a/drivers/edac/edac_mce_amd.h +++ b/drivers/edac/edac_mce_amd.h @@ -1,6 +1,8 @@ #ifndef _EDAC_MCE_AMD_H #define _EDAC_MCE_AMD_H +#include + #include #define ERROR_CODE(x) ((x) & 0xffff) @@ -61,10 +63,10 @@ struct err_regs { u32 nbeal; }; - void amd_report_gart_errors(bool); void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)); void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)); void amd_decode_nb_mce(int, struct mce *, u32); +int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data); #endif /* _EDAC_MCE_AMD_H */ diff --git a/drivers/edac/mce_amd_inj.c b/drivers/edac/mce_amd_inj.c new file mode 100644 index 000000000000..0e4f2dcf3bd6 --- /dev/null +++ b/drivers/edac/mce_amd_inj.c @@ -0,0 +1,171 @@ +/* + * A simple MCE injection facility for testing the MCE decoding code. This + * driver should be built as module so that it can be loaded on production + * kernels for testing purposes. + * + * This file may be distributed under the terms of the GNU General Public + * License version 2. + * + * Copyright (c) 2010: Borislav Petkov + * Advanced Micro Devices Inc. 
+ */ + +#include +#include +#include +#include + +#include "edac_mce_amd.h" + +struct edac_mce_attr { + struct attribute attr; + ssize_t (*show) (struct kobject *kobj, struct edac_mce_attr *attr, char *buf); + ssize_t (*store)(struct kobject *kobj, struct edac_mce_attr *attr, + const char *buf, size_t count); +}; + +#define EDAC_MCE_ATTR(_name, _mode, _show, _store) \ +static struct edac_mce_attr mce_attr_##_name = __ATTR(_name, _mode, _show, _store) + +static struct kobject *mce_kobj; + +/* + * Collect all the MCi_XXX settings + */ +static struct mce i_mce; + +#define MCE_INJECT_STORE(reg) \ +static ssize_t edac_inject_##reg##_store(struct kobject *kobj, \ + struct edac_mce_attr *attr, \ + const char *data, size_t count)\ +{ \ + int ret = 0; \ + unsigned long value; \ + \ + ret = strict_strtoul(data, 16, &value); \ + if (ret < 0) { \ + printk(KERN_ERR "Error writing MCE " #reg " field.\n"); \ + return -EINVAL; /* value is unset here: do not store it */ \ + } \ + i_mce.reg = value; \ + return count; \ +} + +MCE_INJECT_STORE(status); +MCE_INJECT_STORE(misc); +MCE_INJECT_STORE(addr); + +#define MCE_INJECT_SHOW(reg) \ +static ssize_t edac_inject_##reg##_show(struct kobject *kobj, \ + struct edac_mce_attr *attr, \ + char *buf) \ +{ \ + return sprintf(buf, "0x%016llx\n", i_mce.reg); \ +} + +MCE_INJECT_SHOW(status); +MCE_INJECT_SHOW(misc); +MCE_INJECT_SHOW(addr); + +EDAC_MCE_ATTR(status, 0644, edac_inject_status_show, edac_inject_status_store); +EDAC_MCE_ATTR(misc, 0644, edac_inject_misc_show, edac_inject_misc_store); +EDAC_MCE_ATTR(addr, 0644, edac_inject_addr_show, edac_inject_addr_store); + +/* + * This denotes into which bank we're injecting and triggers + * the injection, at the same time.
+ */ +static ssize_t edac_inject_bank_store(struct kobject *kobj, + struct edac_mce_attr *attr, + const char *data, size_t count) +{ + int ret = 0; + unsigned long value; + + ret = strict_strtoul(data, 10, &value); + if (ret < 0) { + printk(KERN_ERR "Invalid bank value!\n"); + return -EINVAL; + } + + if (value > 5) { + printk(KERN_ERR "Non-existant MCE bank: %lu\n", value); + return -EINVAL; + } + + i_mce.bank = value; + + amd_decode_mce(NULL, 0, &i_mce); + + return count; +} + +static ssize_t edac_inject_bank_show(struct kobject *kobj, + struct edac_mce_attr *attr, char *buf) +{ + return sprintf(buf, "%d\n", i_mce.bank); +} + +EDAC_MCE_ATTR(bank, 0644, edac_inject_bank_show, edac_inject_bank_store); + +static struct edac_mce_attr *sysfs_attrs[] = { &mce_attr_status, &mce_attr_misc, + &mce_attr_addr, &mce_attr_bank +}; + +static int __init edac_init_mce_inject(void) +{ + struct sysdev_class *edac_class = NULL; + int i, err = 0; + + edac_class = edac_get_sysfs_class(); + if (!edac_class) + return -EINVAL; + + mce_kobj = kobject_create_and_add("mce", &edac_class->kset.kobj); + if (!mce_kobj) { + printk(KERN_ERR "Error creating a mce kset.\n"); + err = -ENOMEM; + goto err_mce_kobj; + } + + for (i = 0; i < ARRAY_SIZE(sysfs_attrs); i++) { + err = sysfs_create_file(mce_kobj, &sysfs_attrs[i]->attr); + if (err) { + printk(KERN_ERR "Error creating %s in sysfs.\n", + sysfs_attrs[i]->attr.name); + goto err_sysfs_create; + } + } + return 0; + +err_sysfs_create: + while (--i >= 0) /* unwind only the files created before the failure */ + sysfs_remove_file(mce_kobj, &sysfs_attrs[i]->attr); + + kobject_put(mce_kobj); /* drop the creation reference so the kobject is freed */ + +err_mce_kobj: + edac_put_sysfs_class(); + + return err; +} + +static void __exit edac_exit_mce_inject(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(sysfs_attrs); i++) + sysfs_remove_file(mce_kobj, &sysfs_attrs[i]->attr); + + kobject_put(mce_kobj); /* drop the creation reference so the kobject is freed */ + + edac_put_sysfs_class(); +} + +module_init(edac_init_mce_inject); +module_exit(edac_exit_mce_inject); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Borislav Petkov "); 
+MODULE_AUTHOR("AMD Inc."); +MODULE_DESCRIPTION("MCE injection facility for testing MCE decoding"); -- cgit v1.2.1 From 47ca08a40b043815134d489e21870b53276f1a4a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 27 Sep 2010 15:30:39 +0200 Subject: EDAC, MCE: Rename files Drop "edac_" string from the filenames since they're prefixed with edac/ in their pathname anyway. Signed-off-by: Borislav Petkov --- drivers/edac/Makefile | 1 + drivers/edac/amd64_edac.h | 2 +- drivers/edac/edac_mce_amd.c | 414 -------------------------------------------- drivers/edac/edac_mce_amd.h | 72 -------- drivers/edac/mce_amd.c | 414 ++++++++++++++++++++++++++++++++++++++++++++ drivers/edac/mce_amd.h | 72 ++++++++ drivers/edac/mce_amd_inj.c | 2 +- 7 files changed, 489 insertions(+), 488 deletions(-) delete mode 100644 drivers/edac/edac_mce_amd.c delete mode 100644 drivers/edac/edac_mce_amd.h create mode 100644 drivers/edac/mce_amd.c create mode 100644 drivers/edac/mce_amd.h diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 5c38ad38f3a3..32c7bc93c525 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -19,6 +19,7 @@ endif obj-$(CONFIG_EDAC_MCE_INJ) += mce_amd_inj.o +edac_mce_amd-objs := mce_amd.o obj-$(CONFIG_EDAC_DECODE_MCE) += edac_mce_amd.o obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 67d9ceb4b839..13e1d6f25bd1 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -72,7 +72,7 @@ #include #include #include "edac_core.h" -#include "edac_mce_amd.h" +#include "mce_amd.h" #define amd64_printk(level, fmt, arg...) 
\ edac_printk(level, "amd64", fmt, ##arg) diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c deleted file mode 100644 index c75c47b0f3ea..000000000000 --- a/drivers/edac/edac_mce_amd.c +++ /dev/null @@ -1,414 +0,0 @@ -#include -#include "edac_mce_amd.h" - -static bool report_gart_errors; -static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); - -void amd_report_gart_errors(bool v) -{ - report_gart_errors = v; -} -EXPORT_SYMBOL_GPL(amd_report_gart_errors); - -void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)) -{ - nb_bus_decoder = f; -} -EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); - -void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)) -{ - if (nb_bus_decoder) { - WARN_ON(nb_bus_decoder != f); - - nb_bus_decoder = NULL; - } -} -EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); - -/* - * string representation for the different MCA reported error types, see F3x48 - * or MSR0000_0411. - */ - -/* transaction type */ -const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; -EXPORT_SYMBOL_GPL(tt_msgs); - -/* cache level */ -const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; -EXPORT_SYMBOL_GPL(ll_msgs); - -/* memory transaction type */ -const char *rrrr_msgs[] = { - "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" -}; -EXPORT_SYMBOL_GPL(rrrr_msgs); - -/* participating processor */ -const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; -EXPORT_SYMBOL_GPL(pp_msgs); - -/* request timeout */ -const char *to_msgs[] = { "no timeout", "timed out" }; -EXPORT_SYMBOL_GPL(to_msgs); - -/* memory or i/o */ -const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; -EXPORT_SYMBOL_GPL(ii_msgs); - -/* - * Map the 4 or 5 (family-specific) bits of Extended Error code to the - * string table. 
- */ -const char *ext_msgs[] = { - "K8 ECC error", /* 0_0000b */ - "CRC error on link", /* 0_0001b */ - "Sync error packets on link", /* 0_0010b */ - "Master Abort during link operation", /* 0_0011b */ - "Target Abort during link operation", /* 0_0100b */ - "Invalid GART PTE entry during table walk", /* 0_0101b */ - "Unsupported atomic RMW command received", /* 0_0110b */ - "WDT error: NB transaction timeout", /* 0_0111b */ - "ECC/ChipKill ECC error", /* 0_1000b */ - "SVM DEV Error", /* 0_1001b */ - "Link Data error", /* 0_1010b */ - "Link/L3/Probe Filter Protocol error", /* 0_1011b */ - "NB Internal Arrays Parity error", /* 0_1100b */ - "DRAM Address/Control Parity error", /* 0_1101b */ - "Link Transmission error", /* 0_1110b */ - "GART/DEV Table Walk Data error" /* 0_1111b */ - "Res 0x100 error", /* 1_0000b */ - "Res 0x101 error", /* 1_0001b */ - "Res 0x102 error", /* 1_0010b */ - "Res 0x103 error", /* 1_0011b */ - "Res 0x104 error", /* 1_0100b */ - "Res 0x105 error", /* 1_0101b */ - "Res 0x106 error", /* 1_0110b */ - "Res 0x107 error", /* 1_0111b */ - "Res 0x108 error", /* 1_1000b */ - "Res 0x109 error", /* 1_1001b */ - "Res 0x10A error", /* 1_1010b */ - "Res 0x10B error", /* 1_1011b */ - "ECC error in L3 Cache Data", /* 1_1100b */ - "L3 Cache Tag error", /* 1_1101b */ - "L3 Cache LRU Parity error", /* 1_1110b */ - "Probe Filter error" /* 1_1111b */ -}; -EXPORT_SYMBOL_GPL(ext_msgs); - -static void amd_decode_dc_mce(struct mce *m) -{ - u32 ec = m->status & 0xffff; - u32 xec = (m->status >> 16) & 0xf; - - pr_emerg(HW_ERR "Data Cache Error: "); - - if (xec == 1 && TLB_ERROR(ec)) - pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); - else if (xec == 0) { - if (m->status & (1ULL << 40)) - pr_cont(" during Data Scrub.\n"); - else if (TLB_ERROR(ec)) - pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); - else if (MEM_ERROR(ec)) { - u8 ll = ec & 0x3; - u8 tt = (ec >> 2) & 0x3; - u8 rrrr = (ec >> 4) & 0xf; - - /* see F10h BKDG (31116), Table 92. 
*/ - if (ll == 0x1) { - if (tt != 0x1) - goto wrong_dc_mce; - - pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec)); - - } else if (ll == 0x2 && rrrr == 0x3) - pr_cont(" during L1 linefill from L2.\n"); - else - goto wrong_dc_mce; - } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf) - pr_cont(" during system linefill.\n"); - else - goto wrong_dc_mce; - } else - goto wrong_dc_mce; - - return; - -wrong_dc_mce: - pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); -} - -static void amd_decode_ic_mce(struct mce *m) -{ - u32 ec = m->status & 0xffff; - u32 xec = (m->status >> 16) & 0xf; - - pr_emerg(HW_ERR "Instruction Cache Error"); - - if (xec == 1 && TLB_ERROR(ec)) - pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); - else if (xec == 0) { - if (TLB_ERROR(ec)) - pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); - else if (BUS_ERROR(ec)) { - if (boot_cpu_data.x86 == 0xf && - (m->status & BIT(58))) - pr_cont(" during system linefill.\n"); - else - pr_cont(" during attempted NB data read.\n"); - } else if (MEM_ERROR(ec)) { - u8 ll = ec & 0x3; - u8 rrrr = (ec >> 4) & 0xf; - - if (ll == 0x2) - pr_cont(" during a linefill from L2.\n"); - else if (ll == 0x1) { - - switch (rrrr) { - case 0x5: - pr_cont(": Parity error during " - "data load.\n"); - break; - - case 0x7: - pr_cont(": Copyback Parity/Victim" - " error.\n"); - break; - - case 0x8: - pr_cont(": Tag Snoop error.\n"); - break; - - default: - goto wrong_ic_mce; - break; - } - } - } else - goto wrong_ic_mce; - } else - goto wrong_ic_mce; - - return; - -wrong_ic_mce: - pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); -} - -static void amd_decode_bu_mce(struct mce *m) -{ - u32 ec = m->status & 0xffff; - u32 xec = (m->status >> 16) & 0xf; - - pr_emerg(HW_ERR "Bus Unit Error"); - - if (xec == 0x1) - pr_cont(" in the write data buffers.\n"); - else if (xec == 0x3) - pr_cont(" in the victim data buffers.\n"); - else if (xec == 0x2 && MEM_ERROR(ec)) - pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec)); - else if (xec == 0x0) { - if 
(TLB_ERROR(ec)) - pr_cont(": %s error in a Page Descriptor Cache or " - "Guest TLB.\n", TT_MSG(ec)); - else if (BUS_ERROR(ec)) - pr_cont(": %s/ECC error in data read from NB: %s.\n", - RRRR_MSG(ec), PP_MSG(ec)); - else if (MEM_ERROR(ec)) { - u8 rrrr = (ec >> 4) & 0xf; - - if (rrrr >= 0x7) - pr_cont(": %s error during data copyback.\n", - RRRR_MSG(ec)); - else if (rrrr <= 0x1) - pr_cont(": %s parity/ECC error during data " - "access from L2.\n", RRRR_MSG(ec)); - else - goto wrong_bu_mce; - } else - goto wrong_bu_mce; - } else - goto wrong_bu_mce; - - return; - -wrong_bu_mce: - pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); -} - -static void amd_decode_ls_mce(struct mce *m) -{ - u32 ec = m->status & 0xffff; - u32 xec = (m->status >> 16) & 0xf; - - pr_emerg(HW_ERR "Load Store Error"); - - if (xec == 0x0) { - u8 rrrr = (ec >> 4) & 0xf; - - if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4)) - goto wrong_ls_mce; - - pr_cont(" during %s.\n", RRRR_MSG(ec)); - } - return; - -wrong_ls_mce: - pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); -} - -void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) -{ - u32 ec = m->status & 0xffff; - u32 nbsh = (u32)(m->status >> 32); - u32 nbsl = (u32)m->status; - - /* - * GART TLB error reporting is disabled by default. Bail out early. 
- */ - if (TLB_ERROR(ec) && !report_gart_errors) - return; - - pr_emerg(HW_ERR "Northbridge Error, node %d", node_id); - - /* - * F10h, revD can disable ErrCpu[3:0] so check that first and also the - * value encoding has changed so interpret those differently - */ - if ((boot_cpu_data.x86 == 0x10) && - (boot_cpu_data.x86_model > 7)) { - if (nbsh & K8_NBSH_ERR_CPU_VAL) - pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); - } else { - u8 assoc_cpus = nbsh & 0xf; - - if (assoc_cpus > 0) - pr_cont(", core: %d", fls(assoc_cpus) - 1); - - pr_cont("\n"); - } - - pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); - - if (BUS_ERROR(ec) && nb_bus_decoder) - nb_bus_decoder(node_id, m, nbcfg); -} -EXPORT_SYMBOL_GPL(amd_decode_nb_mce); - -static void amd_decode_fr_mce(struct mce *m) -{ - /* we have only one error signature so match all fields at once. */ - if ((m->status & 0xffff) == 0x0f0f) - pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); - else - pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); -} - -static inline void amd_decode_err_code(u16 ec) -{ - if (TLB_ERROR(ec)) { - pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n", - TT_MSG(ec), LL_MSG(ec)); - } else if (MEM_ERROR(ec)) { - pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n", - RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); - } else if (BUS_ERROR(ec)) { - pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, " - "Participating Processor: %s\n", - RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), - PP_MSG(ec)); - } else - pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); -} - -int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) -{ - struct mce *m = (struct mce *)data; - int node, ecc; - - pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); - - pr_cont("%sorrected error, other errors lost: %s, " - "CPU context corrupt: %s", - ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), - ((m->status & MCI_STATUS_OVER) ? "yes" : "no"), - ((m->status & MCI_STATUS_PCC) ? 
"yes" : "no")); - - /* do the two bits[14:13] together */ - ecc = (m->status >> 45) & 0x3; - if (ecc) - pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); - - pr_cont("\n"); - - switch (m->bank) { - case 0: - amd_decode_dc_mce(m); - break; - - case 1: - amd_decode_ic_mce(m); - break; - - case 2: - amd_decode_bu_mce(m); - break; - - case 3: - amd_decode_ls_mce(m); - break; - - case 4: - node = amd_get_nb_id(m->extcpu); - amd_decode_nb_mce(node, m, 0); - break; - - case 5: - amd_decode_fr_mce(m); - break; - - default: - break; - } - - amd_decode_err_code(m->status & 0xffff); - - return NOTIFY_STOP; -} -EXPORT_SYMBOL_GPL(amd_decode_mce); - -static struct notifier_block amd_mce_dec_nb = { - .notifier_call = amd_decode_mce, -}; - -static int __init mce_amd_init(void) -{ - /* - * We can decode MCEs for K8, F10h and F11h CPUs: - */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) - return 0; - - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) - return 0; - - atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); - - return 0; -} -early_initcall(mce_amd_init); - -#ifdef MODULE -static void __exit mce_amd_exit(void) -{ - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); -} - -MODULE_DESCRIPTION("AMD MCE decoder"); -MODULE_ALIAS("edac-mce-amd"); -MODULE_LICENSE("GPL"); -module_exit(mce_amd_exit); -#endif diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/edac_mce_amd.h deleted file mode 100644 index 2712a906afdf..000000000000 --- a/drivers/edac/edac_mce_amd.h +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef _EDAC_MCE_AMD_H -#define _EDAC_MCE_AMD_H - -#include - -#include - -#define ERROR_CODE(x) ((x) & 0xffff) -#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) -#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] - -#define LOW_SYNDROME(x) (((x) >> 15) & 0xff) -#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) - -#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) -#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) -#define BUS_ERROR(x) 
(((x) & 0xF800) == 0x0800) - -#define TT(x) (((x) >> 2) & 0x3) -#define TT_MSG(x) tt_msgs[TT(x)] -#define II(x) (((x) >> 2) & 0x3) -#define II_MSG(x) ii_msgs[II(x)] -#define LL(x) (((x) >> 0) & 0x3) -#define LL_MSG(x) ll_msgs[LL(x)] -#define TO(x) (((x) >> 8) & 0x1) -#define TO_MSG(x) to_msgs[TO(x)] -#define PP(x) (((x) >> 9) & 0x3) -#define PP_MSG(x) pp_msgs[PP(x)] - -#define RRRR(x) (((x) >> 4) & 0xf) -#define RRRR_MSG(x) ((RRRR(x) < 9) ? rrrr_msgs[RRRR(x)] : "Wrong R4!") - -#define K8_NBSH 0x4C - -#define K8_NBSH_VALID_BIT BIT(31) -#define K8_NBSH_OVERFLOW BIT(30) -#define K8_NBSH_UC_ERR BIT(29) -#define K8_NBSH_ERR_EN BIT(28) -#define K8_NBSH_MISCV BIT(27) -#define K8_NBSH_VALID_ERROR_ADDR BIT(26) -#define K8_NBSH_PCC BIT(25) -#define K8_NBSH_ERR_CPU_VAL BIT(24) -#define K8_NBSH_CECC BIT(14) -#define K8_NBSH_UECC BIT(13) -#define K8_NBSH_ERR_SCRUBER BIT(8) - -extern const char *tt_msgs[]; -extern const char *ll_msgs[]; -extern const char *rrrr_msgs[]; -extern const char *pp_msgs[]; -extern const char *to_msgs[]; -extern const char *ii_msgs[]; -extern const char *ext_msgs[]; - -/* - * relevant NB regs - */ -struct err_regs { - u32 nbcfg; - u32 nbsh; - u32 nbsl; - u32 nbeah; - u32 nbeal; -}; - -void amd_report_gart_errors(bool); -void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)); -void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)); -void amd_decode_nb_mce(int, struct mce *, u32); -int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data); - -#endif /* _EDAC_MCE_AMD_H */ diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c new file mode 100644 index 000000000000..5eb8042d0c6a --- /dev/null +++ b/drivers/edac/mce_amd.c @@ -0,0 +1,414 @@ +#include +#include "mce_amd.h" + +static bool report_gart_errors; +static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); + +void amd_report_gart_errors(bool v) +{ + report_gart_errors = v; +} +EXPORT_SYMBOL_GPL(amd_report_gart_errors); + +void 
amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)) +{ + nb_bus_decoder = f; +} +EXPORT_SYMBOL_GPL(amd_register_ecc_decoder); + +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)) +{ + if (nb_bus_decoder) { + WARN_ON(nb_bus_decoder != f); + + nb_bus_decoder = NULL; + } +} +EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder); + +/* + * string representation for the different MCA reported error types, see F3x48 + * or MSR0000_0411. + */ + +/* transaction type */ +const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" }; +EXPORT_SYMBOL_GPL(tt_msgs); + +/* cache level */ +const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" }; +EXPORT_SYMBOL_GPL(ll_msgs); + +/* memory transaction type */ +const char *rrrr_msgs[] = { + "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP" +}; +EXPORT_SYMBOL_GPL(rrrr_msgs); + +/* participating processor */ +const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" }; +EXPORT_SYMBOL_GPL(pp_msgs); + +/* request timeout */ +const char *to_msgs[] = { "no timeout", "timed out" }; +EXPORT_SYMBOL_GPL(to_msgs); + +/* memory or i/o */ +const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; +EXPORT_SYMBOL_GPL(ii_msgs); + +/* + * Map the 4 or 5 (family-specific) bits of Extended Error code to the + * string table. 
+ */ +const char *ext_msgs[] = { + "K8 ECC error", /* 0_0000b */ + "CRC error on link", /* 0_0001b */ + "Sync error packets on link", /* 0_0010b */ + "Master Abort during link operation", /* 0_0011b */ + "Target Abort during link operation", /* 0_0100b */ + "Invalid GART PTE entry during table walk", /* 0_0101b */ + "Unsupported atomic RMW command received", /* 0_0110b */ + "WDT error: NB transaction timeout", /* 0_0111b */ + "ECC/ChipKill ECC error", /* 0_1000b */ + "SVM DEV Error", /* 0_1001b */ + "Link Data error", /* 0_1010b */ + "Link/L3/Probe Filter Protocol error", /* 0_1011b */ + "NB Internal Arrays Parity error", /* 0_1100b */ + "DRAM Address/Control Parity error", /* 0_1101b */ + "Link Transmission error", /* 0_1110b */ + "GART/DEV Table Walk Data error", /* 0_1111b */ + "Res 0x100 error", /* 1_0000b */ + "Res 0x101 error", /* 1_0001b */ + "Res 0x102 error", /* 1_0010b */ + "Res 0x103 error", /* 1_0011b */ + "Res 0x104 error", /* 1_0100b */ + "Res 0x105 error", /* 1_0101b */ + "Res 0x106 error", /* 1_0110b */ + "Res 0x107 error", /* 1_0111b */ + "Res 0x108 error", /* 1_1000b */ + "Res 0x109 error", /* 1_1001b */ + "Res 0x10A error", /* 1_1010b */ + "Res 0x10B error", /* 1_1011b */ + "ECC error in L3 Cache Data", /* 1_1100b */ + "L3 Cache Tag error", /* 1_1101b */ + "L3 Cache LRU Parity error", /* 1_1110b */ + "Probe Filter error" /* 1_1111b */ +}; +EXPORT_SYMBOL_GPL(ext_msgs); + +static void amd_decode_dc_mce(struct mce *m) +{ + u32 ec = m->status & 0xffff; + u32 xec = (m->status >> 16) & 0xf; + + pr_emerg(HW_ERR "Data Cache Error: "); + + if (xec == 1 && TLB_ERROR(ec)) + pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); + else if (xec == 0) { + if (m->status & (1ULL << 40)) + pr_cont(" during Data Scrub.\n"); + else if (TLB_ERROR(ec)) + pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); + else if (MEM_ERROR(ec)) { + u8 ll = ec & 0x3; + u8 tt = (ec >> 2) & 0x3; + u8 rrrr = (ec >> 4) & 0xf; + + /* see F10h BKDG (31116), Table 92. 
*/ + if (ll == 0x1) { + if (tt != 0x1) + goto wrong_dc_mce; + + pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec)); + + } else if (ll == 0x2 && rrrr == 0x3) + pr_cont(" during L1 linefill from L2.\n"); + else + goto wrong_dc_mce; + } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf) + pr_cont(" during system linefill.\n"); + else + goto wrong_dc_mce; + } else + goto wrong_dc_mce; + + return; + +wrong_dc_mce: + pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); +} + +static void amd_decode_ic_mce(struct mce *m) +{ + u32 ec = m->status & 0xffff; + u32 xec = (m->status >> 16) & 0xf; + + pr_emerg(HW_ERR "Instruction Cache Error"); + + if (xec == 1 && TLB_ERROR(ec)) + pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); + else if (xec == 0) { + if (TLB_ERROR(ec)) + pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); + else if (BUS_ERROR(ec)) { + if (boot_cpu_data.x86 == 0xf && + (m->status & BIT(58))) + pr_cont(" during system linefill.\n"); + else + pr_cont(" during attempted NB data read.\n"); + } else if (MEM_ERROR(ec)) { + u8 ll = ec & 0x3; + u8 rrrr = (ec >> 4) & 0xf; + + if (ll == 0x2) + pr_cont(" during a linefill from L2.\n"); + else if (ll == 0x1) { + + switch (rrrr) { + case 0x5: + pr_cont(": Parity error during " + "data load.\n"); + break; + + case 0x7: + pr_cont(": Copyback Parity/Victim" + " error.\n"); + break; + + case 0x8: + pr_cont(": Tag Snoop error.\n"); + break; + + default: + goto wrong_ic_mce; + break; + } + } + } else + goto wrong_ic_mce; + } else + goto wrong_ic_mce; + + return; + +wrong_ic_mce: + pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); +} + +static void amd_decode_bu_mce(struct mce *m) +{ + u32 ec = m->status & 0xffff; + u32 xec = (m->status >> 16) & 0xf; + + pr_emerg(HW_ERR "Bus Unit Error"); + + if (xec == 0x1) + pr_cont(" in the write data buffers.\n"); + else if (xec == 0x3) + pr_cont(" in the victim data buffers.\n"); + else if (xec == 0x2 && MEM_ERROR(ec)) + pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec)); + else if (xec == 0x0) { + if 
(TLB_ERROR(ec)) + pr_cont(": %s error in a Page Descriptor Cache or " + "Guest TLB.\n", TT_MSG(ec)); + else if (BUS_ERROR(ec)) + pr_cont(": %s/ECC error in data read from NB: %s.\n", + RRRR_MSG(ec), PP_MSG(ec)); + else if (MEM_ERROR(ec)) { + u8 rrrr = (ec >> 4) & 0xf; + + if (rrrr >= 0x7) + pr_cont(": %s error during data copyback.\n", + RRRR_MSG(ec)); + else if (rrrr <= 0x1) + pr_cont(": %s parity/ECC error during data " + "access from L2.\n", RRRR_MSG(ec)); + else + goto wrong_bu_mce; + } else + goto wrong_bu_mce; + } else + goto wrong_bu_mce; + + return; + +wrong_bu_mce: + pr_emerg(HW_ERR "Corrupted BU MCE info?\n"); +} + +static void amd_decode_ls_mce(struct mce *m) +{ + u32 ec = m->status & 0xffff; + u32 xec = (m->status >> 16) & 0xf; + + pr_emerg(HW_ERR "Load Store Error"); + + if (xec == 0x0) { + u8 rrrr = (ec >> 4) & 0xf; + + if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4)) + goto wrong_ls_mce; + + pr_cont(" during %s.\n", RRRR_MSG(ec)); + } + return; + +wrong_ls_mce: + pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); +} + +void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) +{ + u32 ec = m->status & 0xffff; + u32 nbsh = (u32)(m->status >> 32); + u32 nbsl = (u32)m->status; + + /* + * GART TLB error reporting is disabled by default. Bail out early. 
+ */ + if (TLB_ERROR(ec) && !report_gart_errors) + return; + + pr_emerg(HW_ERR "Northbridge Error, node %d", node_id); + + /* + * F10h, revD can disable ErrCpu[3:0] so check that first and also the + * value encoding has changed so interpret those differently + */ + if ((boot_cpu_data.x86 == 0x10) && + (boot_cpu_data.x86_model > 7)) { + if (nbsh & K8_NBSH_ERR_CPU_VAL) + pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); + } else { + u8 assoc_cpus = nbsh & 0xf; + + if (assoc_cpus > 0) + pr_cont(", core: %d", fls(assoc_cpus) - 1); + + pr_cont("\n"); + } + + pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); + + if (BUS_ERROR(ec) && nb_bus_decoder) + nb_bus_decoder(node_id, m, nbcfg); +} +EXPORT_SYMBOL_GPL(amd_decode_nb_mce); + +static void amd_decode_fr_mce(struct mce *m) +{ + /* we have only one error signature so match all fields at once. */ + if ((m->status & 0xffff) == 0x0f0f) + pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); + else + pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); +} + +static inline void amd_decode_err_code(u16 ec) +{ + if (TLB_ERROR(ec)) { + pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n", + TT_MSG(ec), LL_MSG(ec)); + } else if (MEM_ERROR(ec)) { + pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n", + RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec)); + } else if (BUS_ERROR(ec)) { + pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, " + "Participating Processor: %s\n", + RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), + PP_MSG(ec)); + } else + pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); +} + +int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) +{ + struct mce *m = (struct mce *)data; + int node, ecc; + + pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); + + pr_cont("%sorrected error, other errors lost: %s, " + "CPU context corrupt: %s", + ((m->status & MCI_STATUS_UC) ? "Unc" : "C"), + ((m->status & MCI_STATUS_OVER) ? "yes" : "no"), + ((m->status & MCI_STATUS_PCC) ? 
"yes" : "no")); + + /* do the two bits[14:13] together */ + ecc = (m->status >> 45) & 0x3; + if (ecc) + pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U")); + + pr_cont("\n"); + + switch (m->bank) { + case 0: + amd_decode_dc_mce(m); + break; + + case 1: + amd_decode_ic_mce(m); + break; + + case 2: + amd_decode_bu_mce(m); + break; + + case 3: + amd_decode_ls_mce(m); + break; + + case 4: + node = amd_get_nb_id(m->extcpu); + amd_decode_nb_mce(node, m, 0); + break; + + case 5: + amd_decode_fr_mce(m); + break; + + default: + break; + } + + amd_decode_err_code(m->status & 0xffff); + + return NOTIFY_STOP; +} +EXPORT_SYMBOL_GPL(amd_decode_mce); + +static struct notifier_block amd_mce_dec_nb = { + .notifier_call = amd_decode_mce, +}; + +static int __init mce_amd_init(void) +{ + /* + * We can decode MCEs for K8, F10h and F11h CPUs: + */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + return 0; + + atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); + + return 0; +} +early_initcall(mce_amd_init); + +#ifdef MODULE +static void __exit mce_amd_exit(void) +{ + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); +} + +MODULE_DESCRIPTION("AMD MCE decoder"); +MODULE_ALIAS("edac-mce-amd"); +MODULE_LICENSE("GPL"); +module_exit(mce_amd_exit); +#endif diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h new file mode 100644 index 000000000000..2712a906afdf --- /dev/null +++ b/drivers/edac/mce_amd.h @@ -0,0 +1,72 @@ +#ifndef _EDAC_MCE_AMD_H +#define _EDAC_MCE_AMD_H + +#include + +#include + +#define ERROR_CODE(x) ((x) & 0xffff) +#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) +#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] + +#define LOW_SYNDROME(x) (((x) >> 15) & 0xff) +#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) + +#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010) +#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100) +#define BUS_ERROR(x) (((x) & 0xF800) == 
0x0800) + +#define TT(x) (((x) >> 2) & 0x3) +#define TT_MSG(x) tt_msgs[TT(x)] +#define II(x) (((x) >> 2) & 0x3) +#define II_MSG(x) ii_msgs[II(x)] +#define LL(x) (((x) >> 0) & 0x3) +#define LL_MSG(x) ll_msgs[LL(x)] +#define TO(x) (((x) >> 8) & 0x1) +#define TO_MSG(x) to_msgs[TO(x)] +#define PP(x) (((x) >> 9) & 0x3) +#define PP_MSG(x) pp_msgs[PP(x)] + +#define RRRR(x) (((x) >> 4) & 0xf) +#define RRRR_MSG(x) ((RRRR(x) < 9) ? rrrr_msgs[RRRR(x)] : "Wrong R4!") + +#define K8_NBSH 0x4C + +#define K8_NBSH_VALID_BIT BIT(31) +#define K8_NBSH_OVERFLOW BIT(30) +#define K8_NBSH_UC_ERR BIT(29) +#define K8_NBSH_ERR_EN BIT(28) +#define K8_NBSH_MISCV BIT(27) +#define K8_NBSH_VALID_ERROR_ADDR BIT(26) +#define K8_NBSH_PCC BIT(25) +#define K8_NBSH_ERR_CPU_VAL BIT(24) +#define K8_NBSH_CECC BIT(14) +#define K8_NBSH_UECC BIT(13) +#define K8_NBSH_ERR_SCRUBER BIT(8) + +extern const char *tt_msgs[]; +extern const char *ll_msgs[]; +extern const char *rrrr_msgs[]; +extern const char *pp_msgs[]; +extern const char *to_msgs[]; +extern const char *ii_msgs[]; +extern const char *ext_msgs[]; + +/* + * relevant NB regs + */ +struct err_regs { + u32 nbcfg; + u32 nbsh; + u32 nbsl; + u32 nbeah; + u32 nbeal; +}; + +void amd_report_gart_errors(bool); +void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)); +void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)); +void amd_decode_nb_mce(int, struct mce *, u32); +int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data); + +#endif /* _EDAC_MCE_AMD_H */ diff --git a/drivers/edac/mce_amd_inj.c b/drivers/edac/mce_amd_inj.c index 0e4f2dcf3bd6..8d0688f36d4c 100644 --- a/drivers/edac/mce_amd_inj.c +++ b/drivers/edac/mce_amd_inj.c @@ -15,7 +15,7 @@ #include #include -#include "edac_mce_amd.h" +#include "mce_amd.h" struct edac_mce_attr { struct attribute attr; -- cgit v1.2.1 From 888ab8e6eb2e41179cdc8edf5d0abd1cce0f0370 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 18 Aug 2010 15:11:35 +0200 Subject: EDAC, MCE: 
Adjust DC decoders to F14h Add per-family data cache decoders. Since there is a certain overlap between the different DC MCE signatures, reuse functionality between the families as far as possible. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 158 ++++++++++++++++++++++++++++++++++++++++--------- drivers/edac/mce_amd.h | 40 +++++++++++++ 2 files changed, 171 insertions(+), 27 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 5eb8042d0c6a..33985aa61356 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -1,6 +1,10 @@ #include +#include + #include "mce_amd.h" +static struct amd_decoder_ops *fam_ops; + static bool report_gart_errors; static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); @@ -97,41 +101,116 @@ const char *ext_msgs[] = { }; EXPORT_SYMBOL_GPL(ext_msgs); -static void amd_decode_dc_mce(struct mce *m) +static bool f10h_dc_mce(u16 ec) { - u32 ec = m->status & 0xffff; - u32 xec = (m->status >> 16) & 0xf; + u8 r4 = (ec >> 4) & 0xf; + bool ret = false; - pr_emerg(HW_ERR "Data Cache Error: "); + if (r4 == R4_GEN) { + pr_cont("during data scrub.\n"); + return true; + } - if (xec == 1 && TLB_ERROR(ec)) - pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); - else if (xec == 0) { - if (m->status & (1ULL << 40)) - pr_cont(" during Data Scrub.\n"); - else if (TLB_ERROR(ec)) - pr_cont(": %s TLB parity error.\n", LL_MSG(ec)); - else if (MEM_ERROR(ec)) { - u8 ll = ec & 0x3; - u8 tt = (ec >> 2) & 0x3; - u8 rrrr = (ec >> 4) & 0xf; + if (MEM_ERROR(ec)) { + u8 ll = ec & 0x3; + ret = true; - /* see F10h BKDG (31116), Table 92. 
*/ - if (ll == 0x1) { - if (tt != 0x1) - goto wrong_dc_mce; + if (ll == LL_L2) + pr_cont("during L1 linefill from L2.\n"); + else if (ll == LL_L1) + pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec)); + else + ret = false; + } + return ret; +} - pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec)); +static bool k8_dc_mce(u16 ec) +{ + if (BUS_ERROR(ec)) { + pr_cont("during system linefill.\n"); + return true; + } - } else if (ll == 0x2 && rrrr == 0x3) - pr_cont(" during L1 linefill from L2.\n"); - else - goto wrong_dc_mce; - } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf) - pr_cont(" during system linefill.\n"); + return f10h_dc_mce(ec); +} + +static bool f14h_dc_mce(u16 ec) +{ + u8 r4 = (ec >> 4) & 0xf; + u8 ll = ec & 0x3; + u8 tt = (ec >> 2) & 0x3; + u8 ii = tt; + bool ret = true; + + if (MEM_ERROR(ec)) { + + if (tt != TT_DATA || ll != LL_L1) + return false; + + switch (r4) { + case R4_DRD: + case R4_DWR: + pr_cont("Data/Tag parity error due to %s.\n", + (r4 == R4_DRD ? "load/hw prf" : "store")); + break; + case R4_EVICT: + pr_cont("Copyback parity error on a tag miss.\n"); + break; + case R4_SNOOP: + pr_cont("Tag parity error during snoop.\n"); + break; + default: + ret = false; + } + } else if (BUS_ERROR(ec)) { + + if ((ii != II_MEM && ii != II_IO) || ll != LL_LG) + return false; + + pr_cont("System read data error on a "); + + switch (r4) { + case R4_RD: + pr_cont("TLB reload.\n"); + break; + case R4_DWR: + pr_cont("store.\n"); + break; + case R4_DRD: + pr_cont("load.\n"); + break; + default: + ret = false; + } + } else { + ret = false; + } + + return ret; +} + +static void amd_decode_dc_mce(struct mce *m) +{ + u16 ec = m->status & 0xffff; + u8 xec = (m->status >> 16) & 0xf; + + pr_emerg(HW_ERR "Data Cache Error: "); + + /* TLB error signatures are the same across families */ + if (TLB_ERROR(ec)) { + u8 tt = (ec >> 2) & 0x3; + + if (tt == TT_DATA) { + pr_cont("%s TLB %s.\n", LL_MSG(ec), + (xec ? 
"multimatch" : "parity error")); + return; + } else goto wrong_dc_mce; - } else + } + + if (!fam_ops->dc_mce(ec)) goto wrong_dc_mce; return; @@ -395,6 +474,30 @@ static int __init mce_amd_init(void) if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) return 0; + fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL); + if (!fam_ops) + return -ENOMEM; + + switch (boot_cpu_data.x86) { + case 0xf: + fam_ops->dc_mce = k8_dc_mce; + break; + + case 0x10: + fam_ops->dc_mce = f10h_dc_mce; + break; + + case 0x14: + fam_ops->dc_mce = f14h_dc_mce; + break; + + default: + printk(KERN_WARNING "Huh? What family is that: %d?!\n", + boot_cpu_data.x86); + kfree(fam_ops); + return -EINVAL; + } + atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); return 0; @@ -405,6 +508,7 @@ early_initcall(mce_amd_init); static void __exit mce_amd_exit(void) { atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); + kfree(fam_ops); } MODULE_DESCRIPTION("AMD MCE decoder"); diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 2712a906afdf..85985c225442 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -44,6 +44,39 @@ #define K8_NBSH_UECC BIT(13) #define K8_NBSH_ERR_SCRUBER BIT(8) +enum tt_ids { + TT_INSTR = 0, + TT_DATA, + TT_GEN, + TT_RESV, +}; + +enum ll_ids { + LL_RESV = 0, + LL_L1, + LL_L2, + LL_LG, +}; + +enum ii_ids { + II_MEM = 0, + II_RESV, + II_IO, + II_GEN, +}; + +enum rrrr_ids { + R4_GEN = 0, + R4_RD, + R4_WR, + R4_DRD, + R4_DWR, + R4_IRD, + R4_PREF, + R4_EVICT, + R4_SNOOP, +}; + extern const char *tt_msgs[]; extern const char *ll_msgs[]; extern const char *rrrr_msgs[]; @@ -63,6 +96,13 @@ struct err_regs { u32 nbeal; }; +/* + * per-family decoder ops + */ +struct amd_decoder_ops { + bool (*dc_mce)(u16); +}; + void amd_report_gart_errors(bool); void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32)); void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32)); -- cgit v1.2.1 From 
dd53bce4e8987f6848840d42bbeead5221eff308 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Aug 2010 19:05:49 +0200 Subject: EDAC, MCE: Adjust IC decoders to F14h Add support for IC MCEs for F14h CPUs. K8 and F10h are almost identical so use one function for both. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 118 +++++++++++++++++++++++++++++-------------------- drivers/edac/mce_amd.h | 1 + 2 files changed, 71 insertions(+), 48 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 33985aa61356..60d5d9f4dfee 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -219,61 +219,80 @@ wrong_dc_mce: pr_emerg(HW_ERR "Corrupted DC MCE info?\n"); } -static void amd_decode_ic_mce(struct mce *m) +static bool k8_ic_mce(u16 ec) { - u32 ec = m->status & 0xffff; - u32 xec = (m->status >> 16) & 0xf; + u8 ll = ec & 0x3; + u8 r4 = (ec >> 4) & 0xf; + bool ret = true; - pr_emerg(HW_ERR "Instruction Cache Error"); + if (!MEM_ERROR(ec)) + return false; - if (xec == 1 && TLB_ERROR(ec)) - pr_cont(": %s TLB multimatch.\n", LL_MSG(ec)); - else if (xec == 0) { - if (TLB_ERROR(ec)) - pr_cont(": %s TLB Parity error.\n", LL_MSG(ec)); - else if (BUS_ERROR(ec)) { - if (boot_cpu_data.x86 == 0xf && - (m->status & BIT(58))) - pr_cont(" during system linefill.\n"); - else - pr_cont(" during attempted NB data read.\n"); - } else if (MEM_ERROR(ec)) { - u8 ll = ec & 0x3; - u8 rrrr = (ec >> 4) & 0xf; + if (ll == 0x2) + pr_cont("during a linefill from L2.\n"); + else if (ll == 0x1) { + switch (r4) { + case R4_IRD: + pr_cont("Parity error during data load.\n"); + break; - if (ll == 0x2) - pr_cont(" during a linefill from L2.\n"); - else if (ll == 0x1) { - - switch (rrrr) { - case 0x5: - pr_cont(": Parity error during " - "data load.\n"); - break; - - case 0x7: - pr_cont(": Copyback Parity/Victim" - " error.\n"); - break; - - case 0x8: - pr_cont(": Tag Snoop error.\n"); - break; - - default: - goto wrong_ic_mce; - break; - } - } - } else - goto 
wrong_ic_mce; + case R4_EVICT: + pr_cont("Copyback Parity/Victim error.\n"); + break; + + case R4_SNOOP: + pr_cont("Tag Snoop error.\n"); + break; + + default: + ret = false; + break; + } } else - goto wrong_ic_mce; + ret = false; - return; + return ret; +} + +static bool f14h_ic_mce(u16 ec) +{ + u8 ll = ec & 0x3; + u8 tt = (ec >> 2) & 0x3; + u8 r4 = (ec >> 4) & 0xf; + bool ret = true; -wrong_ic_mce: - pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); + if (MEM_ERROR(ec)) { + if (tt != 0 || ll != 1) + ret = false; + + if (r4 == R4_IRD) + pr_cont("Data/tag array parity error for a tag hit.\n"); + else if (r4 == R4_SNOOP) + pr_cont("Tag error during snoop/victimization.\n"); + else + ret = false; + } + return ret; +} + +static void amd_decode_ic_mce(struct mce *m) +{ + u16 ec = m->status & 0xffff; + u8 xec = (m->status >> 16) & 0xf; + + pr_emerg(HW_ERR "Instruction Cache Error: "); + + if (TLB_ERROR(ec)) + pr_cont("%s TLB %s.\n", LL_MSG(ec), + (xec ? "multimatch" : "parity error")); + else if (BUS_ERROR(ec)) { + bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT(58))); + + pr_cont("during %s.\n", (k8 ? 
"system linefill" : "NB data read")); + } else if (fam_ops->ic_mce(ec)) + ; + else + pr_emerg(HW_ERR "Corrupted IC MCE info?\n"); } static void amd_decode_bu_mce(struct mce *m) @@ -481,14 +500,17 @@ static int __init mce_amd_init(void) switch (boot_cpu_data.x86) { case 0xf: fam_ops->dc_mce = k8_dc_mce; + fam_ops->ic_mce = k8_ic_mce; break; case 0x10: fam_ops->dc_mce = f10h_dc_mce; + fam_ops->ic_mce = k8_ic_mce; break; case 0x14: fam_ops->dc_mce = f14h_dc_mce; + fam_ops->ic_mce = f14h_ic_mce; break; default: diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 85985c225442..dc81dba9364b 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -101,6 +101,7 @@ struct err_regs { */ struct amd_decoder_ops { bool (*dc_mce)(u16); + bool (*ic_mce)(u16); }; void amd_report_gart_errors(bool); -- cgit v1.2.1 From ded506232865e8e932bc21c87f48170d50db4d97 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 27 Aug 2010 17:03:34 +0200 Subject: EDAC, MCE: Warn about LS MCEs on F14h F14h CPUs do not generate LS MCEs so exit early and warn the user in case this path is ever hit that something else might be going haywire. 
Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 60d5d9f4dfee..3c161672a84b 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -339,19 +339,27 @@ wrong_bu_mce: static void amd_decode_ls_mce(struct mce *m) { - u32 ec = m->status & 0xffff; - u32 xec = (m->status >> 16) & 0xf; + u16 ec = m->status & 0xffff; + u8 xec = (m->status >> 16) & 0xf; + + if (boot_cpu_data.x86 == 0x14) { + pr_emerg("You shouldn't be seeing an LS MCE on this cpu family," + " please report on LKML.\n"); + return; + } pr_emerg(HW_ERR "Load Store Error"); if (xec == 0x0) { - u8 rrrr = (ec >> 4) & 0xf; + u8 r4 = (ec >> 4) & 0xf; - if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4)) + if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR)) goto wrong_ls_mce; pr_cont(" during %s.\n", RRRR_MSG(ec)); - } + } else + goto wrong_ls_mce; + return; wrong_ls_mce: -- cgit v1.2.1 From 5ce88f6ea6bef929f59f9468413f922c9a486fa4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 31 Aug 2010 18:28:08 +0200 Subject: EDAC, MCE: Complete NB MCE decoders Add support for decoding F14h BU MCEs and improve decoding of the remaining families. 
Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.h | 1 - drivers/edac/mce_amd.c | 210 ++++++++++++++++++++++++++++++++++------------ drivers/edac/mce_amd.h | 3 +- 3 files changed, 158 insertions(+), 56 deletions(-) diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 13e1d6f25bd1..044aee4f944d 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -482,7 +482,6 @@ extern const char *rrrr_msgs[16]; extern const char *to_msgs[2]; extern const char *pp_msgs[4]; extern const char *ii_msgs[4]; -extern const char *ext_msgs[32]; extern const char *htlink_msgs[8]; #ifdef CONFIG_EDAC_DEBUG diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 3c161672a84b..d8d1c9de1ed6 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -5,6 +5,8 @@ static struct amd_decoder_ops *fam_ops; +static u8 nb_err_cpumask = 0xf; + static bool report_gart_errors; static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg); @@ -61,45 +63,16 @@ EXPORT_SYMBOL_GPL(to_msgs); const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" }; EXPORT_SYMBOL_GPL(ii_msgs); -/* - * Map the 4 or 5 (family-specific) bits of Extended Error code to the - * string table. 
- */ -const char *ext_msgs[] = { - "K8 ECC error", /* 0_0000b */ - "CRC error on link", /* 0_0001b */ - "Sync error packets on link", /* 0_0010b */ - "Master Abort during link operation", /* 0_0011b */ - "Target Abort during link operation", /* 0_0100b */ - "Invalid GART PTE entry during table walk", /* 0_0101b */ - "Unsupported atomic RMW command received", /* 0_0110b */ - "WDT error: NB transaction timeout", /* 0_0111b */ - "ECC/ChipKill ECC error", /* 0_1000b */ - "SVM DEV Error", /* 0_1001b */ - "Link Data error", /* 0_1010b */ - "Link/L3/Probe Filter Protocol error", /* 0_1011b */ - "NB Internal Arrays Parity error", /* 0_1100b */ - "DRAM Address/Control Parity error", /* 0_1101b */ - "Link Transmission error", /* 0_1110b */ - "GART/DEV Table Walk Data error" /* 0_1111b */ - "Res 0x100 error", /* 1_0000b */ - "Res 0x101 error", /* 1_0001b */ - "Res 0x102 error", /* 1_0010b */ - "Res 0x103 error", /* 1_0011b */ - "Res 0x104 error", /* 1_0100b */ - "Res 0x105 error", /* 1_0101b */ - "Res 0x106 error", /* 1_0110b */ - "Res 0x107 error", /* 1_0111b */ - "Res 0x108 error", /* 1_1000b */ - "Res 0x109 error", /* 1_1001b */ - "Res 0x10A error", /* 1_1010b */ - "Res 0x10B error", /* 1_1011b */ - "ECC error in L3 Cache Data", /* 1_1100b */ - "L3 Cache Tag error", /* 1_1101b */ - "L3 Cache LRU Parity error", /* 1_1110b */ - "Probe Filter error" /* 1_1111b */ +static const char *f10h_nb_mce_desc[] = { + "HT link data error", + "Protocol error (link, L3, probe filter, etc.)", + "Parity error in NB-internal arrays", + "Link Retry due to IO link transmission error", + "L3 ECC data cache error", + "ECC error in L3 cache tag", + "L3 LRU parity bits error", + "ECC Error in the Probe Filter directory" }; -EXPORT_SYMBOL_GPL(ext_msgs); static bool f10h_dc_mce(u16 ec) { @@ -366,19 +339,97 @@ wrong_ls_mce: pr_emerg(HW_ERR "Corrupted LS MCE info?\n"); } +static bool k8_nb_mce(u16 ec, u8 xec) +{ + bool ret = true; + + switch (xec) { + case 0x1: + pr_cont("CRC error detected on HT 
link.\n"); + break; + + case 0x5: + pr_cont("Invalid GART PTE entry during GART table walk.\n"); + break; + + case 0x6: + pr_cont("Unsupported atomic RMW received from an IO link.\n"); + break; + + case 0x0: + case 0x8: + pr_cont("DRAM ECC error detected on the NB.\n"); + break; + + case 0xd: + pr_cont("Parity error on the DRAM addr/ctl signals.\n"); + break; + + default: + ret = false; + break; + } + + return ret; +} + +static bool f10h_nb_mce(u16 ec, u8 xec) +{ + bool ret = true; + u8 offset = 0; + + if (k8_nb_mce(ec, xec)) + return true; + + switch(xec) { + case 0xa ... 0xc: + offset = 10; + break; + + case 0xe: + offset = 11; + break; + + case 0xf: + if (TLB_ERROR(ec)) + pr_cont("GART Table Walk data error.\n"); + else if (BUS_ERROR(ec)) + pr_cont("DMA Exclusion Vector Table Walk error.\n"); + else + ret = false; + + goto out; + break; + + case 0x1c ... 0x1f: + offset = 24; + break; + + default: + ret = false; + + goto out; + break; + } + + pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]); + +out: + return ret; +} + +static bool f14h_nb_mce(u16 ec, u8 xec) +{ + return false; +} + void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) { - u32 ec = m->status & 0xffff; + u8 xec = (m->status >> 16) & 0x1f; + u16 ec = m->status & 0xffff; u32 nbsh = (u32)(m->status >> 32); - u32 nbsl = (u32)m->status; - - /* - * GART TLB error reporting is disabled by default. Bail out early. 
- */ - if (TLB_ERROR(ec) && !report_gart_errors) - return; - pr_emerg(HW_ERR "Northbridge Error, node %d", node_id); + pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id); /* * F10h, revD can disable ErrCpu[3:0] so check that first and also the @@ -387,20 +438,50 @@ void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg) if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model > 7)) { if (nbsh & K8_NBSH_ERR_CPU_VAL) - pr_cont(", core: %u\n", (u8)(nbsh & 0xf)); + pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask)); } else { - u8 assoc_cpus = nbsh & 0xf; + u8 assoc_cpus = nbsh & nb_err_cpumask; if (assoc_cpus > 0) pr_cont(", core: %d", fls(assoc_cpus) - 1); + } - pr_cont("\n"); + switch (xec) { + case 0x2: + pr_cont("Sync error (sync packets on HT link detected).\n"); + return; + + case 0x3: + pr_cont("HT Master abort.\n"); + return; + + case 0x4: + pr_cont("HT Target abort.\n"); + return; + + case 0x7: + pr_cont("NB Watchdog timeout.\n"); + return; + + case 0x9: + pr_cont("SVM DMA Exclusion Vector error.\n"); + return; + + default: + break; } - pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl)); + if (!fam_ops->nb_mce(ec, xec)) + goto wrong_nb_mce; + + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10) + if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder) + nb_bus_decoder(node_id, m, nbcfg); - if (BUS_ERROR(ec) && nb_bus_decoder) - nb_bus_decoder(node_id, m, nbcfg); + return; + +wrong_nb_mce: + pr_emerg(HW_ERR "Corrupted NB MCE info?\n"); } EXPORT_SYMBOL_GPL(amd_decode_nb_mce); @@ -430,11 +511,30 @@ static inline void amd_decode_err_code(u16 ec) pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec); } +/* + * Filter out unwanted MCE signatures here. + */ +static bool amd_filter_mce(struct mce *m) +{ + u8 xec = (m->status >> 16) & 0x1f; + + /* + * NB GART TLB error reporting is disabled by default. 
+ */ + if (m->bank == 4 && xec == 0x5 && !report_gart_errors) + return true; + + return false; +} + int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { struct mce *m = (struct mce *)data; int node, ecc; + if (amd_filter_mce(m)) + return NOTIFY_STOP; + pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank); pr_cont("%sorrected error, other errors lost: %s, " @@ -509,16 +609,20 @@ static int __init mce_amd_init(void) case 0xf: fam_ops->dc_mce = k8_dc_mce; fam_ops->ic_mce = k8_ic_mce; + fam_ops->nb_mce = k8_nb_mce; break; case 0x10: fam_ops->dc_mce = f10h_dc_mce; fam_ops->ic_mce = k8_ic_mce; + fam_ops->nb_mce = f10h_nb_mce; break; case 0x14: + nb_err_cpumask = 0x3; fam_ops->dc_mce = f14h_dc_mce; fam_ops->ic_mce = f14h_ic_mce; + fam_ops->nb_mce = f14h_nb_mce; break; default: diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index dc81dba9364b..0d0637debbad 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -7,7 +7,6 @@ #define ERROR_CODE(x) ((x) & 0xffff) #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) -#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)] #define LOW_SYNDROME(x) (((x) >> 15) & 0xff) #define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) @@ -83,7 +82,6 @@ extern const char *rrrr_msgs[]; extern const char *pp_msgs[]; extern const char *to_msgs[]; extern const char *ii_msgs[]; -extern const char *ext_msgs[]; /* * relevant NB regs @@ -102,6 +100,7 @@ struct err_regs { struct amd_decoder_ops { bool (*dc_mce)(u16); bool (*ic_mce)(u16); + bool (*nb_mce)(u16, u8); }; void amd_report_gart_errors(bool); -- cgit v1.2.1 From fe4ea2623bec3e595f8e77a8514307c389c096ae Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 31 Aug 2010 18:38:24 +0200 Subject: EDAC, MCE: Fix FR MCEs decoding Those are N/A on K8, so don't decode them there. 
Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index d8d1c9de1ed6..83b7b5fcee7f 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -487,11 +487,17 @@ EXPORT_SYMBOL_GPL(amd_decode_nb_mce); static void amd_decode_fr_mce(struct mce *m) { + if (boot_cpu_data.x86 == 0xf) + goto wrong_fr_mce; + /* we have only one error signature so match all fields at once. */ - if ((m->status & 0xffff) == 0x0f0f) - pr_emerg(HW_ERR " FR Error: CPU Watchdog timer expire.\n"); - else - pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); + if ((m->status & 0xffff) == 0x0f0f) { + pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n"); + return; + } + +wrong_fr_mce: + pr_emerg(HW_ERR "Corrupted FR MCE info?\n"); } static inline void amd_decode_err_code(u16 ec) -- cgit v1.2.1 From 9530d608ef0e1f76b7fd82bb92645062292fc009 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 6 Sep 2010 15:05:45 +0200 Subject: EDAC, MCE: Enable MCE decoding on F14h Now that all decoders have been taught about F14h, models < 0x10 MCEs, enable decoding on this family of CPUs. Also, issue a short informational message upon boot that MCE decoding gets enabled. 
Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 83b7b5fcee7f..f233c5f78302 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -598,13 +598,12 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { - /* - * We can decode MCEs for K8, F10h and F11h CPUs: - */ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + if (boot_cpu_data.x86 != 0xf && + boot_cpu_data.x86 != 0x10 && + (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf)) return 0; fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL); @@ -638,6 +637,8 @@ static int __init mce_amd_init(void) return -EINVAL; } + pr_info("MCE: In-kernel MCE decoding enabled.\n"); + atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); return 0; -- cgit v1.2.1 From f0157b3afd2ec6331245768a785487249a3c9734 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 Oct 2010 19:07:16 +0200 Subject: EDAC, MCE: Add support for F11h MCEs F11h has almost the same MCE signatures as K8 except DRAM ECC and MC5 bank errors. Reuse functionality from the other families. 
Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index f233c5f78302..9fa61ee2d743 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -358,6 +358,9 @@ static bool k8_nb_mce(u16 ec, u8 xec) case 0x0: case 0x8: + if (boot_cpu_data.x86 == 0x11) + return false; + pr_cont("DRAM ECC error detected on the NB.\n"); break; @@ -487,7 +490,8 @@ EXPORT_SYMBOL_GPL(amd_decode_nb_mce); static void amd_decode_fr_mce(struct mce *m) { - if (boot_cpu_data.x86 == 0xf) + if (boot_cpu_data.x86 == 0xf || + boot_cpu_data.x86 == 0x11) goto wrong_fr_mce; /* we have only one error signature so match all fields at once. */ @@ -601,8 +605,7 @@ static int __init mce_amd_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - if (boot_cpu_data.x86 != 0xf && - boot_cpu_data.x86 != 0x10 && + if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) && (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf)) return 0; @@ -623,6 +626,12 @@ static int __init mce_amd_init(void) fam_ops->nb_mce = f10h_nb_mce; break; + case 0x11: + fam_ops->dc_mce = k8_dc_mce; + fam_ops->ic_mce = k8_ic_mce; + fam_ops->nb_mce = f10h_nb_mce; + break; + case 0x14: nb_err_cpumask = 0x3; fam_ops->dc_mce = f14h_dc_mce; -- cgit v1.2.1 From 9be0bb1072e3544934e0ac20f184e50805aecf9c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 16 Sep 2010 15:08:14 +0200 Subject: EDAC, MCE: Add F12h DC MCE decoder F12h DC MCE signatures are a subset of F10h's so reuse them. 
Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 9fa61ee2d743..1f895dfb756a 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -74,16 +74,10 @@ static const char *f10h_nb_mce_desc[] = { "ECC Error in the Probe Filter directory" }; -static bool f10h_dc_mce(u16 ec) +static bool f12h_dc_mce(u16 ec) { - u8 r4 = (ec >> 4) & 0xf; bool ret = false; - if (r4 == R4_GEN) { - pr_cont("during data scrub.\n"); - return true; - } - if (MEM_ERROR(ec)) { u8 ll = ec & 0x3; ret = true; @@ -98,6 +92,18 @@ static bool f10h_dc_mce(u16 ec) return ret; } +static bool f10h_dc_mce(u16 ec) +{ + u8 r4 = (ec >> 4) & 0xf; + u8 ll = ec & 0x3; + + if (r4 == R4_GEN && ll == LL_L1) { + pr_cont("during data scrub.\n"); + return true; + } + return f12h_dc_mce(ec); +} + static bool k8_dc_mce(u16 ec) { if (BUS_ERROR(ec)) { @@ -632,6 +638,10 @@ static int __init mce_amd_init(void) fam_ops->nb_mce = f10h_nb_mce; break; + case 0x12: + fam_ops->dc_mce = f12h_dc_mce; + break; + case 0x14: nb_err_cpumask = 0x3; fam_ops->dc_mce = f14h_dc_mce; -- cgit v1.2.1 From e7281eb37da045abac5bd795d1169fc2e3eeea49 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 16 Sep 2010 16:45:22 +0200 Subject: EDAC, MCE: Add F12h IC MCE decoder ... which is the same as for K8 and F10h. 
Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 1f895dfb756a..fc3712f1b921 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -640,6 +640,7 @@ static int __init mce_amd_init(void) case 0x12: fam_ops->dc_mce = f12h_dc_mce; + fam_ops->ic_mce = k8_ic_mce; break; case 0x14: -- cgit v1.2.1 From cb9d5ecdff66197f65a6be8032ccc1ebf7199684 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 16 Sep 2010 17:36:12 +0200 Subject: EDAC, MCE: Add F12h NB MCE decoder F12h is completely covered by the generic path. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index fc3712f1b921..6501392389af 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -427,7 +427,7 @@ out: return ret; } -static bool f14h_nb_mce(u16 ec, u8 xec) +static bool nb_noop_mce(u16 ec, u8 xec) { return false; } @@ -641,13 +641,14 @@ static int __init mce_amd_init(void) case 0x12: fam_ops->dc_mce = f12h_dc_mce; fam_ops->ic_mce = k8_ic_mce; + fam_ops->nb_mce = nb_noop_mce; break; case 0x14: nb_err_cpumask = 0x3; fam_ops->dc_mce = f14h_dc_mce; fam_ops->ic_mce = f14h_ic_mce; - fam_ops->nb_mce = f14h_nb_mce; + fam_ops->nb_mce = nb_noop_mce; break; default: -- cgit v1.2.1 From fda7561f438aeddf074e2db0890e283195aa7779 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 22 Sep 2010 16:12:03 +0200 Subject: EDAC, MCE: Enable MCE decoding on F12h Turn on MCE decoding on F12h. 
Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 6501392389af..7f74f0f318c8 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -611,7 +611,7 @@ static int __init mce_amd_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) && + if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) && (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf)) return 0; -- cgit v1.2.1 From cf1d2200dbc214c26a116c4d0c75b7cf27bb19b6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 15 Oct 2010 15:20:18 +0200 Subject: EDAC, MCE: Add a BIT_64() macro Add a macro for 64-bit vectors to use when accessing MSR contents. Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h index 0d0637debbad..35f6e0e3b297 100644 --- a/drivers/edac/mce_amd.h +++ b/drivers/edac/mce_amd.h @@ -5,6 +5,8 @@ #include <asm/mce.h> +#define BIT_64(n) (U64_C(1) << (n)) + #define ERROR_CODE(x) ((x) & 0xffff) #define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) -- cgit v1.2.1 From 525906bc898d712f21e5bfcfc85ab0e517e3d086 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 15 Oct 2010 15:27:02 +0200 Subject: EDAC, MCE: Fix shift warning on 32-bit Fix drivers/edac/mce_amd.c:262: warning: left shift count >= width of type on 32-bit builds. Reported-by: Randy Dunlap Signed-off-by: Borislav Petkov --- drivers/edac/mce_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 7f74f0f318c8..c0181093b490 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -265,7 +265,7 @@ static void amd_decode_ic_mce(struct mce *m) pr_cont("%s TLB %s.\n", LL_MSG(ec), (xec ?
"multimatch" : "parity error")); else if (BUS_ERROR(ec)) { - bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT(58))); + bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58))); pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read")); } else if (fam_ops->ic_mce(ec)) -- cgit v1.2.1