summaryrefslogtreecommitdiffstats
path: root/core
diff options
context:
space:
mode:
authorNicholas Piggin <npiggin@gmail.com>2018-03-01 17:12:20 +1000
committerStewart Smith <stewart@linux.vnet.ibm.com>2018-03-01 20:36:53 -0600
commit56a85b41d23147e7dbe6d78d5a46d13910bc8495 (patch)
treec45de89a8fb6a1be5872a635cc126f3c8bc80a4d /core
parent8ea3ac76137be3f02d4131b36a66f6917190e384 (diff)
downloadtalos-skiboot-56a85b41d23147e7dbe6d78d5a46d13910bc8495.tar.gz
talos-skiboot-56a85b41d23147e7dbe6d78d5a46d13910bc8495.zip
core/hmi: report processor recovery reason from core FIR bits on P9
When an error is encountered that causes processor recovery, HMI is generated if the recovery was successful. The reason is recorded in the core FIR, which gets copied into the WOF. In this case dump the WOF register and an error string into the OPAL msglog. A broken init setting led to HMIs reported in Linux as: [ 3.591547] Harmless Hypervisor Maintenance interrupt [Recovered] [ 3.591648] Error detail: Processor Recovery done [ 3.591714] HMER: 2040000000000000 This patch would have been useful because it tells us exactly that the problem is in the d-side ERAT: [ 414.489690798,7] HMI: Received HMI interrupt: HMER = 0x2040000000000000 [ 414.489693339,7] HMI: [Loc: UOPWR.0000000-Node0-Proc0]: P:0 C:1 T:1: Processor recovery occurred. [ 414.489699837,7] HMI: Core WOF = 0x0000000410000000 recovered error: [ 414.489701543,7] HMI: LSU - SRAM (DCACHE parity, etc) [ 414.489702341,7] HMI: LSU - ERAT multi hit In future it will be good to unify this reporting, so Linux could print something more useful. Until then, this gives some good data. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Diffstat (limited to 'core')
-rw-r--r--core/hmi.c62
1 files changed, 59 insertions, 3 deletions
diff --git a/core/hmi.c b/core/hmi.c
index 17983a33..846d2b92 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -163,6 +163,9 @@
#define P8_CORE_FIR 0x10013100
#define P9_CORE_FIR 0x20010A40
+/* And core WOF (Whose On First) */
+#define P9_CORE_WOF 0x20010A48
+
/* xscom addresses for pMisc Receive Malfunction Alert Register */
#define P8_MALFUNC_ALERT 0x02020011
#define P9_MALFUNC_ALERT 0x00090022
@@ -215,6 +218,28 @@ static const struct core_xstop_bit_info {
{ 63, CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ },
};
+static const struct core_recoverable_bit_info {
+ uint8_t bit; /* CORE FIR bit number */
+ const char *reason;
+} recoverable_bits[] = {
+ { 0, "IFU - SRAM (ICACHE parity, etc)" },
+ { 2, "IFU - RegFile" },
+ { 4, "IFU - Logic" },
+ { 9, "ISU - RegFile" },
+ { 11, "ISU - Logic" },
+ { 13, "ISU - Recoverable due to not in MT window" },
+ { 24, "VSU - Logic" },
+ { 27, "VSU - DFU logic" },
+ { 29, "LSU - SRAM (DCACHE parity, etc)" },
+ { 31, "LSU - RegFile" },
+ /* The following 3 bits may be set by SRAM errors. */
+ { 33, "LSU - TLB multi hit" },
+ { 34, "LSU - SLB multi hit" },
+ { 35, "LSU - ERAT multi hit" },
+ { 37, "LSU - Logic" },
+ { 39, "LSU - Recoverable due to not in MT window" },
+};
+
static const struct nx_xstop_bit_info {
uint8_t bit; /* NX FIR bit number */
enum OpalHMI_NestAccelXstopReason reason;
@@ -313,6 +338,21 @@ static int read_core_fir(uint32_t chip_id, uint32_t core_id, uint64_t *core_fir)
return rc;
}
+static int read_core_wof(uint32_t chip_id, uint32_t core_id, uint64_t *core_wof)
+{
+ int rc;
+
+ switch (proc_gen) {
+ case proc_gen_p9:
+ rc = xscom_read(chip_id,
+ XSCOM_ADDR_P9_EC(core_id, P9_CORE_WOF), core_wof);
+ break;
+ default:
+ rc = OPAL_HARDWARE;
+ }
+ return rc;
+}
+
static bool decode_core_fir(struct cpu_thread *cpu,
struct OpalHMIEvent *hmi_evt)
{
@@ -1069,6 +1109,7 @@ static void hmi_print_debug(const uint8_t *msg)
int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
{
+ struct cpu_thread *cpu = this_cpu();
int recover = 1;
uint64_t tfmr;
@@ -1084,18 +1125,33 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
* looking at TFMR register. TFMR will tell us correct state of
* TB register.
*/
- this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
+ cpu->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
prlog(PR_DEBUG, "Received HMI interrupt: HMER = 0x%016llx\n", hmer);
if (hmi_evt)
hmi_evt->hmer = hmer;
if (hmer & SPR_HMER_PROC_RECV_DONE) {
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint64_t core_wof;
+
+ hmi_print_debug("Processor recovery occurred.");
+ if (!read_core_wof(chip_id, core_id, &core_wof)) {
+ int i;
+
+ prlog(PR_DEBUG, "Core WOF = 0x%016llx recovered error:\n", core_wof);
+ for (i = 0; i < ARRAY_SIZE(recoverable_bits); i++) {
+ if (core_wof & PPC_BIT(recoverable_bits[i].bit))
+ prlog(PR_DEBUG, "%s\n",
+ recoverable_bits[i].reason);
+ }
+ }
+
hmer &= ~SPR_HMER_PROC_RECV_DONE;
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
queue_hmi_event(hmi_evt, recover);
}
- hmi_print_debug("Processor recovery Done.");
}
if (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED) {
hmer &= ~SPR_HMER_PROC_RECV_ERROR_MASKED;
@@ -1180,7 +1236,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
mtspr(SPR_HMER, hmer);
hmi_exit();
/* Set the TB state looking at TFMR register before we head out. */
- this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
+ cpu->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
unlock(&hmi_lock);
return recover;
}
OpenPOWER on IntegriCloud