summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMahesh Salgaonkar <mahesh@linux.vnet.ibm.com>2017-12-08 16:19:42 +0530
committerStewart Smith <stewart@linux.vnet.ibm.com>2017-12-11 19:30:46 -0600
commit10f0a09239ddfd4faf47d792f04d3124fb347f88 (patch)
treebbf6652baf6b769dc33498810c7c2f60e8537c1d
parent363f328fbc597a5996fc3b28e509c09f2869888f (diff)
downloadblackbird-skiboot-10f0a09239ddfd4faf47d792f04d3124fb347f88.tar.gz
blackbird-skiboot-10f0a09239ddfd4faf47d792f04d3124fb347f88.zip
opal/xscom: Add recovery for lost core wakeup scom failures.
A hardware issue can delay a core's response to a SCOM during thread reconfiguration, leaving the SCOM logic in a state where subsequent SCOMs to that core can get errors. This affects the Core PC SCOM registers in the range 20010A80-20010ABF. The solution: if an XSCOM timeout occurs on one of the Core PC SCOM registers in the range 20010A80-20010ABF, a clearing SCOM write is done to 0x20010800 with data of '0x00000000'; this write also gets a timeout, but it clears the SCOM logic errors. After the clearing write is done, the original SCOM operation can be retried. The SCOM timeout is reported as status 0x4 (Invalid address) in HMER[21-23]. Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> Reviewed-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
-rw-r--r--hw/xscom.c80
-rw-r--r--include/xscom.h8
2 files changed, 85 insertions, 3 deletions
diff --git a/hw/xscom.c b/hw/xscom.c
index d98f5ef7..de5a27ee 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -153,8 +153,69 @@ static void xscom_reset(uint32_t gcid, bool need_delay)
*/
}
+static int xscom_clear_error(uint32_t gcid, uint32_t pcb_addr)
+{
+ u64 hmer;
+ uint32_t base_xscom_addr;
+ uint32_t xscom_clear_reg = 0x20010800;
+
+ /* The workaround is applied only on p9; otherwise nothing is done */
+ if (proc_gen != proc_gen_p9)
+ return 0;
+
+ /*
+ * A hardware issue can delay a core's response to a scom during
+ * thread reconfiguration, leaving the scom logic in a state where
+ * subsequent scoms to that core can get errors. This affects the
+ * Core PC scom registers in the range 20010A80-20010ABF.
+ *
+ * The fix: when an xscom to one of those registers times out, do a
+ * clearing scom write of '0x00000000' to 0x20010800. That write also
+ * times out, but it clears the scom logic errors, after which the
+ * original scom operation can be retried.
+ *
+ * The scom timeout is reported as status 0x4 (Invalid address)
+ * in HMER[21-23].
+ *
+ * Returns 1 if the clearing sequence was run, 0 if it was skipped.
+ */
+ /* NOTE(review): XSCOM_CLEAR_RANGE_START is 0x20010A00, below the 20010A80 quoted above - confirm */
+ base_xscom_addr = pcb_addr & XSCOM_CLEAR_RANGE_MASK;
+ if (!((base_xscom_addr >= XSCOM_CLEAR_RANGE_START) &&
+ (base_xscom_addr <= XSCOM_CLEAR_RANGE_END)))
+ return 0; /* masked address is outside the clear range */
+
+ /*
+ * Reset the XSCOM first, otherwise the next scom operation will fail.
+ * A small delay is also needed before issuing the clearing write:
+ * without it, the clearing write has been observed to report a
+ * wrong status.
+ */
+ xscom_reset(gcid, true);
+
+ /* Clear errors in HMER */
+ mtspr(SPR_HMER, HMER_CLR_MASK);
+
+ /* Write 0 to the clearing register to clear the xscom logic errors on the target chip */
+ out_be64(xscom_addr(gcid, xscom_clear_reg), 0);
+ hmer = xscom_wait_done();
+
+ /*
+ * The clearing xscom write above is expected to time out and fail
+ * with an invalid access, as there is no register at that address;
+ * its only purpose is to clear the xscom logic error.
+ *
+ * On failure, reset the XSCOM or we'll hang on the next access.
+ */
+ if (hmer & SPR_HMER_XSCOM_FAIL)
+ xscom_reset(gcid, true);
+
+ return 1;
+}
+
static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
- bool is_write, int64_t retries)
+ bool is_write, int64_t retries,
+ int64_t *xscom_clear_retries)
{
unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
int64_t rc = OPAL_HARDWARE;
@@ -193,6 +254,15 @@ static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_add
break;
case 4: /* Invalid address / address error */
rc = OPAL_XSCOM_ADDR_ERROR;
+ if (xscom_clear_error(gcid, pcb_addr)) {
+ /* return busy if retries still pending. */
+ if ((*xscom_clear_retries)--)
+ return OPAL_XSCOM_BUSY;
+
+ prlog(PR_DEBUG, "XSCOM: error recovery failed for "
+ "gcid=0x%x pcb_addr=0x%x\n", gcid, pcb_addr);
+
+ }
break;
case 5: /* Clock error */
rc = OPAL_XSCOM_CLOCK_ERROR;
@@ -255,6 +325,7 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
{
uint64_t hmer;
int64_t ret, retries;
+ int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES;
if (!xscom_gcid_ok(gcid)) {
prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
@@ -278,7 +349,8 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
return OPAL_SUCCESS;
/* Handle error and possibly eventually retry */
- ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
+ ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries,
+ &xscom_clear_retries);
if (ret != OPAL_BUSY)
break;
}
@@ -305,6 +377,7 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
{
uint64_t hmer;
int64_t ret, retries = 0;
+ int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES;
if (!xscom_gcid_ok(gcid)) {
prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
@@ -328,7 +401,8 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
return OPAL_SUCCESS;
/* Handle error and possibly eventually retry */
- ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
+ ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries,
+ &xscom_clear_retries);
if (ret != OPAL_BUSY)
break;
}
diff --git a/include/xscom.h b/include/xscom.h
index 5a5d0b9b..98532240 100644
--- a/include/xscom.h
+++ b/include/xscom.h
@@ -206,6 +206,14 @@
/* Max number of retries when XSCOM remains busy */
#define XSCOM_BUSY_MAX_RETRIES 3000
+/* Max number of retries for xscom clearing recovery. */
+#define XSCOM_CLEAR_MAX_RETRIES 10
+
+/* xscom clear address range/mask */
+#define XSCOM_CLEAR_RANGE_START 0x20010A00
+#define XSCOM_CLEAR_RANGE_END 0x20010ABF
+#define XSCOM_CLEAR_RANGE_MASK 0x200FFBFF
+
/* Retry count after which to reset XSCOM, if still busy */
#define XSCOM_BUSY_RESET_THRESHOLD 1000
OpenPOWER on IntegriCloud