diff options
author | Ananth N Mavinakayanahalli <ananth@in.ibm.com> | 2014-07-24 11:46:58 +0530 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2014-07-25 14:10:51 +1000 |
commit | cad0d16e236bcbf457406023d5df4da8d55b778f (patch) | |
tree | b63b5f3e36fba1a0bc4be4f8daf308b7b113b2c8 | |
parent | 99608f2074d4c8877c8445e20a1275dc1257079c (diff) | |
download | talos-skiboot-cad0d16e236bcbf457406023d5df4da8d55b778f.tar.gz talos-skiboot-cad0d16e236bcbf457406023d5df4da8d55b778f.zip |
FSP: Rework the R/R state machine
a. Do not trigger PSI link down on DISR's RR bit set.
b. Do trigger HIR if DISR's Unit Check bit is set.
c. On fsp_mbx_err, trigger a HIR (very rare occurrence).
d. Use fsp_start_rr() when the DISR's RR bit is seen so all mbox activity is
stopped when the FSP indicates an RR start.
We do not bring the link down voluntarily on DISR's RR begin, pending a
PSI interrupt, which actually triggers the link down. Per Dean Sanner, this
is the right protocol to follow.
The assumption is that a DISR RR bit set would cause a PSI interrupt 'soon'.
One TODO is to figure out what to do if this interrupt never arrives.
The PSI interrupt does come through albeit after a short while:
SURV: [ 1dc662ef7f] Sending the hearbeat command to FSP
SURV: Received heartbeat acknowledge from FSP
FSP #0: DISR stat change = 0x000000a1
FSP #0: FSP in Reset. Waiting for PSI interrupt
FSPCON: Closed consoles on account of FSP reset/reload
SURV: Disabling surveillance
FSP: Closing NVRAM on account of FSP Reset
FSP #0: HDES stat change = 0xffffffff
PSI[0x000]: PSI mgmnt interrupt CR=0xfcf0d100c0000000
PSI: PSI Reported Error
PSI: PSI Link Inactive Transition
PSI: SEMR set to fff0fff00000
PSI[0x000]: Disabling link!
PSI: PSIHB_CR (error bits) set to 68f0510040000000
PSI: starting link polling
PSI: Spurious interrupt, attempting clear
PSI[0x001]: Poll CR=0x00f0100000000000
PSI[0x000]: Poll CR=0x68f0100040000000
PSI[0x001]: Poll CR=0x00f0100000000000
PSI[0x000]: Poll CR=0x68f0100040000000
...
And we recover:
PSI[0x000]: Poll CR=0x68f0100040000000
PSI[0x001]: Poll CR=0xccf0300000000000
PSI[0x001]: Found active link!
PSI: stopping link polling
FSP: Connected to FSP-A
FSP #0: DISR stat change = 0x000000a9
FSP #0: DISR stat change = 0x00000281
FSP #0: Detected R&R complete, acking
FSP #0: HDES stat change = 0x00000000
FSP #0: DISR stat change = 0x00000081
FSP: FSP assuming new role
FSP: SP says Reset/Reload complete
Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r-- | hw/fsp/fsp.c | 55 | ||||
-rw-r--r-- | include/fsp.h | 1 |
2 files changed, 43 insertions, 13 deletions
diff --git a/hw/fsp/fsp.c b/hw/fsp/fsp.c index f5dcbf34..ffa408b9 100644 --- a/hw/fsp/fsp.c +++ b/hw/fsp/fsp.c @@ -372,9 +372,9 @@ static bool fsp_in_hir(struct fsp *fsp) static bool fsp_in_reset(struct fsp *fsp) { switch (fsp->state) { - case fsp_mbx_hir_seq_done: /* Will be reset soon */ + case fsp_mbx_hir_seq_done: /* Link pulled down */ case fsp_mbx_err: /* Will be reset soon */ - case fsp_mbx_rr: /* Already in reset */ + case fsp_mbx_rr: /* Mbx activity stopped pending reset */ return true; default: return false; @@ -506,6 +506,10 @@ void fsp_trigger_reset(void) unlock(&fsp_lock); } +/* + * Called when we trigger a HIR or when the FSP tells us via the DISR's + * RR bit that one is impending. We should therefore stop all mbox activity. + */ static void fsp_start_rr(struct fsp *fsp) { struct fsp_iopath *iop; @@ -533,6 +537,12 @@ static void fsp_start_rr(struct fsp *fsp) unlock(&fsp_lock); fsp_notify_rr_state(FSP_RESET_START); lock(&fsp_lock); + + /* + * Unlike earlier, we don't trigger the PSI link polling + * from this point. We wait for the PSI interrupt to tell + * us the FSP is really down and then start the polling there. + */ } static void fsp_trace_event(struct fsp *fsp, u32 evt, @@ -604,12 +614,25 @@ static void fsp_handle_errors(struct fsp *fsp) disr_last_print = disr; } + /* On a deferred mbox error, trigger a HIR + * Note: We may never get here since the link inactive case is handled + * above and the other case is when the iop->psi is NULL, which is + * quite rare. + */ + if (fsp->state == fsp_mbx_err) { + prerror("FSP #%d: Triggering HIR on mbx_err\n", + fsp->index); + fsp_trigger_reset(); + return; + } + /* - * We detect FSP_IN_RR in DSISR or we have a deferred mbox - * error, we trigger an R&R after a bit of housekeeping to - * limit the chance of a stray interrupt + * If we get here as part of normal flow, the FSP is telling + * us that there will be an impending R&R, so we stop all mbox + * activity. 
The actual link down trigger is via a PSI + * interrupt that may arrive in due course. */ - if ((disr & FSP_DISR_FSP_IN_RR) || (fsp->state == fsp_mbx_err)) { + if (disr & FSP_DISR_FSP_IN_RR) { /* * If we get here with DEBUG_IN_PROGRESS also set, the * FSP is in debug and we should *not* reset it now @@ -624,9 +647,19 @@ static void fsp_handle_errors(struct fsp *fsp) if (fsp->state == fsp_mbx_rr) return; + printf("FSP #%d: FSP in Reset. Waiting for PSI interrupt\n", + fsp->index); + fsp_start_rr(fsp); + } + + /* + * However, if the Unit Check is also set, the FSP is asking us + * to trigger a HIR so it can try to recover via the DRCR route. + */ + if (disr & FSP_DISR_FSP_UNIT_CHECK) { fsp_trace_event(fsp, TRACE_FSP_EVT_SOFT_RR, disr, 0, 0, 0); - printf("FSP #%d: FSP in reset or delayed error, starting R&R\n", + printf("FSP #%d: DISR's unit check set, starting HIR\n", fsp->index); /* Clear all interrupt conditions */ @@ -635,11 +668,7 @@ static void fsp_handle_errors(struct fsp *fsp) /* Make sure this happened */ fsp_rreg(fsp, FSP_HDIR_REG); - /* Bring the PSI link down */ - psi_disable_link(psi); - - /* Start R&R process */ - fsp_start_rr(fsp); + fsp_trigger_reset(); return; } @@ -1396,7 +1425,7 @@ static void __fsp_poll(bool interrupt) again: if (fsp->active_iopath < 0) { /* That should never happen */ - if (interrupt) + if (interrupt && (fsp->state != fsp_mbx_rr)) prerror("FSP: Interrupt with no working IO path\n"); return; } diff --git a/include/fsp.h b/include/fsp.h index ec25384d..b48f4e1c 100644 --- a/include/fsp.h +++ b/include/fsp.h @@ -104,6 +104,7 @@ #define FSP_DISR_CLR_REG 0x0C /* Bit masks for DISR */ +#define FSP_DISR_FSP_UNIT_CHECK PPC_BIT32(16) #define FSP_DISR_FSP_RR_COMPLETE PPC_BIT32(22) #define FSP_DISR_RUNTIME_STATE_SYNCD PPC_BIT32(24) #define FSP_DISR_DBG_IN_PROGRESS PPC_BIT32(25) |