diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2018-08-02 11:24:25 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2018-08-06 10:03:37 -0500 |
commit | d37ee6f5a97b469eec786d79df3612803f87b225 (patch) | |
tree | c1e997363dbbd3da4c758ac94c39f992a83eec40 /src/usr/diag | |
parent | c17bbad98d89aa354ac52511751c1327e11603bd (diff) | |
download | talos-hostboot-d37ee6f5a97b469eec786d79df3612803f87b225.tar.gz talos-hostboot-d37ee6f5a97b469eec786d79df3612803f87b225.zip |
PRD: getScom() retry for HBRT channel failures
Change-Id: If643b696fc834685da1fc8124b02d90507d5de89
CQ: SW439917
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/63795
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/63885
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr/diag')
-rw-r--r-- | src/usr/diag/prdf/plat/prdfPlatServices.C | 56 |
1 file changed, 43 insertions, 13 deletions
diff --git a/src/usr/diag/prdf/plat/prdfPlatServices.C b/src/usr/diag/prdf/plat/prdfPlatServices.C index c4ddc4a07..ad742e30d 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices.C +++ b/src/usr/diag/prdf/plat/prdfPlatServices.C @@ -51,6 +51,7 @@ #include <devicefw/userif.H> #include <iipMopRegisterAccess.h> #include <ibscomreasoncodes.H> +#include <scom/scomreasoncodes.H> #include <p9_proc_gettracearray.H> #include <fapi2_spd_access.H> #include <p9c_mss_maint_cmds.H> @@ -99,27 +100,56 @@ bool isSpConfigFsp() uint32_t getScom(TARGETING::TargetHandle_t i_target, BitString& io_bs, uint64_t i_address) { - errlHndl_t errl = NULL; + errlHndl_t errl = nullptr; uint32_t rc = SUCCESS; size_t bsize = (io_bs.getBitLen()+7)/8; CPU_WORD* buffer = io_bs.getBufAddr(); errl = deviceRead(i_target, buffer, bsize, DEVICE_SCOM_ADDRESS(i_address)); - if(( NULL != errl ) && ( IBSCOM::IBSCOM_BUS_FAILURE == errl->reasonCode() )) + if ( nullptr != errl ) { - PRDF_SET_ERRL_SEV(errl, ERRL_SEV_INFORMATIONAL); - PRDF_COMMIT_ERRL(errl, ERRL_ACTION_HIDDEN); - PRDF_INF( "Register access failed with reason code IBSCOM_BUS_FAILURE." - " Trying again, Target HUID:0x%08X Register 0x%016X Op:%u", - PlatServices::getHuid( i_target), i_address, - MopRegisterAccess::READ ); - - errl = deviceRead(i_target, buffer, bsize, - DEVICE_SCOM_ADDRESS(i_address)); + bool doRetry = false; + + #ifdef __HOSTBOOT_RUNTIME + + // We don't have a good mechanism at this time to determine if the SCOM + // failed because of a channel failure. So we will just assume any SCOM + // error on the Centaur means there is a channel failure and that we + // will need to retry. + if ( SCOM::SCOM_RUNTIME_HYP_ERR == errl->reasonCode() && + ( (TYPE_MEMBUF == getTargetType(i_target)) || + (TYPE_MBA == getTargetType(i_target)) ) ) + { + doRetry = true; + } + + #else + + // An inband SCOM failure likely means the memory channel has failed. + // Hostboot will have switched over to FSI SCOMs. So retry. 
+ if ( IBSCOM::IBSCOM_BUS_FAILURE == errl->reasonCode() ) + { + doRetry = true; + } + + #endif + + if ( doRetry ) + { + PRDF_INF( "deviceRead(0x%08x,0x%016x) failed with reason code " + "0x%04x, retrying...", PlatServices::getHuid(i_target), + i_address, errl->reasonCode() ); + + PRDF_SET_ERRL_SEV( errl, ERRL_SEV_INFORMATIONAL ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_HIDDEN ); + + errl = deviceRead( i_target, buffer, bsize, + DEVICE_SCOM_ADDRESS(i_address) ); + } } - if( NULL != errl ) + if ( nullptr != errl ) { PRDF_ERR( "getScom() failed on i_target=0x%08x i_address=0x%016llx", getHuid(i_target), i_address ); @@ -137,7 +167,7 @@ uint32_t getScom(TARGETING::TargetHandle_t i_target, BitString& io_bs, else { delete errl; - errl = NULL; + errl = nullptr; } } |