diff options
author | Caleb Palmer <cnpalmer@us.ibm.com> | 2018-04-23 08:55:42 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2018-05-18 10:39:40 -0400 |
commit | 41a25f11016a7847565e323f42615e460354afa4 (patch) | |
tree | c83727db6669e6f9a52bec739db452a90bcf6165 /src/usr | |
parent | 1b04e458595a9e9c5c04dd322f90d4c44129e111 (diff) | |
download | talos-hostboot-41a25f11016a7847565e323f42615e460354afa4.tar.gz talos-hostboot-41a25f11016a7847565e323f42615e460354afa4.zip |
PRD: Resume maint cmd support for MBA
Change-Id: I77b56983eba633104f8b15d6b608cb490c5be48d
RTC: 191647
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57918
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59013
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr')
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemAddress.C | 11 | ||||
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H | 12 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C | 140 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.H | 9 | ||||
-rw-r--r-- | src/usr/diag/prdf/plat/prdfPlatServices_rt.C | 158 |
5 files changed, 309 insertions, 21 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.C b/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.C index 5893d6dc9..de44ed87e 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.C @@ -103,6 +103,17 @@ MemAddr MemAddr::fromMaintAddr<TYPE_MBA>( uint64_t i_addr ) return MemAddr( MemRank(mrnk, srnk), bnk, row, col ); } +template<> +uint64_t MemAddr::toMaintAddr<TYPE_MBA>() const +{ + return ( ((uint64_t) iv_rnk.getMaster() << 60) | + ((uint64_t) iv_rnk.getSlave() << 57) | + ((uint64_t) iv_bnk << 53) | + ((uint64_t)(iv_row & 0x1ffff) << 36) | // r16-r0 + ((uint64_t) iv_col << 24) | + ((uint64_t)(iv_row & 0x20000) << 13) ); // r17 +} + //------------------------------------------------------------------------------ // Address Accessor Functions //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H b/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H index 4b1fc07fb..b0b86af04 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H @@ -87,12 +87,22 @@ class MemAddr static MemAddr fromReadAddr( uint64_t i_addr ); /** - * @brief Creates a MemAddr from a maintenance address. + * @brief Creates a MemAddr from the current maintenance address. * @param i_addr 64-bit address. */ template<TARGETING::TYPE T> static MemAddr fromMaintAddr( uint64_t i_addr ); + /** + * @brief Converts internal data structure to a maintenance address. + * @return A uint64_t version of the address. + * @note Does not include error type. This is because in most cases we + * will use this function to write out to hardware and in doing so + * we will want to clear the status bits anyway. + */ + template<TARGETING::TYPE T> + uint64_t toMaintAddr() const; + /** @return This address's rank. */ const MemRank& getRank() const { return iv_rnk; } diff --git a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C index 5b7e72c03..43fd84545 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C @@ -386,6 +386,146 @@ uint32_t checkEccFirs<TYPE_MBA>( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ template<> +uint32_t conditionallyClearEccCounters<TYPE_MBA>( ExtensibleChip * i_chip ) +{ + #define PRDF_FUNC "[conditionallyClearEccCounters] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + do + { + // Check for maintenance ECC errors. + uint32_t eccAttns = 0; + o_rc = checkEccFirs<TYPE_MBA>( i_chip, eccAttns ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "checkEccFirs<TYPE_MBA>(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + ExtensibleChip * membChip = getConnectedParent( i_chip, TYPE_MEMBUF ); + uint8_t mbaPos = i_chip->getPos(); + + const char * ec0Reg_str = (0 == mbaPos) ? "MBA0_MBSEC0" : "MBA1_MBSEC0"; + SCAN_COMM_REGISTER_CLASS * ec0Reg = membChip->getRegister( ec0Reg_str ); + o_rc = ec0Reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s", ec0Reg_str ); + break; + } + + const char * mbstr_str = (0 == mbaPos) ? "MBSTR_0" : "MBSTR_1"; + SCAN_COMM_REGISTER_CLASS * mbstr = membChip->getRegister( mbstr_str ); + o_rc = mbstr->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s", mbstr_str ); + break; + } + + bool updateEc0 = false; + bool clearSymCntrs = false; + + if ( eccAttns & MAINT_SOFT_NCE_ETE ) + { + // Clear Soft CE total count. + ec0Reg->SetBitFieldJustified( 0, 12, 0 ); + updateEc0 = true; + + if ( mbstr->IsBitSet(55) ) clearSymCntrs = true; + } + + if ( eccAttns & MAINT_INT_NCE_ETE ) + { + // Clear Intermittent CE total count. + ec0Reg->SetBitFieldJustified( 12, 12, 0 ); + updateEc0 = true; + + if ( mbstr->IsBitSet(56) ) clearSymCntrs = true; + } + + if ( eccAttns & MAINT_HARD_NCE_ETE ) + { + // Clear the hard CE total count. + ec0Reg->SetBitFieldJustified( 24, 12, 0 ); + updateEc0 = true; + + if ( mbstr->IsBitSet(57) ) clearSymCntrs = true; + } + + if ( updateEc0 ) + { + o_rc = ec0Reg->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on %s", ec0Reg_str ); + break; + } + } + + if ( clearSymCntrs ) + { + // Clear all of the per symbol counters. Note that there are a total + // of 9 MBSSYMECx registers (MBSSYMEC0-MBSSYMEC8) per MBA. + for ( uint8_t i = 0; i < 9; i++ ) + { + char reg_str[20]; + snprintf( reg_str, 20, "MBA%d_MBSSYMEC%d", mbaPos, i ); + + SCAN_COMM_REGISTER_CLASS * reg = membChip->getRegister(reg_str); + + reg->clearAllBits(); + + o_rc = reg->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on %s", reg_str ); + break; + } + } + if ( SUCCESS != o_rc ) break; + } + + if ( eccAttns & MAINT_RCE_ETE ) + { + // Clear only the RCE total count. + const char * ec1Reg_str = + (0 == mbaPos) ? "MBA0_MBSEC1" : "MBA1_MBSEC1"; + SCAN_COMM_REGISTER_CLASS * ec1Reg = + membChip->getRegister( ec1Reg_str ); + + o_rc = ec1Reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s", ec1Reg_str ); + break; + } + + ec1Reg->SetBitFieldJustified( 0, 12, 0 ); + + o_rc = ec1Reg->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on %s", ec1Reg_str ); + break; + } + } + + } while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<> uint32_t setBgScrubThresholds<TYPE_MBA>( ExtensibleChip * i_chip, const MemRank & i_rank ) { diff --git a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.H b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.H index 72b52cbcc..bfa4e6a87 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.H @@ -66,6 +66,15 @@ template<TARGETING::TYPE T> uint32_t clearEccCounters( ExtensibleChip * i_chip ); /** + * @brief Calls checkEccFirs() and clears the maintenance ECC counters based on + * the active error types. + * @param i_chip MBA. + * @return Non-SUCCESS on SCOM failures, SUCCESS otherwise. + */ +template<TARGETING::TYPE T> +uint32_t conditionallyClearEccCounters( ExtensibleChip * i_chip ); + +/** * @brief Clears the maintenance ECC FIRs. * @param i_chip MBA, MCA, or MCBIST. * @return Non-SUCCESS on SCOM failures, SUCCESS otherwise. diff --git a/src/usr/diag/prdf/plat/prdfPlatServices_rt.C b/src/usr/diag/prdf/plat/prdfPlatServices_rt.C index 52ca3ef46..d0ad9b2f6 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices_rt.C +++ b/src/usr/diag/prdf/plat/prdfPlatServices_rt.C @@ -34,6 +34,7 @@ // Framework includes #include <prdfErrlUtil.H> #include <prdfTrace.H> +#include <prdfRegisterCache.H> // Platform includes #include <prdfCenMbaDataBundle.H> @@ -215,29 +216,28 @@ uint32_t stopBgScrub<TYPE_MBA>( ExtensibleChip * i_chip ) PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); - uint32_t rc = SUCCESS; + uint32_t o_rc = SUCCESS; + + fapi2::Target<fapi2::TARGET_TYPE_MBA> fapiTrgt ( i_chip->getTrgt() ); + errlHndl_t errl = nullptr; - PRDF_ERR( PRDF_FUNC "function not implemented yet" ); -/* TODO RTC 157888 // It is safe to create a dummy command object because runtime commands do // not store anything for cleanupCmd() and the stopCmd() function is generic // for all command types. Also, since we are only stopping the command, all // of the parameters for the command object are junk except for the target. - ecmdDataBufferBase i_startAddr, i_endAddr; - mss_TimeBaseScrub cmd { getFapiTarget(i_trgt), i_startAddr, i_endAddr, + fapi2::buffer<uint64_t> startAddr, endAddr; + mss_TimeBaseScrub cmd { fapiTrgt, startAddr, endAddr, mss_MaintCmd::FAST_MAX_BW_IMPACT, 0, false }; - - errlHndl_t errl = fapi::fapiRcToErrl( cmd.stopCmd() ); + FAPI_INVOKE_HWP( errl, cmd.stopCmd ); if ( nullptr != errl ) { PRDF_ERR( PRDF_FUNC "mss_TimeBaseScrub::stop(0x%08x) failed", - getHuid(i_trgt) ); + i_chip->getHuid() ); PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); - rc = FAIL; + o_rc = FAIL; } -*/ - return rc; + return o_rc; #undef PRDF_FUNC } @@ -262,6 +262,14 @@ uint32_t __resumeScrub<TYPE_MBA>( ExtensibleChip * i_chip, uint32_t o_rc = SUCCESS; + // Make sure there is a command complete attention when the command stops. + i_stopCond |= mss_MaintCmd::ENABLE_CMD_COMPLETE_ATTENTION; + + // Make sure the command stops immediately on error or on the end address if + // there are no errors. + i_stopCond |= mss_MaintCmd::STOP_IMMEDIATE; + i_stopCond |= mss_MaintCmd::STOP_ON_END_ADDRESS; + if ( getMbaDataBundle(i_chip)->iv_scrubResumeCounter.atTh() ) { // We have resumed scrubbing on this rank too many times. We still want @@ -277,11 +285,108 @@ uint32_t __resumeScrub<TYPE_MBA>( ExtensibleChip * i_chip, i_stopCond &= ~mss_MaintCmd::STOP_ON_UE; } + fapi2::Target<fapi2::TARGET_TYPE_MBA> fapiTrgt ( i_chip->getTrgt() ); + errlHndl_t errl = nullptr; + do { - // TODO: Clear ECC counters/FIRs. Increment the current address. Clear - // FIRs again. Start the command from the current address to the - // end of the rank. + // Manually clear the CE counters based on the error type and clear the + // maintenance FIRs. Note that we only want to clear counters that are + // at attention to allow the other CE types the opportunity to reach + // threshold, if possible. + o_rc = conditionallyClearEccCounters<TYPE_MBA>( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "conditionallyClearEccCounters(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + o_rc = clearEccFirs<TYPE_MBA>( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearEccFirs(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + o_rc = clearCmdCompleteAttn<TYPE_MBA>( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearCmdCompleteAttn(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + // Increment the current maintenance address. + mss_IncrementAddress incCmd { fapiTrgt }; + FAPI_INVOKE_HWP( errl, incCmd.setupAndExecuteCmd ); + if ( nullptr != errl ) + { + PRDF_ERR( PRDF_FUNC "mss_IncrementAddress setupAndExecuteCmd() on " + "0x%08x failed", i_chip->getHuid() ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // Clear the maintenance FIRs again. This time do not clear the CE + // counters. + o_rc = clearEccFirs<TYPE_MBA>( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearEccFirs(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + o_rc = clearCmdCompleteAttn<TYPE_MBA>( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearCmdCompleteAttn(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + // The address register has been updated so we need to clear our cache + // to ensure we can do a new read. + SCAN_COMM_REGISTER_CLASS * reg = i_chip->getRegister( "MBMACA" ); + RegDataCache::getCachedRegisters().flush( i_chip, reg ); + + // Read the new start address from hardware. + MemAddr addr; + o_rc = getMemMaintAddr<TYPE_MBA>( i_chip, addr ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", + i_chip->getHuid() ); + break; + } + fapi2::buffer<uint64_t> saddr = addr.toMaintAddr<TYPE_MBA>(); + + // Get the end address of the current rank. + fapi2::buffer<uint64_t> eaddr, junk; + MemRank rank = addr.getRank(); + o_rc = getMemAddrRange<TYPE_MBA>( i_chip, rank, junk, eaddr, + i_rangeType ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getMemAddrRange(0x%08x,0x%2x) failed", + i_chip->getHuid(), rank.getKey() ); + break; + } + + // Resume the scrub command. + mss_TimeBaseScrub scrubCmd { fapiTrgt, saddr, eaddr, i_cmdSpeed, + i_stopCond, false }; + FAPI_INVOKE_HWP( errl, scrubCmd.setupAndExecuteCmd ); + if ( nullptr != errl ) + { + PRDF_ERR( PRDF_FUNC "setupAndExecuteCmd() on 0x%08x,0x%02x failed", + i_chip->getHuid(), rank.getKey() ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; break; + } // Resume successful. So increment the resume counter. getMbaDataBundle(i_chip)->iv_scrubResumeCounter.inc(); @@ -301,10 +406,22 @@ uint32_t resumeBgScrub<TYPE_MBA>( ExtensibleChip * i_chip ) PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); - /* TODO: + uint32_t stopCond = mss_MaintCmd::STOP_ON_HARD_NCE_ETE | + mss_MaintCmd::STOP_ON_INT_NCE_ETE | + mss_MaintCmd::STOP_ON_SOFT_NCE_ETE | + mss_MaintCmd::STOP_ON_RETRY_CE_ETE | + mss_MaintCmd::STOP_ON_MPE | + mss_MaintCmd::STOP_ON_UE; + + mss_MaintCmd::TimeBaseSpeed cmdSpeed = enableFastBgScrub() + ? mss_MaintCmd::FAST_MED_BW_IMPACT + : mss_MaintCmd::BG_SCRUB; + + // Because of the Centaur workarounds, we have to limit the number of times + // a command has been resumed on a rank. Therefore, we must always resume + // the command to the end of the current slave rank. + return __resumeScrub<TYPE_MBA>( i_chip, SLAVE_RANK, stopCond, cmdSpeed ); - */ - return SUCCESS; } //------------------------------------------------------------------------------ @@ -317,10 +434,11 @@ uint32_t resumeTdScrub<TYPE_MBA>( ExtensibleChip * i_chip, PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); - /* TODO: + mss_MaintCmd::TimeBaseSpeed cmdSpeed = enableFastBgScrub() + ? mss_MaintCmd::FAST_MAX_BW_IMPACT + : mss_MaintCmd::FAST_MIN_BW_IMPACT; + return __resumeScrub<TYPE_MBA>( i_chip, i_rangeType, i_stopCond, cmdSpeed ); - */ - return SUCCESS; } //############################################################################## |