From 41a25f11016a7847565e323f42615e460354afa4 Mon Sep 17 00:00:00 2001 From: Caleb Palmer Date: Mon, 23 Apr 2018 08:55:42 -0500 Subject: PRD: Resume maint cmd support for MBA Change-Id: I77b56983eba633104f8b15d6b608cb490c5be48d RTC: 191647 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57918 Tested-by: Jenkins Server Reviewed-by: Brian J. Stegmiller Reviewed-by: Benjamin J. Weisenbeck Reviewed-by: Matt Derksen Reviewed-by: Zane C. Shelley Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59013 Tested-by: Jenkins OP Build CI Tested-by: FSP CI Jenkins Tested-by: Jenkins OP HW --- src/usr/diag/prdf/common/plat/mem/prdfMemAddress.C | 11 ++ src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H | 12 +- src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C | 140 ++++++++++++++++++ src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.H | 9 ++ src/usr/diag/prdf/plat/prdfPlatServices_rt.C | 158 ++++++++++++++++++--- 5 files changed, 309 insertions(+), 21 deletions(-) (limited to 'src/usr/diag/prdf') diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.C b/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.C index 5893d6dc9..de44ed87e 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.C @@ -103,6 +103,17 @@ MemAddr MemAddr::fromMaintAddr( uint64_t i_addr ) return MemAddr( MemRank(mrnk, srnk), bnk, row, col ); } +template<> +uint64_t MemAddr::toMaintAddr() const +{ + return ( ((uint64_t) iv_rnk.getMaster() << 60) | + ((uint64_t) iv_rnk.getSlave() << 57) | + ((uint64_t) iv_bnk << 53) | + ((uint64_t)(iv_row & 0x1ffff) << 36) | // r16-r0 + ((uint64_t) iv_col << 24) | + ((uint64_t)(iv_row & 0x20000) << 13) ); // r17 +} + //------------------------------------------------------------------------------ // Address Accessor Functions //------------------------------------------------------------------------------ diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H b/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H index 4b1fc07fb..b0b86af04 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemAddress.H @@ -87,12 +87,22 @@ class MemAddr static MemAddr fromReadAddr( uint64_t i_addr ); /** - * @brief Creates a MemAddr from a maintenance address. + * @brief Creates a MemAddr from the current maintenance address. * @param i_addr 64-bit address. */ template static MemAddr fromMaintAddr( uint64_t i_addr ); + /** + * @brief Converts internal data structure to a maintenance address. + * @return A uint64_t version of the address. + * @note Does not include error type. This is because in most cases we + * will use this function to write out to hardware and in doing so + * we will want to clear the status bits anyway. + */ + template + uint64_t toMaintAddr() const; + /** @return This address's rank. */ const MemRank& getRank() const { return iv_rnk; } diff --git a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C index 5b7e72c03..43fd84545 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.C @@ -385,6 +385,146 @@ uint32_t checkEccFirs( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ +template<> +uint32_t conditionallyClearEccCounters( ExtensibleChip * i_chip ) +{ + #define PRDF_FUNC "[conditionallyClearEccCounters] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + do + { + // Check for maintenance ECC errors. + uint32_t eccAttns = 0; + o_rc = checkEccFirs( i_chip, eccAttns ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "checkEccFirs(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + ExtensibleChip * membChip = getConnectedParent( i_chip, TYPE_MEMBUF ); + uint8_t mbaPos = i_chip->getPos(); + + const char * ec0Reg_str = (0 == mbaPos) ? "MBA0_MBSEC0" : "MBA1_MBSEC0"; + SCAN_COMM_REGISTER_CLASS * ec0Reg = membChip->getRegister( ec0Reg_str ); + o_rc = ec0Reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s", ec0Reg_str ); + break; + } + + const char * mbstr_str = (0 == mbaPos) ? "MBSTR_0" : "MBSTR_1"; + SCAN_COMM_REGISTER_CLASS * mbstr = membChip->getRegister( mbstr_str ); + o_rc = mbstr->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s", mbstr_str ); + break; + } + + bool updateEc0 = false; + bool clearSymCntrs = false; + + if ( eccAttns & MAINT_SOFT_NCE_ETE ) + { + // Clear Soft CE total count. + ec0Reg->SetBitFieldJustified( 0, 12, 0 ); + updateEc0 = true; + + if ( mbstr->IsBitSet(55) ) clearSymCntrs = true; + } + + if ( eccAttns & MAINT_INT_NCE_ETE ) + { + // Clear Intermittent CE total count. + ec0Reg->SetBitFieldJustified( 12, 12, 0 ); + updateEc0 = true; + + if ( mbstr->IsBitSet(56) ) clearSymCntrs = true; + } + + if ( eccAttns & MAINT_HARD_NCE_ETE ) + { + // Clear the hard CE total count. + ec0Reg->SetBitFieldJustified( 24, 12, 0 ); + updateEc0 = true; + + if ( mbstr->IsBitSet(57) ) clearSymCntrs = true; + } + + if ( updateEc0 ) + { + o_rc = ec0Reg->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on %s", ec0Reg_str ); + break; + } + } + + if ( clearSymCntrs ) + { + // Clear all of the per symbol counters. Note that there are a total + // of 9 MBSSYMECx registers (MBSSYMEC0-MBSSYMEC8) per MBA. + for ( uint8_t i = 0; i < 9; i++ ) + { + char reg_str[20]; + snprintf( reg_str, 20, "MBA%d_MBSSYMEC%d", mbaPos, i ); + + SCAN_COMM_REGISTER_CLASS * reg = membChip->getRegister(reg_str); + + reg->clearAllBits(); + + o_rc = reg->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on %s", reg_str ); + break; + } + } + if ( SUCCESS != o_rc ) break; + } + + if ( eccAttns & MAINT_RCE_ETE ) + { + // Clear only the RCE total count. + const char * ec1Reg_str = + (0 == mbaPos) ? "MBA0_MBSEC1" : "MBA1_MBSEC1"; + SCAN_COMM_REGISTER_CLASS * ec1Reg = + membChip->getRegister( ec1Reg_str ); + + o_rc = ec1Reg->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on %s", ec1Reg_str ); + break; + } + + ec1Reg->SetBitFieldJustified( 0, 12, 0 ); + + o_rc = ec1Reg->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on %s", ec1Reg_str ); + break; + } + } + + } while(0); + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + template<> uint32_t setBgScrubThresholds( ExtensibleChip * i_chip, const MemRank & i_rank ) diff --git a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.H b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.H index 72b52cbcc..bfa4e6a87 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.H +++ b/src/usr/diag/prdf/plat/mem/prdfMemScrubUtils.H @@ -65,6 +65,15 @@ uint32_t clearCmdCompleteAttn( ExtensibleChip * i_chip ); template uint32_t clearEccCounters( ExtensibleChip * i_chip ); +/** + * @brief Calls checkEccFirs() and clears the maintenance ECC counters based on + * the active error types. + * @param i_chip MBA. + * @return Non-SUCCESS on SCOM failures, SUCCESS otherwise. + */ +template +uint32_t conditionallyClearEccCounters( ExtensibleChip * i_chip ); + /** * @brief Clears the maintenance ECC FIRs. * @param i_chip MBA, MCA, or MCBIST. diff --git a/src/usr/diag/prdf/plat/prdfPlatServices_rt.C b/src/usr/diag/prdf/plat/prdfPlatServices_rt.C index 52ca3ef46..d0ad9b2f6 100644 --- a/src/usr/diag/prdf/plat/prdfPlatServices_rt.C +++ b/src/usr/diag/prdf/plat/prdfPlatServices_rt.C @@ -34,6 +34,7 @@ // Framework includes #include #include +#include // Platform includes #include @@ -215,29 +216,28 @@ uint32_t stopBgScrub( ExtensibleChip * i_chip ) PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); - uint32_t rc = SUCCESS; + uint32_t o_rc = SUCCESS; + + fapi2::Target fapiTrgt ( i_chip->getTrgt() ); + errlHndl_t errl = nullptr; - PRDF_ERR( PRDF_FUNC "function not implemented yet" ); -/* TODO RTC 157888 // It is safe to create a dummy command object because runtime commands do // not store anything for cleanupCmd() and the stopCmd() function is generic // for all command types. Also, since we are only stopping the command, all // of the parameters for the command object are junk except for the target. - ecmdDataBufferBase i_startAddr, i_endAddr; - mss_TimeBaseScrub cmd { getFapiTarget(i_trgt), i_startAddr, i_endAddr, + fapi2::buffer startAddr, endAddr; + mss_TimeBaseScrub cmd { fapiTrgt, startAddr, endAddr, mss_MaintCmd::FAST_MAX_BW_IMPACT, 0, false }; - - errlHndl_t errl = fapi::fapiRcToErrl( cmd.stopCmd() ); + FAPI_INVOKE_HWP( errl, cmd.stopCmd ); if ( nullptr != errl ) { PRDF_ERR( PRDF_FUNC "mss_TimeBaseScrub::stop(0x%08x) failed", - getHuid(i_trgt) ); + i_chip->getHuid() ); PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); - rc = FAIL; + o_rc = FAIL; } -*/ - return rc; + return o_rc; #undef PRDF_FUNC } @@ -262,6 +262,14 @@ uint32_t __resumeScrub( ExtensibleChip * i_chip, uint32_t o_rc = SUCCESS; + // Make sure there is a command complete attention when the command stops. + i_stopCond |= mss_MaintCmd::ENABLE_CMD_COMPLETE_ATTENTION; + + // Make sure the command stops immediately on error or on the end address if + // there are no errors. + i_stopCond |= mss_MaintCmd::STOP_IMMEDIATE; + i_stopCond |= mss_MaintCmd::STOP_ON_END_ADDRESS; + if ( getMbaDataBundle(i_chip)->iv_scrubResumeCounter.atTh() ) { // We have resumed scrubbing on this rank too many times. We still want @@ -277,11 +285,108 @@ uint32_t __resumeScrub( ExtensibleChip * i_chip, i_stopCond &= ~mss_MaintCmd::STOP_ON_UE; } + fapi2::Target fapiTrgt ( i_chip->getTrgt() ); + errlHndl_t errl = nullptr; + do { - // TODO: Clear ECC counters/FIRs. Increment the current address. Clear - // FIRs again. Start the command from the current address to the - // end of the rank. + // Manually clear the CE counters based on the error type and clear the + // maintenance FIRs. Note that we only want to clear counters that are + // at attention to allow the other CE types the opportunity to reach + // threshold, if possible. + o_rc = conditionallyClearEccCounters( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "conditionallyClearEccCounters(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + o_rc = clearEccFirs( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearEccFirs(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + o_rc = clearCmdCompleteAttn( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearCmdCompleteAttn(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + // Increment the current maintenance address. + mss_IncrementAddress incCmd { fapiTrgt }; + FAPI_INVOKE_HWP( errl, incCmd.setupAndExecuteCmd ); + if ( nullptr != errl ) + { + PRDF_ERR( PRDF_FUNC "mss_IncrementAddress setupAndExecuteCmd() on " + "0x%08x failed", i_chip->getHuid() ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; + break; + } + + // Clear the maintenance FIRs again. This time do not clear the CE + // counters. + o_rc = clearEccFirs( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearEccFirs(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + o_rc = clearCmdCompleteAttn( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearCmdCompleteAttn(0x%08x) failed", + i_chip->getHuid() ); + break; + } + + // The address register has been updated so we need to clear our cache + // to ensure we can do a new read. + SCAN_COMM_REGISTER_CLASS * reg = i_chip->getRegister( "MBMACA" ); + RegDataCache::getCachedRegisters().flush( i_chip, reg ); + + // Read the new start address from hardware. + MemAddr addr; + o_rc = getMemMaintAddr( i_chip, addr ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", + i_chip->getHuid() ); + break; + } + fapi2::buffer saddr = addr.toMaintAddr(); + + // Get the end address of the current rank. + fapi2::buffer eaddr, junk; + MemRank rank = addr.getRank(); + o_rc = getMemAddrRange( i_chip, rank, junk, eaddr, + i_rangeType ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getMemAddrRange(0x%08x,0x%2x) failed", + i_chip->getHuid(), rank.getKey() ); + break; + } + + // Resume the scrub command. + mss_TimeBaseScrub scrubCmd { fapiTrgt, saddr, eaddr, i_cmdSpeed, + i_stopCond, false }; + FAPI_INVOKE_HWP( errl, scrubCmd.setupAndExecuteCmd ); + if ( nullptr != errl ) + { + PRDF_ERR( PRDF_FUNC "setupAndExecuteCmd() on 0x%08x,0x%02x failed", + i_chip->getHuid(), rank.getKey() ); + PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT ); + o_rc = FAIL; break; + } // Resume successful. So increment the resume counter. getMbaDataBundle(i_chip)->iv_scrubResumeCounter.inc(); @@ -301,10 +406,22 @@ uint32_t resumeBgScrub( ExtensibleChip * i_chip ) PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); - /* TODO: + uint32_t stopCond = mss_MaintCmd::STOP_ON_HARD_NCE_ETE | + mss_MaintCmd::STOP_ON_INT_NCE_ETE | + mss_MaintCmd::STOP_ON_SOFT_NCE_ETE | + mss_MaintCmd::STOP_ON_RETRY_CE_ETE | + mss_MaintCmd::STOP_ON_MPE | + mss_MaintCmd::STOP_ON_UE; + + mss_MaintCmd::TimeBaseSpeed cmdSpeed = enableFastBgScrub() + ? mss_MaintCmd::FAST_MED_BW_IMPACT + : mss_MaintCmd::BG_SCRUB; + + // Because of the Centaur workarounds, we have to limit the number of times + // a command has been resumed on a rank. Therefore, we must always resume + // the command to the end of the current slave rank. + return __resumeScrub( i_chip, SLAVE_RANK, stopCond, cmdSpeed ); - */ - return SUCCESS; } //------------------------------------------------------------------------------ @@ -317,10 +434,11 @@ uint32_t resumeTdScrub( ExtensibleChip * i_chip, PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MBA == i_chip->getType() ); - /* TODO: + mss_MaintCmd::TimeBaseSpeed cmdSpeed = enableFastBgScrub() + ? mss_MaintCmd::FAST_MAX_BW_IMPACT + : mss_MaintCmd::FAST_MIN_BW_IMPACT; + return __resumeScrub( i_chip, i_rangeType, i_stopCond, cmdSpeed ); - */ - return SUCCESS; } //############################################################################## -- cgit v1.2.1