From 4dee8a0a654536130b4886c114ba7fe1d23bcee0 Mon Sep 17 00:00:00 2001 From: Caleb Palmer Date: Wed, 11 Jul 2018 08:29:41 -0500 Subject: PRD: Row Repair VCM Updates Change-Id: I809cdc7fc2a010c5cb421bf5410d43f9bb690fee RTC: 196073 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/63375 Tested-by: Jenkins Server Reviewed-by: Benjamin J. Weisenbeck Reviewed-by: Brian J. Stegmiller Reviewed-by: Zane C. Shelley Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/64982 Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW Tested-by: FSP CI Jenkins --- src/usr/diag/prdf/plat/mem/prdfMemVcm.C | 214 +++++++++++++++++++++++++++++--- 1 file changed, 198 insertions(+), 16 deletions(-) (limited to 'src/usr/diag/prdf/plat/mem/prdfMemVcm.C') diff --git a/src/usr/diag/prdf/plat/mem/prdfMemVcm.C b/src/usr/diag/prdf/plat/mem/prdfMemVcm.C index e0feeb362..5009b6aa6 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemVcm.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemVcm.C @@ -112,10 +112,169 @@ uint32_t VcmEvent::rowRepair( STEP_CODE_DATA_STRUCT & io_sc, { #define PRDF_FUNC "[VcmEvent::rowRepair] " + PRDF_ASSERT( iv_rowRepairEnabled ) + uint32_t o_rc = SUCCESS; do { + // get port select + uint8_t l_ps = iv_mark.getSymbol().getPortSlct(); + + // get if the spares are available + bool l_spAvail, l_eccAvail; + o_rc = PlatServices::isSpareAvailable( iv_chip->getTrgt(), + iv_rank, l_ps, l_spAvail, l_eccAvail ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "isChipMarkOnSpare(0x%08x) failed", + iv_chip->getHuid() ); + break; + } + + // get dimm + TARGETING::TargetHandle_t l_dimm = + PlatServices::getConnectedDimm( iv_chip->getTrgt(), iv_rank, + l_ps ); + + // If scrub stops on first MCE, and static row repair + // not supported or both spare and chip mark used + if ( 1 == iv_mceCount && ( !l_spAvail && !l_eccAvail ) ) + { + // Record bad DQs in VPD - done when verified() + // No need to continue scrubbing, VCM verified, VCM done. + o_done = true; + } + // Else if scrub stops on first MCE and static row repair + // supported + else if ( 1 == iv_mceCount ) + { + MemRowRepair l_rowRepair; + o_rc = getRowRepairData( l_dimm, iv_rank, l_rowRepair ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getRowRepairData(0x%08x, 0x%02x)", + PlatServices::getHuid(l_dimm), iv_rank.getKey() ); + break; + } + + // If the port, dimm, master rank has previous row repair in VPD + if ( l_rowRepair.isValid() ) + { + // If previous repair for same DRAM + if ( l_rowRepair.getRowRepairDram() == + iv_mark.getSymbol().getDram() ) + { + // Clear previous row repair from VPD + o_rc = clearRowRepairData( l_dimm, iv_rank ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "clearRowRepairData" + "(0x%08x, 0x%02x) failed", + PlatServices::getHuid(l_dimm), + iv_rank.getKey() ); + break; + } + + // Record bad DQs in VPD - done when verified() + // Signature: "VCM: verified: previous PPR on same DRAM" + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_VcmVerSameDram ); + + // No need to continue scrubbing, VCM verified, VCM done + o_done = true; + } + // Else if previous repair for different DRAM + else + { + // Leave previous row repair in VPD + // Record bad DQs in VPD - done when verified() + // Signature:"VCM: verified: previous PPR on + // different DRAM" + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_VcmVerDiffDram ); + + // No need to continue scrubbing, VCM verified, VCM done + o_done = true; + } + } + // Else if no previous row repair + else + { + // Signature: "VCM: verified: first MCE" + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_VcmVerFirstMce ); + + // Record bad DQs in VPD - done when verified() + // Remember address + MemAddr l_addr; + o_rc = getMemMaintAddr( iv_chip, + iv_rowRepairFailAddr ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", + iv_chip->getHuid() ); + break; + } + + // Continue scrub, don't set procedure to done + } + } + // Else if scrub stops on second MCE + else if ( iv_mceCount > 1 ) + { + // Since at least 2 bad rows, don't bother with row repair + // No need to continue scrubbing, VCM verified, VCM done + o_done = true; + + // Signature: "VCM: verified: second MCE" + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_VcmVerSecMce ); + } + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +template<> +uint32_t VcmEvent::rowRepairEndRank( STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[VcmEvent::rowRepairEndRank] " + + PRDF_ASSERT( !iv_canResumeScrub ); + PRDF_ASSERT( iv_rowRepairEnabled ); + PRDF_ASSERT( 0 != iv_mceCount ); + + uint32_t o_rc = SUCCESS; + + do + { + // get dimm + uint8_t l_ps = iv_mark.getSymbol().getPortSlct(); + TARGETING::TargetHandle_t l_dimm = + PlatServices::getConnectedDimm( iv_chip->getTrgt(), iv_rank, + l_ps ); + + // If scrub gets to the end of the master rank with an MCE + // Update VPD with row repair + o_rc = setRowRepairData( l_dimm, iv_rank, + iv_rowRepairFailAddr, iv_mark.getSymbol().getDram() ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "setRowRepairData(0x%08x, 0x%02x) " + "failed", PlatServices::getHuid(l_dimm), + iv_rank.getKey() ); + break; + } + + // Signature: "VCM: verified: common row fail" + io_sc.service_data->setSignature( iv_chip->getHuid(), + PRDFSIG_VcmVerRowFail ); + + // VCM verified, VCM done } while (0); @@ -196,41 +355,64 @@ uint32_t VcmEvent::handlePhaseComplete( const uint32_t & i_eccAttns, { if ( i_eccAttns & MAINT_MCE ) { - if ( iv_rowRepairEnabled ) + iv_mceCount++; + + // Only need to call verified on the first mce we hit + if ( 1 == iv_mceCount ) { - o_rc = rowRepair( io_sc, o_done ); + o_rc = verified( io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "rowRepair() failed on 0x%08x", + PRDF_ERR( PRDF_FUNC "verified() failed on 0x%08x", iv_chip->getHuid() ); break; } } - else + + if ( iv_rowRepairEnabled ) { - o_rc = verified( io_sc ); + o_rc = rowRepair( io_sc, o_done ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "verified() failed on 0x%08x", + PRDF_ERR( PRDF_FUNC "rowRepair() failed on 0x%08x", iv_chip->getHuid() ); break; } - + if ( o_done ) break; + } + else + { o_done = true; // Procedure is complete. + break; } } - else if ( !iv_canResumeScrub ) + + if ( !iv_canResumeScrub ) { - // The chip mark is not verified and the command has reached the - // end of the rank. So this is a false alarm. - o_rc = falseAlarm( io_sc ); - if ( SUCCESS != o_rc ) + // If row repair is enabled, we reached the end of the rank, and + // we got an MCE, we need to apply the row repair. + if ( iv_rowRepairEnabled && 0 != iv_mceCount ) { - PRDF_ERR( PRDF_FUNC "falseAlarm() failed on 0x%08x", - iv_chip->getHuid() ); - break; + o_rc = rowRepairEndRank( io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "rowRepairEndRank() failed on " + "0x%08x", iv_chip->getHuid() ); + break; + } + } + else + { + // The chip mark is not verified and the command has reached + // the end of the rank. So this is a false alarm. + o_rc = falseAlarm( io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "falseAlarm() failed on 0x%08x", + iv_chip->getHuid() ); + break; + } } - o_done = true; // Procedure is complete. } } -- cgit v1.2.1