From 80f8bb319679426ca7707a626c8fcf7654836746 Mon Sep 17 00:00:00 2001 From: Zane Shelley Date: Fri, 9 Feb 2018 16:45:24 -0600 Subject: PRD: VPD not getting cleared when all repairs used up during memdiag Change-Id: I5eb3fd052f0bbd3958379a6020592d8ea272d7e4 CQ: SW416691 Backport: release-fips910 Backport: release-op910 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/53797 Reviewed-by: Caleb N. Palmer Tested-by: Jenkins Server Reviewed-by: Benjamin J. Weisenbeck Reviewed-by: Brian J. Stegmiller Reviewed-by: Zane C. Shelley Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/53888 Tested-by: FSP CI Jenkins Tested-by: Jenkins OP Build CI --- src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C | 33 ++------ .../diag/prdf/plat/mem/prdfRestoreDramRepairs.C | 92 +++++++++++++++------- 2 files changed, 69 insertions(+), 56 deletions(-) (limited to 'src/usr/diag/prdf/plat/mem') diff --git a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C index 16449039a..d6d0387f5 100644 --- a/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C +++ b/src/usr/diag/prdf/plat/mem/prdfMemDynDealloc.C @@ -627,14 +627,7 @@ int32_t dimmSlct( TargetHandle_t i_dimm ) uint64_t largestAddr = 0; MemAddr startAddr, endAddr; std::vector masterRanks; - uint8_t dimmSlct = 0; - - o_rc = getDimmSlct( i_dimm, dimmSlct ); - if ( SUCCESS != o_rc ) - { - PRDF_ERR( PRDF_FUNC "getDimmSlct failed" ); - break; - } + uint8_t dimmSlct = getDimmSlct( i_dimm ); getMasterRanks( tgt, masterRanks, dimmSlct ); @@ -690,17 +683,9 @@ bool isDimmPair( TargetHandle_t i_dimm1, TargetHandle_t i_dimm2 ) bool isDimmPair = false; do { - uint8_t dimm1Slct = 0; - uint8_t dimm2Slct = 0; - - int32_t rc = getDimmSlct( i_dimm1, dimm1Slct ); - rc |= getDimmSlct( i_dimm2, dimm2Slct ); + uint8_t dimm1Slct = getDimmSlct( i_dimm1 ); + uint8_t dimm2Slct = getDimmSlct( i_dimm2 ); - if( SUCCESS != rc ) - { - PRDF_ERR( PRDF_FUNC " getDimmSlct() failed" ); - break; - } isDimmPair = ( ( dimm1Slct == dimm2Slct ) && ( getConnectedParent( i_dimm1, T ) == getConnectedParent( i_dimm2, T ))); @@ -717,17 +702,9 @@ bool compareDimms( TargetHandle_t i_dimm1, TargetHandle_t i_dimm2 ) bool isSmall = false; do { - uint8_t dimm1Slct = 0; - uint8_t dimm2Slct = 0; - - int32_t rc = getDimmSlct( i_dimm1, dimm1Slct ); - rc |= getDimmSlct( i_dimm2, dimm2Slct ); + uint8_t dimm1Slct = getDimmSlct( i_dimm1 ); + uint8_t dimm2Slct = getDimmSlct( i_dimm2 ); - if( SUCCESS != rc ) - { - PRDF_ERR( PRDF_FUNC " getDimmSlct() failed" ); - break; - } TargetHandle_t tgt1 = getConnectedParent( i_dimm1, T ); TargetHandle_t tgt2 = getConnectedParent( i_dimm2, T ); diff --git a/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C b/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C index fbac10c7a..3c34320b3 100644 --- a/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C +++ b/src/usr/diag/prdf/plat/mem/prdfRestoreDramRepairs.C @@ -93,6 +93,52 @@ void commitErrl( errlHndl_t i_errl, TargetHandle_t i_trgt ) //------------------------------------------------------------------------------ +template +void __calloutDimm( errlHndl_t & io_errl, TargetHandle_t i_portTrgt, + TargetHandle_t i_dimmTrgt ) +{ + #define PRDF_FUNC "[RDR::__calloutDimm] " + + PRDF_ASSERT( nullptr != i_portTrgt ); + PRDF_ASSERT( T == getTargetType(i_portTrgt) ); + + PRDF_ASSERT( nullptr != i_dimmTrgt ); + PRDF_ASSERT( TYPE_DIMM == getTargetType(i_dimmTrgt) ); + + // Callout the DIMM. + io_errl->addHwCallout( i_dimmTrgt, HWAS::SRCI_PRIORITY_HIGH, + HWAS::DELAYED_DECONFIG, HWAS::GARD_Predictive ); + + // Clear the VPD on this DIMM. The DIMM has been garded, but it is possible + // the customer will want to ungard the DIMM. Without clearing the VPD, the + // DIMM will be permanently garded because the customer has no ability to + // clear the VPD. Therefore, we will clear the VPD on this DIMM. If the + // customer takes the risk of ungarding the DIMM (that they should replace), + // the repairs will need to be rediscovered. + + std::vector ranks; + getMasterRanks( i_portTrgt, ranks, getDimmSlct(i_dimmTrgt) ); + + uint8_t data[D][DQ_BITMAP::BITMAP_SIZE]; + memset( data, 0x00, sizeof(data) ); + + for ( auto & rank : ranks ) + { + MemDqBitmap dqBitmap { i_portTrgt, rank, data }; + + if ( SUCCESS != setBadDqBitmap(i_portTrgt, rank, dqBitmap) ) + { + PRDF_ERR( PRDF_FUNC "setBadDqBitmap<%d>(0x%08x,0x%02x) failed", + D, getHuid(i_portTrgt), rank.getKey() ); + continue; + } + } + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + // If there were analysis errors, will create and commit an error log with 2nd // level support callout. template @@ -139,6 +185,11 @@ bool processRepairedRanks( TargetHandle_t i_trgt, } } + // Keep a list of DIMMs to callout. Note that we are using a map with + // the DIMM target as the key so that we can maintain a unique list. The + // map value has no significance. + std::map calloutList; + ExtensibleChip * mcaChip = (ExtensibleChip *)systemPtr->GetChip(i_trgt); for ( uint8_t r = 0; r < MASTER_RANKS_PER_PORT; ++r ) @@ -189,11 +240,9 @@ bool processRepairedRanks( TargetHandle_t i_trgt, MemoryMru mm( i_trgt, rank, sym ); // Add all parts to the error log. - for ( auto & part : mm.getCalloutList() ) + for ( auto & dimm : mm.getCalloutList() ) { - errl->addHwCallout( part, HWAS::SRCI_PRIORITY_HIGH, - HWAS::DELAYED_DECONFIG, - HWAS::GARD_Predictive ); + calloutList[dimm] = 1; } // Add the MemoryMru to the capture data. @@ -204,6 +253,13 @@ bool processRepairedRanks( TargetHandle_t i_trgt, } } + // Callout all DIMMs in the map. + for ( auto const & dimm : calloutList ) + { + __calloutDimm( errl, i_trgt, + dimm.first ); + } + // Commit the error log, if needed. commitErrl( errl, i_trgt ); @@ -357,15 +413,13 @@ bool processBadDimms( TargetHandle_t i_trgt, uint8_t i_badDimmMask ) // Iterate the list of all DIMMs TargetHandleList dimms = getConnected( i_trgt, TYPE_DIMM ); - for ( auto & i : dimms ) + for ( auto & dimm : dimms ) { - uint8_t dimm = getTargetPosition( i ) % MAX_DIMM_PER_PORT; - // i_badDimmMask is defined as a 2-bit mask where a bit set means that // DIMM had more bad bits than could be repaired. Note: the value is // actually a 4-bit field for use with Centaur, but we only use the // first 2 bits of that field here. - uint8_t mask = 0x80 >> dimm; + uint8_t mask = 0x80 >> getDimmSlct(dimm); if ( 0 != (i_badDimmMask & mask) ) { @@ -375,10 +429,9 @@ bool processBadDimms( TargetHandle_t i_trgt, uint8_t i_badDimmMask ) PRDFSIG_RdrRepairUnavail ); } + __calloutDimm( errl, i_trgt, dimm ); + o_calloutMade = true; - errl->addHwCallout( i, HWAS::SRCI_PRIORITY_HIGH, - HWAS::DELAYED_DECONFIG, - HWAS::GARD_Predictive ); } } @@ -606,24 +659,7 @@ uint32_t restoreDramRepairs( TargetHandle_t i_trgt ) // Callout DIMMs with too many bad bits and not enough repairs available if ( RDR::processBadDimms(i_trgt, dimmMask) ) - { - // Clear VPD after callout of ISDIMMs - uint8_t data[DIMMS_PER_RANK::MCA][DQ_BITMAP::BITMAP_SIZE]; - memset( data, 0x00, sizeof(data) ); - for ( auto & rank : ranks ) - { - MemDqBitmap dqBitmap( i_trgt, rank, data ); - if ( SUCCESS != setBadDqBitmap( i_trgt, - rank, dqBitmap ) ) - { - PRDF_ERR( PRDF_FUNC "setBadDqBitmap" - "(0x%08x,0x%02x) failed.", getHuid(i_trgt), - rank.getKey() ); - continue; - } - } calloutMade = true; - } // Check repaired ranks for RAS policy violations. if ( RDR::processRepairedRanks(i_trgt, rankMask) ) -- cgit v1.2.1