diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2018-04-25 13:06:00 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2018-04-27 21:29:59 -0400 |
commit | 53a2981cd7ad6932677237dcc4a117391487e210 (patch) | |
tree | 58b346f1aaa517e44a074f99004a929bd03da204 | |
parent | 2583834bd212e5ef15bb54bf2cfc48fb571a0007 (diff) | |
download | talos-hostboot-53a2981cd7ad6932677237dcc4a117391487e210.tar.gz talos-hostboot-53a2981cd7ad6932677237dcc4a117391487e210.zip |
PRD: should only call mssIplUeIsolation() during MemDiags
Change-Id: I2952de3d8bbaa70476e535beaffd660cdefb6438
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57833
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57886
CI-Ready: Zane C. Shelley <zshelle@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C | 93 |
1 file changed, 54 insertions, 39 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C index ec6a40127..6175a4c7c 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C @@ -152,63 +152,78 @@ uint32_t handleMemUe<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr, { #if !defined(__HOSTBOOT_RUNTIME) && defined(__HOSTBOOT_MODULE) - // At IPL time we want to try avoiding calling out both DIMMs on a - // rank if possible, so we use mssIplUeIsolation to just callout - // the dimms with bad bits instead of calling out the entire rank. At - // runtime we can't do this to preserve data integrity. + MemRank rank = i_addr.getRank(); - MbaDataBundle * mbadb = getMbaDataBundle( i_chip ); - - MemDqBitmap<DIMMS_PER_RANK::MBA> l_dqBitmap; - o_rc = mssIplUeIsolation<DIMMS_PER_RANK::MBA>( i_chip->getTrgt(), - i_addr.getRank(), l_dqBitmap ); - if ( SUCCESS != o_rc ) + if ( isInMdiaMode() ) { - PRDF_ERR( PRDF_FUNC "mssIplUeIsolation(0x%08x, 0x%02x) failed", - i_chip->getHuid(), i_addr.getRank().getKey() ); - break; - } + // During MemDiags, we want to try avoiding calling out both DIMMs + // on a rank, if possible. So we use mssIplUeIsolation() to callout + // only the DIMMs with bad bits instead of calling out the entire + // rank. We cannot call this procedure once mainline traffic is + // running because it will modify contents of memory. - // Add UE data to capture data - l_dqBitmap.getCaptureData( io_sc.service_data->GetCaptureData() ); + MbaDataBundle * mbadb = getMbaDataBundle( i_chip ); - // Add all DIMMs with bad bits to the callout list. 
- for ( uint8_t ps = 0; ps < DIMMS_PER_RANK::MBA; ps++ ) - { - bool badDqs = false; - o_rc = l_dqBitmap.badDqs( badDqs, ps ); + MemDqBitmap<DIMMS_PER_RANK::MBA> l_dqBitmap; + o_rc = mssIplUeIsolation<DIMMS_PER_RANK::MBA>( i_chip->getTrgt(), + rank, l_dqBitmap ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "badDqs(%d) failed", ps ); + PRDF_ERR( PRDF_FUNC "mssIplUeIsolation(0x%08x, 0x%02x) failed", + i_chip->getHuid(), rank.getKey() ); break; } - if ( !badDqs ) continue; + // Add UE data to capture data + l_dqBitmap.getCaptureData( io_sc.service_data->GetCaptureData() ); - TargetHandle_t l_dimm = getConnectedDimm( i_chip->getTrgt(), - i_addr.getRank(), ps ); - if ( l_dimm == nullptr ) continue; + // Add all DIMMs with bad bits to the callout list. + for ( uint8_t ps = 0; ps < DIMMS_PER_RANK::MBA; ps++ ) + { + bool badDqs = false; + o_rc = l_dqBitmap.badDqs( badDqs, ps ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "badDqs(%d) failed", ps ); + break; + } - io_sc.service_data->SetCallout( l_dimm, MRU_HIGH ); + if ( !badDqs ) continue; - if ( isMfgCeCheckingEnabled() ) - { - // As we are doing callout for UE, we dont need to do callout - // during CE for this rank on given port - mbadb->getIplCeStats()->banAnalysis( - i_addr.getRank().getDimmSlct(), ps ); + TargetHandle_t l_dimm = getConnectedDimm( i_chip->getTrgt(), + rank, ps ); + if ( l_dimm == nullptr ) continue; + + io_sc.service_data->SetCallout( l_dimm, MRU_HIGH ); + + if ( isMfgCeCheckingEnabled() ) + { + // Because this is a UE, no need to do further MNFG CE + // analysis on this rank. + mbadb->getIplCeStats()->banAnalysis(rank.getDimmSlct(), ps); + } } - } - // Make the error log predictive. - io_sc.service_data->setServiceCall(); + // Make the error log predictive. + io_sc.service_data->setServiceCall(); - // Add entry to UE table. - MemDbUtils::addUeTableEntry<TYPE_MBA>( i_chip, i_type, i_addr ); + // Add entry to UE table. 
+ MemDbUtils::addUeTableEntry<TYPE_MBA>( i_chip, i_type, i_addr ); + } + else + { + o_rc = __handleMemUe<TYPE_MBA>( i_chip, i_addr, i_type, io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__handleMemUe(0x%08x,%d) failed", + i_chip->getHuid(), i_type ); + break; + } + } #else - o_rc = __handleMemUe<TYPE_MBA>( i_chip, i_addr, i_type, io_sc ); + o_rc = __handleMemUe<TYPE_MBA>( i_chip, i_addr, i_type, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "__handleMemUe(0x%08x,%d) failed", |