diff options
| author | Zane Shelley <zshelle@us.ibm.com> | 2017-05-05 22:05:06 -0500 |
|---|---|---|
| committer | Zane C. Shelley <zshelle@us.ibm.com> | 2017-05-19 10:54:04 -0400 |
| commit | a12b4ce0769e07495726733c6d55a90358cf86bd (patch) | |
| tree | f127e5dfa0f87f2afc3508d408a79e1f1c4f544c /src/usr | |
| parent | 513e460747a3275fcbfd5deb585bfb2836f8fbc9 (diff) | |
| download | blackbird-hostboot-a12b4ce0769e07495726733c6d55a90358cf86bd.tar.gz blackbird-hostboot-a12b4ce0769e07495726733c6d55a90358cf86bd.zip | |
PRD: generic function for IUE attention handling
Change-Id: I0ed418f3934aaceee0e3949ad91af45879f9004d
RTC: 173944
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/40423
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Brian J. Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamin J. Weisenbeck <bweisenb@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/40228
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr')
| -rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C | 174 | ||||
| -rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H | 36 | ||||
| -rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfP9Mca.C | 13 | ||||
| -rw-r--r-- | src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C | 6 |
4 files changed, 127 insertions, 102 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C index 1aea86c2d..7ad37bcca 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C @@ -153,19 +153,49 @@ uint32_t handleMemUe<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr, #ifdef __HOSTBOOT_MODULE -uint32_t maskMemPort( ExtensibleChip * i_chip ) +template<> +uint32_t maskMemPort<TYPE_MCA>( ExtensibleChip * i_chip ) { - #define PRDF_FUNC "[MemEcc::maskMemPort] " + #define PRDF_FUNC "[MemEcc::maskMemPort<TYPE_MCA>] " + PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MCA == i_chip->getType() ); - SCAN_COMM_REGISTER_CLASS * c = i_chip->getRegister("MCACALFIR_MASK_OR"); - SCAN_COMM_REGISTER_CLASS * d = i_chip->getRegister("DDRPHYFIR_MASK_OR"); - SCAN_COMM_REGISTER_CLASS * e = i_chip->getRegister("MCAECCFIR_MASK_OR"); + uint32_t o_rc = SUCCESS; + + do + { + // Mask all FIRs on the port. + SCAN_COMM_REGISTER_CLASS * c = i_chip->getRegister("MCACALFIR_MASK_OR"); + SCAN_COMM_REGISTER_CLASS * d = i_chip->getRegister("DDRPHYFIR_MASK_OR"); + SCAN_COMM_REGISTER_CLASS * e = i_chip->getRegister("MCAECCFIR_MASK_OR"); + + c->setAllBits(); d->setAllBits(); e->setAllBits(); + + o_rc = c->Write() | d->Write() | e->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on 0x%08x", i_chip->getHuid() ); + break; + } + + #ifdef __HOSTBOOT_RUNTIME - c->setAllBits(); d->setAllBits(); e->setAllBits(); + /* TODO RTC 136129 + // Dynamically deallocate the port. 
+ o_rc = MemDealloc::port<TYPE_MCA>( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MemDealloc::port<TYPE_MCA>(0x%08x) failed", + i_chip->getHuid() ); + } + */ + + #endif - return ( c->Write() | d->Write() | e->Write() ); + } while (0); + + return o_rc; #undef PRDF_FUNC } @@ -176,10 +206,13 @@ uint32_t maskMemPort( ExtensibleChip * i_chip ) #ifdef __HOSTBOOT_RUNTIME -uint32_t iuePortFail(ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc) +template<> +uint32_t iuePortFail<TYPE_MCA>( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { - #define PRDF_FUNC "[MemEcc::iuePortFail] " + #define PRDF_FUNC "[MemEcc::iuePortFail<TYPE_MCA>] " + PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( TYPE_MCA == i_chip->getType() ); uint32_t o_rc = SUCCESS; @@ -714,89 +747,68 @@ uint32_t analyzeFetchUe<TYPE_MCA, McaDataBundle *>( ExtensibleChip * i_chip, //------------------------------------------------------------------------------ -#ifdef __HOSTBOOT_MODULE - template<TARGETING::TYPE T, typename D> -uint32_t __analyzeIue( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc, - MemAddr i_addr ) +uint32_t handleMemIue( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ) { - #define PRDF_FUNC "[MemEcc::__analyzeIue] " + #define PRDF_FUNC "[MemEcc::handleMemIue] " + PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( T == i_chip->getType() ); + uint32_t o_rc = SUCCESS; - do - { - // get data bundle from chip - D db = static_cast<D>( i_chip->getDataBundle() ); + // Add the DIMM to the callout list. + MemoryMru mm { i_chip->getTrgt(), i_rank, MemoryMruData::CALLOUT_RANK }; + io_sc.service_data->SetCallout( mm ); - // get the rank - MemRank rank = i_addr.getRank(); + #ifdef __HOSTBOOT_MODULE - TargetHandle_t trgt = i_chip->getTrgt(); + do + { + // Nothing else to do if handling a system checkstop. 
+ if ( CHECK_STOP == io_sc.service_data->getPrimaryAttnType() ) break; - // Add the DIMM to the callout list - MemoryMru memmru(trgt, rank, MemoryMruData::CALLOUT_RANK); - io_sc.service_data->SetCallout( memmru ); + // Get the data bundle from chip. + D db = static_cast<D>( i_chip->getDataBundle() ); - uint8_t ds = rank.getDimmSlct(); + // Get the DIMM select. + uint8_t ds = i_rank.getDimmSlct(); - // Initialize threshold if it doesn't exist yet + // Initialize threshold if it doesn't exist yet. if ( 0 == db->iv_iueTh.count(ds) ) { db->iv_iueTh[ds] = TimeBasedThreshold( getIueTh() ); } - // increment the threshold - check if at threshold + // Increment the count and check if at threshold. if ( db->iv_iueTh[ds].inc(io_sc) ) { - // Make the error log predictive + // Make the error log predictive. io_sc.service_data->setServiceCall(); - #ifdef __HOSTBOOT_RUNTIME - - /* TODO RTC 136129 - // Dynamically deallocate the rank. - uint32_t dealloc_rc = MemDealloc::rank<T>( i_chip, rank ); - if ( SUCCESS != dealloc_rc ) - { - PRDF_ERR( PRDF_FUNC "MemDealloc::rank() failed: i_chip=0x%08x " - "rank=m%ds%d", i_chip->getHuid(), rank.getMaster(), - rank.getSlave() ); - o_rc = dealloc_rc; break; - } - */ - - #endif // __HOSTBOOT_RUNTIME + // The port fail will be triggered in the PostAnalysis plugin after + // the error log has been committed. - // mask off the entire port to avoid collateral - o_rc = maskMemPort( i_chip ); + // Mask off the entire port to avoid collateral. + o_rc = MemEcc::maskMemPort<T>( i_chip ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort failed: i_chip=0x%08x", - i_chip->getHuid() ); + PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort<T>(0x%08x) failed", + i_chip->getHuid() ); break; } - - // Port fail will be triggered in PostAnalysis after the error log - // has been committed. } - }while(0); + } while (0); + + #endif // __HOSTBOOT_MODULE return o_rc; #undef PRDF_FUNC } -// To resolve template linker errors. 
-template -uint32_t __analyzeIue<TYPE_MCA, McaDataBundle*>(ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc, - MemAddr i_addr ); - -#endif // __HOSTBOOT_MODULE - //------------------------------------------------------------------------------ template<TARGETING::TYPE T, typename D> @@ -805,44 +817,39 @@ uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, { #define PRDF_FUNC "[MemEcc::analyzeMainlineIue] " + PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( T == i_chip->getType() ); - uint32_t o_rc = SUCCESS; - #ifdef __HOSTBOOT_MODULE + uint32_t o_rc = SUCCESS; do { - - // get the address of the failure - MemAddr addr; - // Use the address in MBRCER. This address also traps IRCDs, but it is // not likely that we will have two independent failure modes at the // same time. So we just assume the address is correct. + MemAddr addr; o_rc = getMemReadAddr<T>( i_chip, MemAddr::READ_RCE_ADDR, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x, READ_RCE_ADDR) failed", - i_chip->getHuid() ); + i_chip->getHuid() ); break; } + MemRank rank = addr.getRank(); - o_rc = __analyzeIue<T,D>( i_chip, io_sc, addr ); + o_rc = handleMemIue<T,D>( i_chip, rank, io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__analyzeIue failed. Chip HUID: 0x%08x", - i_chip->getHuid() ); + PRDF_ERR( PRDF_FUNC "handleMemIue<T,D>(0x%08x,m%ds%d) failed", + i_chip->getHuid(), rank.getMaster(), rank.getSlave() ); break; } - }while(0); - - #endif + } while (0); return o_rc; #undef PRDF_FUNC - } // To resolve template linker errors. @@ -858,40 +865,37 @@ uint32_t analyzeMaintIue( ExtensibleChip * i_chip, { #define PRDF_FUNC "[MemEcc::analyzeMaintIue] " + PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( T == i_chip->getType() ); - uint32_t o_rc = SUCCESS; - #ifdef __HOSTBOOT_MODULE + uint32_t o_rc = SUCCESS; do { + // Use the current address in the MCBMCAT. 
MemAddr addr; - - // Use the current address in the MCBMCAT o_rc = getMemMaintAddr<T>( i_chip, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", - i_chip->getHuid() ); + i_chip->getHuid() ); break; } + MemRank rank = addr.getRank(); - o_rc = __analyzeIue<T,D>( i_chip, io_sc, addr ); + o_rc = handleMemIue<T,D>( i_chip, rank, io_sc ); if ( SUCCESS != o_rc ) { - PRDF_ERR( PRDF_FUNC "__analyzeIue failed. Chip HUID: " - "0x%08x", i_chip->getHuid() ); + PRDF_ERR( PRDF_FUNC "handleMemIue<T,D>(0x%08x,m%ds%d) failed", + i_chip->getHuid(), rank.getMaster(), rank.getSlave() ); break; } - }while(0); - - #endif + } while (0); return o_rc; #undef PRDF_FUNC - } // To resolve template linker errors. diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H index 330fb2525..37beecdaf 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.H @@ -85,6 +85,27 @@ uint32_t handleMemUe( ExtensibleChip * i_chip, const MemAddr & i_addr, UE_TABLE::Type i_type, STEP_CODE_DATA_STRUCT & io_sc ); /** + * @brief Does mainline and maintenance IUE handling. + * + * Adds the memory IUE to the callout list. At threshold, will make the error + * log predictive. When threshold is reached at runtime there is a good chance + * these IUEs are going to lead to a data integrity issue. Therefore, the port + * will be forced to fail, the entire port will be masked off, and dynamic + * memory deallocation will be applied. Note that this function will not issue + * the port failure because it is possible that it may crash the host. Instead, + * the port failure is issued in the PostAnalysis plugin after the error log has + * been committed. + * + * @param i_chip MCA chip. + * @param i_rank Rank containing the IUE. + * @param io_sc The step code data struct. + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. 
+ */ +template<TARGETING::TYPE T, typename D> +uint32_t handleMemIue( ExtensibleChip * i_chip, const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ); + +/** * @brief Analyzes a fetch MPE attention. * @param i_chip MCA or MBA. * @param i_rank Target rank. @@ -158,22 +179,25 @@ uint32_t analyzeImpe( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ); #ifdef __HOSTBOOT_RUNTIME /** - * @brief Will trigger a port fail if the number of IUEs is over threshold - * @param i_chip MCA chip - * @param io_sc The step code data struct. + * @brief Will trigger a port fail if the number of IUEs is over threshold. + * @param i_chip MCA chip + * @param io_sc The step code data struct. * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise */ -uint32_t iuePortFail(ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc); +template<TARGETING::TYPE T> +uint32_t iuePortFail( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ); #endif // __HOSTBOOT_RUNTIME #ifdef __HOSTBOOT_MODULE /** - * @brief Will mask off the entire mem port - * @param i_chip MCA chip + * @brief Will mask off an entire memory port. At runtime will issue dynamic + * memory deallocation of the port. + * @param i_chip MCA chip * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise */ +template<TARGETING::TYPE T> uint32_t maskMemPort( ExtensibleChip * i_chip ); template<TARGETING::TYPE T, typename D> diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C index 7016b06bd..9b54037ba 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mca.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mca.C @@ -69,16 +69,14 @@ int32_t PostAnalysis( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ) #ifdef __HOSTBOOT_RUNTIME - // If the IUE threshold in our data bundle has been reached, we trigger // a port fail. Once we trigger the port fail, the system may crash // right away. Since PRD is running in the hypervisor, it is possible we // may not get the error log. 
To better our chances, we trigger the port // fail here after the error log has been committed. - if ( SUCCESS != MemEcc::iuePortFail(i_chip, io_sc) ) + if ( SUCCESS != MemEcc::iuePortFail<TYPE_MCA>(i_chip, io_sc) ) { - PRDF_ERR( PRDF_FUNC "iuePortFail failed: i_chip=0x%08x", - i_chip->getHuid() ); + PRDF_ERR( PRDF_FUNC "iuePortFail(0x%08x) failed", i_chip->getHuid() ); } #endif // __HOSTBOOT_RUNTIME @@ -197,14 +195,13 @@ int32_t MemPortFailure( ExtensibleChip * i_chip, if ( CHECK_STOP != io_sc.service_data->getPrimaryAttnType() ) { - // The port is dead mask off the entire port. - uint32_t l_rc = MemEcc::maskMemPort( i_chip ); + // The port is dead. Mask off the entire port. + uint32_t l_rc = MemEcc::maskMemPort<TYPE_MCA>( i_chip ); if ( SUCCESS != l_rc ) { - PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort failed: i_chip=0x%08x", + PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort<TYPE_MCA>(0x%08x) failed", i_chip->getHuid() ); } - } return SUCCESS; // nothing to return to rule code diff --git a/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C b/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C index 4a80c2203..1b017194c 100644 --- a/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C +++ b/src/usr/diag/prdf/plat/mem/prdfP9Mcbist.C @@ -112,10 +112,10 @@ int32_t PostAnalysis( ExtensibleChip * i_mcbChip, // if there's an IUE and we've reached threshold trigger a port fail if ( eccAttns & MAINT_IUE ) { - if ( SUCCESS != MemEcc::iuePortFail(mca, io_sc) ) + if ( SUCCESS != MemEcc::iuePortFail<TYPE_MCA>(mca, io_sc) ) { - PRDF_ERR( PRDF_FUNC "iuePortFail failed: i_mcbChip=" - "0x%08x", i_mcbChip->getHuid() ); + PRDF_ERR( PRDF_FUNC "iuePortFail(0x%08x) failed", + i_mcbChip->getHuid() ); } } } |

