diff options
author | Zane Shelley <zshelle@us.ibm.com> | 2013-07-02 15:50:48 -0500 |
---|---|---|
committer | A. Patrick Williams III <iawillia@us.ibm.com> | 2013-08-08 10:15:32 -0500 |
commit | 1ba4435e7dba11181fc98876ce558185c3f44499 (patch) | |
tree | 5580b21df73762356652dff5a6f26fd6cdc5d57d /src/usr/diag/prdf | |
parent | 011da4162142a73c222e87e2bb1037866f9c1344 (diff) | |
download | talos-hostboot-1ba4435e7dba11181fc98876ce558185c3f44499.tar.gz talos-hostboot-1ba4435e7dba11181fc98876ce558185c3f44499.zip |
PRD: add better callouts/FFDC for IPL analysis
Change-Id: Iad9349c3a0714915a4d3c5f29b6e49da9d823135
Reviewed-on: http://gfw160.austin.ibm.com:8080/gerrit/5419
Tested-by: Jenkins Server
Reviewed-by: Sachin Gupta <sgupta2m@in.ibm.com>
Reviewed-by: Christopher T. Phan <cphan@us.ibm.com>
Reviewed-by: A. Patrick Williams III <iawillia@us.ibm.com>
Reviewed-on: http://gfw160.austin.ibm.com:8080/gerrit/5725
Diffstat (limited to 'src/usr/diag/prdf')
15 files changed, 512 insertions, 278 deletions
diff --git a/src/usr/diag/prdf/common/framework/rule/prdrRegister.H b/src/usr/diag/prdf/common/framework/rule/prdrRegister.H index a19e8aa4a..b0060bea7 100755 --- a/src/usr/diag/prdf/common/framework/rule/prdrRegister.H +++ b/src/usr/diag/prdf/common/framework/rule/prdrRegister.H @@ -55,6 +55,7 @@ std::cout<<"Failed to write data to file"; \ exit(1); \ } + namespace PRDR_COMPILER { @@ -141,7 +142,7 @@ struct CaptureReqStruct str = str.substr(1, str.size() - 2); } l_tmp32 = htonl(PRDF::Util::hashString(str.c_str())); - fwrite(&l_tmp32, sizeof(l_tmp32), 1, l_file); + PRDR_FWRITE(&l_tmp32, sizeof(l_tmp32), 1, l_file); } }; }; diff --git a/src/usr/diag/prdf/common/framework/service/prdfPfa5Data.h b/src/usr/diag/prdf/common/framework/service/prdfPfa5Data.h index 9e79aa1c8..94eb66fd0 100755 --- a/src/usr/diag/prdf/common/framework/service/prdfPfa5Data.h +++ b/src/usr/diag/prdf/common/framework/service/prdfPfa5Data.h @@ -28,7 +28,7 @@ @brief Version 5 format of the Pfa Data */ -#include <prdf_types.h> +#include <iipconst.h> #include <utilstream.H> namespace PRDF @@ -61,6 +61,8 @@ enum ErrlSubsect ErrlCapData_2 = 2, ErrlAVPData_1 = 41, ErrlAVPData_2 = 42, + ErrlMruData_1 = 61, // This will only be used in non-attenion code when + // we want to add MRU. ErrlString = 10, }; diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenDqBitmap.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenDqBitmap.C index 2a5490b0e..82d3ef9cd 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenDqBitmap.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenDqBitmap.C @@ -33,6 +33,28 @@ namespace PRDF using namespace PlatServices; +bool CenDqBitmap::badDqs() const +{ + bool o_badDqs = false; + + for ( uint32_t i = 0; i < PORT_SLCT_PER_MBA; i++ ) + { + for ( uint32_t j = 0; j < DIMM_DQ_RANK_BITMAP_SIZE; j++ ) + { + if ( 0 != iv_data[i][j] ) + { + o_badDqs = true; + break; + } + } + if ( o_badDqs ) break; + } + + return o_badDqs; +} + +//------------------------------------------------------------------------------ + int32_t CenDqBitmap::badDqs( uint8_t i_portSlct, bool & o_badDqs ) const { #define PRDF_FUNC "[CenDqBitmap::badDqs] " diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenDqBitmap.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenDqBitmap.H index 225ac1a35..340c872bd 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenDqBitmap.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenDqBitmap.H @@ -60,6 +60,12 @@ class CenDqBitmap public: // functions /** + * @brief Queries if there are any bad DQs present on either port. + * @return TRUE if any bad DQs present. + */ + bool badDqs() const; + + /** * @brief Queries the given port to determine if there are any bad DQs * present. * @param i_portSlct The target port. diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMba.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMba.C index f41fcca1b..1ff6265eb 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMba.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMba.C @@ -103,7 +103,7 @@ int32_t MaintCmdComplete( ExtensibleChip * i_mbaChip, // successful with no errors because the error log will not be // committed. if ( !i_sc.service_data->IsDontCommitErrl() ) - CenMbaCaptureData::addDramRepairsData( mbaTarget, i_sc ); + CenMbaCaptureData::addMemEccData( mbaTarget, i_sc ); return PRD_NO_CLEAR_FIR_BITS; // FIR bits are cleared by this plugin diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCaptureData.C b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCaptureData.C index e58748121..ff1d2a6af 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCaptureData.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCaptureData.C @@ -27,40 +27,57 @@ */ #include <prdfCenMbaCaptureData.H> + +// Framwork includes #include <utilmem.H> #include <UtilHash.H> #include <prdfDramRepairUsrData.H> #include <iipServiceDataCollector.h> #include <prdf_ras_services.H> +// Pegasus includes #include <prdfCenMarkstore.H> +#include <prdfCenDqBitmap.H> + +using namespace TARGETING; namespace PRDF { +using namespace PlatServices; + namespace CenMbaCaptureData { -// ---------------------------------------------------------------------------- +//------------------------------------------------------------------------------ -void addDramRepairsData( TARGETING::TargetHandle_t i_mbaTarget, - errlHndl_t o_errHdl ) +void addMemEccData( TargetHandle_t i_mba, errlHndl_t io_errl ) { CaptureData cd; - captureDramRepairsData( i_mbaTarget, cd); - ErrDataService::AddCapData( cd, o_errHdl ); + + // Add DRAM repairs data from hardware. + captureDramRepairsData( i_mba, cd ); + + // Add DRAM repairs data from VPD. + captureDramRepairsVpd( i_mba, cd ); + + ErrDataService::AddCapData( cd, io_errl ); } -// ---------------------------------------------------------------------------- +//------------------------------------------------------------------------------ -void addDramRepairsData( TARGETING::TargetHandle_t i_mbaTarget, - STEP_CODE_DATA_STRUCT & io_sc ) +void addMemEccData( TargetHandle_t i_mba, STEP_CODE_DATA_STRUCT & io_sc ) { CaptureData & cd = io_sc.service_data->GetCaptureData(); - captureDramRepairsData( i_mbaTarget, cd); + + // Add DRAM repairs data from hardware. + captureDramRepairsData( i_mba, cd ); + + // Add DRAM repairs data from VPD. + captureDramRepairsVpd( i_mba, cd ); } -// ---------------------------------------------------------------------------- +//------------------------------------------------------------------------------ void captureDramRepairsData( TARGETING::TargetHandle_t i_mbaTarget, CaptureData & o_cd ) @@ -141,6 +158,62 @@ void captureDramRepairsData( TARGETING::TargetHandle_t i_mbaTarget, } } +//------------------------------------------------------------------------------ + +void captureDramRepairsVpd( TargetHandle_t i_mba, CaptureData & io_cd ) +{ + #define PRDF_FUNC "[captureDramRepairsVpd] " + + // Get the maximum capture data size. + static const size_t sz_rank = sizeof(uint8_t); + static const size_t sz_entry = PORT_SLCT_PER_MBA * DIMM_DQ_RANK_BITMAP_SIZE; + static const size_t sz_word = sizeof(CPU_WORD); + + // Get the maximum capture data size. + size_t sz_maxData = MAX_RANKS_PER_MBA * (sz_rank + sz_entry); + + // Adjust the size for endianess. + sz_maxData = ((sz_maxData + sz_word-1) / sz_word) * sz_word; + + // Initialize to 0. + uint8_t capData[sz_maxData]; + memset( capData, 0x00, sz_maxData ); + + // Get the data for each rank. + uint32_t idx = 0; + for ( uint8_t r = 0; r < MAX_RANKS_PER_MBA; r++ ) + { + CenRank rank ( r ); + CenDqBitmap bitmap; + + if ( SUCCESS != getBadDqBitmap(i_mba, rank, bitmap, true) ) + { + PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed: MBA=0x%08x rank=%d", + getHuid(i_mba), r ); + continue; // skip this rank + } + + if ( bitmap.badDqs() ) // make sure the data is non-zero + { + // Add the rank, then the entry data. + capData[idx] = r; idx += sz_rank; + memcpy(&capData[idx], bitmap.getData(), sz_entry); idx += sz_entry; + } + } + + // Fix endianess issues with non PPC machines. + size_t sz_capData = idx; + sz_capData = ((sz_capData + sz_word-1) / sz_word) * sz_word; + for ( uint32_t i = 0; i < (sz_capData/sz_word); i++ ) + ((CPU_WORD*)capData)[i] = htonl(((CPU_WORD*)capData)[i]); + + // Add data to capture data. + BIT_STRING_ADDRESS_CLASS bs ( 0, sz_capData*8, (CPU_WORD *) &capData ); + io_cd.Add( i_mba, Util::hashString("DRAM_REPAIRS_VPD"), bs ); + + #undef PRDF_FUNC +} + } //end namespace MbaCaptureData } // end namespace PRDF diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCaptureData.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCaptureData.H index 07d7fd99c..b264ee31c 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCaptureData.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaCaptureData.H @@ -42,21 +42,24 @@ namespace CenMbaCaptureData { /** - * @brief Add DRAM repair data to error log. - * @param i_mbaTarget An MBA target. - * @param o_errHdl Error log handle. + * @brief Adds Memory ECC FFDC to an error log. + * @note This is intended to be used in non-normal analysis paths that do not + * have an SDC (i.e. Restore DRAM Repairs, IPL MNFG CE Isolation). + * @note The data will be limited and include things like the DRAM Repairs data + * but will not include stuff like the CE/UE tables, because that + * information is not relevent during its intented uses. + * @param i_mba Target MBA. + * @param o_errl Target error log. */ -void addDramRepairsData( TARGETING::TargetHandle_t i_mbaTarget, - errlHndl_t o_errHdl ); - +void addMemEccData( TARGETING::TargetHandle_t i_mba, errlHndl_t io_errl ); /** - * @brief Add DRAM repair data to SDC. - * @param i_mbaTarget An MBA target. - * @param io_sc The step code data struct. + * @brief Adds Memory ECC FFDC to an SDC. + * @param i_mba Target MBA. + * @param io_sc Target step code data struct. */ -void addDramRepairsData( TARGETING::TargetHandle_t i_mbaTarget, - STEP_CODE_DATA_STRUCT & io_sc ); +void addMemEccData( TARGETING::TargetHandle_t i_mba, + STEP_CODE_DATA_STRUCT & io_sc ); /** * @brief Queries hardware for all DRAM repairs data (chip/symbol marks, DRAM @@ -69,6 +72,19 @@ void addDramRepairsData( TARGETING::TargetHandle_t i_mbaTarget, */ void captureDramRepairsData( TARGETING::TargetHandle_t i_mbaTarget, CaptureData & o_cd ); + +/** + * @brief Queries the Bad DQ attributes for the content of the DRAM repairs VPD + * and add it to the capture data. + * @param i_mbaTarget An MBA target. + * @param o_cd Capture data struct. + * @note This function will be used to capture DRAM repair data into + * capture data struct. Other functions can call this function and + * update error log + */ +void captureDramRepairsVpd( TARGETING::TargetHandle_t i_mba, + CaptureData & o_cd ); + } // end namespace MbaCaptureData } // end namespace PRDF diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaExtraSig.H b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaExtraSig.H index 3fa49f259..26cc84bdd 100644 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaExtraSig.H +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfCenMbaExtraSig.H @@ -45,4 +45,15 @@ PRDR_ERROR_SIGNATURE(VcmMarksUnavail, 0xffff0023, "", PRDR_ERROR_SIGNATURE(DsdDramSpared, 0xffff0030, "", "DSD: DRAM spared"); PRDR_ERROR_SIGNATURE(DsdBadSpare, 0xffff0031, "", "DSD: DRAM spare is bad"); +PRDR_ERROR_SIGNATURE(RdrInternalFail, 0xffff0040, "", "RDR: Internal failure"); +PRDR_ERROR_SIGNATURE(RdrInvalidConfig, 0xffff0041, "", "RDR: Invalid config"); +PRDR_ERROR_SIGNATURE(RdrScreenBadDqs, 0xffff0042, "", + "RDR: DRAM repairs disabled and VPD found"); +PRDR_ERROR_SIGNATURE(RdrRepairsUsed, 0xffff0043, "", + "RDR: Both spare and chip mark used"); +PRDR_ERROR_SIGNATURE(RdrRepairUnavail, 0xffff0044, "", + "RDR: Repairs needed but unavailable"); + + + #endif // __prdfCenMbaExtraSig_H diff --git a/src/usr/diag/prdf/common/plat/pegasus/prdfP8Proc.C b/src/usr/diag/prdf/common/plat/pegasus/prdfP8Proc.C index c2ae26936..02fdc5405 100755 --- a/src/usr/diag/prdf/common/plat/pegasus/prdfP8Proc.C +++ b/src/usr/diag/prdf/common/plat/pegasus/prdfP8Proc.C @@ -609,6 +609,8 @@ int32_t calloutPeerBus( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & i_sc, } while (0); return SUCCESS; + + #undef PRDF_FUNC } #define PLUGIN_CALLOUT_PEER_BUS( BUS, TYPE, POS ) \ diff --git a/src/usr/diag/prdf/common/prdfEnums.H b/src/usr/diag/prdf/common/prdfEnums.H index 7f6b907b8..ee2caff46 100755 --- a/src/usr/diag/prdf/common/prdfEnums.H +++ b/src/usr/diag/prdf/common/prdfEnums.H @@ -29,6 +29,10 @@ #ifdef __HOSTBOOT_MODULE #ifndef __PRD_RULE_COMPILE + + #include <errl/errlmanager.H> // Must be included before including + // hwasCallout.H, otherwise the compile will + // fail. #include <hwas/common/hwasCallout.H> // FIXME: RTC: 62867 will resolve this diff --git a/src/usr/diag/prdf/plat/pegasus/prdfDramRepairs.C b/src/usr/diag/prdf/plat/pegasus/prdfDramRepairs.C index cfcf67afa..2f2d88ea5 100644 --- a/src/usr/diag/prdf/plat/pegasus/prdfDramRepairs.C +++ b/src/usr/diag/prdf/plat/pegasus/prdfDramRepairs.C @@ -34,9 +34,11 @@ #include "common/plat/pegasus/prdfCalloutUtil.H" #include "common/plat/pegasus/prdfCenDqBitmap.H" #include "common/plat/pegasus/prdfCenMarkstore.H" +#include "common/plat/pegasus/prdfCenMbaExtraSig.H" #include "common/plat/pegasus/prdfCenSymbol.H" #include "common/plat/pegasus/prdfMemoryMru.H" #include "framework/service/prdfPlatServices.H" +#include "plat/pegasus/prdfPlatCalloutUtil.H" using namespace HWAS; using namespace std; @@ -45,111 +47,76 @@ using namespace TARGETING; namespace PRDF { -static const uint8_t INVALID_SYMBOL = 0xff; +using namespace PlatServices; -bool validSymbol(uint8_t i_symbol) +namespace RDR // local utility functions to support PRDF::restoreDramRepairs() { - return i_symbol != INVALID_SYMBOL; -} -void commitRestoreCallout( void (*i_func)(errlHndl_t &, void *), void * i_data, - TargetHandle_t i_mba ) +// Creates and returns an error log. +errlHndl_t createErrl( uint32_t i_reasonCode, TargetHandle_t i_mba, + uint32_t i_signature ) { - PRDF_DENTER("commitRestoreCallout"); - - errlHndl_t err = NULL; - - PRDF_HW_CREATE_ERRL( - err, - ERRL_SEV_PREDICTIVE, - ERRL_ETYPE_NOT_APPLICABLE, - SRCI_MACH_CHECK, - SRCI_NO_ATTR, - PRDF_RESTORE_DRAM_REPAIR, - FSP_DEFAULT_REFCODE, - PRDF_DETECTED_FAIL_HARDWARE_PROBABLE, - 0, 0, 0, 0, // user data - HWSV_SYS_NO_TERMINATE, - false); // no pld check - - // add the callout - - (*i_func)(err, i_data); - - bool term = false; - - CenMbaCaptureData::addDramRepairsData( i_mba, err ); + uint64_t userdata12 = PRDF_GET_UINT64_FROM_UINT32( getHuid(i_mba), 0 ); + uint64_t userdata34 = PRDF_GET_UINT64_FROM_UINT32( i_signature, 0 ); + + // Note that the error log tags are not needed because PRD uses its own + // signature parser. + + return new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_PREDICTIVE, // severity + PRDF_RESTORE_DRAM_REPAIR, // module ID + i_reasonCode, // reason code + userdata12, // user data 1 & 2 + userdata34 ); // user data 3 & 4 +} - PRDF_HW_COMMIT_ERRL( - term, - err, - HWSV::HWSV_DECONFIG_DEFER, - ERRL_ACTION_REPORT, - HWSV_CONTINUE); +//------------------------------------------------------------------------------ - if(term) +// If an error log is given, will add DRAM repairs FFDC and traces to error log, +// then commit the error log. +void commitErrl( errlHndl_t i_errl, TargetHandle_t i_mba ) +{ + if ( NULL != i_errl ) { - // FIXME...this is a little goofy. - // Should be scrubbed with RTC 51552 + // Add capture data + CenMbaCaptureData::addMemEccData( i_mba, i_errl ); - PRDF_COMMIT_ERRL(err, ERRL_ACTION_REPORT); + // Add traces + i_errl->collectTrace( PRDF_COMP_NAME, 512 ); + + // Commit the error log + ERRORLOG::errlCommit( i_errl, PRDF_COMP_ID ); } } -void addMemMruCallout(errlHndl_t & io_log, void * i_memMru) -{ - PRDF_DENTER("addMemMruCallout"); +//------------------------------------------------------------------------------ - if ( NULL != i_memMru ) +// If there were analysis errors, will create and commit an error log with 2nd +// level support callout. +void commitSoftError( uint32_t i_reasonCode, TargetHandle_t i_mba, + uint32_t i_signature, bool i_analysisErrors ) +{ + if ( i_analysisErrors ) { - MemoryMru *memMru = static_cast<MemoryMru *>(i_memMru); - - TargetHandleList partList = memMru->getCalloutList(); - for ( TargetHandleList::iterator it = partList.begin(); - it != partList.end(); it++ ) - { - PRDF_HW_ADD_CALLOUT( - *it, - SRCI_PRIORITY_HIGH, - HWSV::HWSV_DECONFIG, - HWSV::HWSV_DECONFIG_GARD, - io_log, - false, // don't write src to vpd - GARD_Predictive, - ERRL_SEV_PREDICTIVE, - false); // don't update hcdb - } + errlHndl_t errl = createErrl( i_reasonCode, i_mba, i_signature ); + errl->addProcedureCallout( EPUB_PRC_LVL_SUPP, SRCI_PRIORITY_HIGH ); + commitErrl( errl, i_mba ); } } -void addDimmCallout(errlHndl_t & io_log, void * i_dimm) -{ - PRDF_DENTER("addDimmCallout"); - - PRDF_HW_ADD_CALLOUT( - static_cast<TargetHandle_t>(i_dimm), - SRCI_PRIORITY_HIGH, - HWSV::HWSV_DECONFIG, - HWSV::HWSV_DECONFIG_GARD, - io_log, - false, // don't write src to vpd - GARD_Predictive, - ERRL_SEV_PREDICTIVE, - false); // don't update hcdb -} +//------------------------------------------------------------------------------ bool processRepairedRanks( TargetHandle_t i_mba, uint8_t i_repairedRankMask ) { - PRDF_DENTER("processRepairedRanks: %p, 0x%02x", - i_mba, i_repairedRankMask); + #define PRDF_FUNC "[processRepairedRanks] " - // check the argument ranks for repairs - // that violate RAS policy + // The bits in i_repairedRankMask represent ranks that have repairs. Query + // hardware and compare against RAS policies. - bool calloutMade = false; + bool o_calloutMade = false; + bool analysisErrors = false; - // check each rank for repairs - // that violate RAS policy + errlHndl_t errl = NULL; // Initially NULL, will create if needed. for ( uint8_t r = 0; r < MAX_RANKS_PER_MBA; ++r ) { @@ -161,15 +128,21 @@ bool processRepairedRanks( TargetHandle_t i_mba, uint8_t i_repairedRankMask ) CenRank rank ( r ); CenMark mark; - if ( SUCCESS != PlatServices::mssGetMarkStore(i_mba, rank, mark) ) + if ( SUCCESS != mssGetMarkStore(i_mba, rank, mark) ) { + PRDF_ERR( PRDF_FUNC"mssGetMarkStore() failed: MBA=0x%08x rank=%d", + getHuid(i_mba), rank.flatten() ); + analysisErrors = true; continue; // skip this rank } CenSymbol sp0, sp1, sp; - if ( SUCCESS != PlatServices::mssGetSteerMux(i_mba, rank, sp0, sp1, sp)) + if ( SUCCESS != mssGetSteerMux(i_mba, rank, sp0, sp1, sp)) { + PRDF_ERR( PRDF_FUNC"mssGetSteerMux() failed: MBA=0x%08x rank=%d", + getHuid(i_mba), rank.flatten() ); + analysisErrors = true; continue; // skip this rank } @@ -179,112 +152,110 @@ bool processRepairedRanks( TargetHandle_t i_mba, uint8_t i_repairedRankMask ) // This rank has both a steer and a chip mark. Call out the DIMM // with the chip mark. - MemoryMru memoryMru( i_mba, rank, mark.getCM() ); - - commitRestoreCallout( &addMemMruCallout, &memoryMru, i_mba ); + if ( NULL == errl ) + { + errl = createErrl( PRDF_DETECTED_FAIL_HARDWARE, i_mba, + PRDFSIG_RdrRepairsUsed ); + } - calloutMade = true; + MemoryMru memoryMru( i_mba, rank, mark.getCM() ); + CalloutUtil::calloutMemoryMru( errl, memoryMru, + SRCI_PRIORITY_HIGH, + HWAS::DELAYED_DECONFIG, + HWAS::GARD_Predictive ); + o_calloutMade = true; } } - PRDF_DEXIT("processRepairedRanks"); + // Commit the error log, if needed. + commitErrl( errl, i_mba ); - return calloutMade; -} + // Commit an additional error log indicating something failed in the + // analysis, if needed. + commitSoftError( PRDF_DETECTED_FAIL_SOFTWARE, i_mba, + PRDFSIG_RdrInternalFail, analysisErrors ); -bool processBadDimms(TargetHandle_t i_mba, uint8_t i_badDimmMask) -{ - PRDF_DENTER("processBadDimms: %p, 0x%02x", i_mba, i_badDimmMask); + return o_calloutMade; - const struct DimmPortAssoc - { - uint8_t port; - uint8_t dimm; - uint8_t enc; - - } dimmPortAssoc[] = { - - {0, 0, 0x8}, - {0, 1, 0x4}, - {1, 0, 0x2}, - {1, 1, 0x1}, - }; - - uint64_t calloutCount = 0; + #undef PRDF_FUNC +} - // callout the argument dimms +//------------------------------------------------------------------------------ - // get all the dimms connected to this MBA +bool processBadDimms( TargetHandle_t i_mba, uint8_t i_badDimmMask ) +{ + #define PRDF_FUNC "[processBadDimms] " - TARGETING::TargetHandleList dimms = PlatServices::getConnected( - i_mba, TARGETING::TYPE_DIMM); + // The bits in i_badDimmMask represent DIMMs that have exceeded the + // available repairs. Callout these DIMMs. - // convert the encoded dimms that had too many repairs to - // dimm targets + bool o_calloutMade = false; + bool analysisErrors = false; - TargetHandleList::iterator dit = dimms.end(); + errlHndl_t errl = NULL; // Initially NULL, will create if needed. - while(dit-- != dimms.begin()) + // Iterate the list of all DIMMs be + TargetHandleList dimms = getConnected( i_mba, TYPE_DIMM ); + for ( TargetHandleList::iterator i = dimms.begin(); i < dimms.end(); i++ ) { uint8_t port = 0, dimm = 0; - if(SUCCESS != PlatServices::getMbaPort(*dit, port)) + if ( SUCCESS != getMbaPort(*i, port) ) { - // skip this dimm - continue; + PRDF_ERR( PRDF_FUNC"getMbaPort() failed: DIMM=0x%08x", getHuid(*i)); + analysisErrors = true; + continue; // skip this dimm } - if(SUCCESS != PlatServices::getMbaDimm(*dit, dimm)) + if ( SUCCESS != getMbaDimm(*i, dimm) ) { - // skip this dimm - continue; + PRDF_ERR( PRDF_FUNC"getMbaDimm() failed: DIMM=0x%08x", getHuid(*i)); + analysisErrors = true; + continue; // skip this dimm } - // see if the passed in dimm - // was flagged as bad by the restore procedure - - bool match = false; + // The 4 bits of i_badDimmMask is defined as p0d0, p0d1, p1d0, and p1d1. + uint8_t mask = 0x8 >> (port * PORT_SLCT_PER_MBA + dimm); - const DimmPortAssoc * it = dimmPortAssoc - + sizeof(dimmPortAssoc)/sizeof(*dimmPortAssoc); - - while(!match && it-- != dimmPortAssoc) + if ( 0 != (i_badDimmMask & mask) ) { - if(i_badDimmMask & it->enc - && port == it->port - && dimm == it->dimm) + if ( NULL == errl ) { - // this dimm is a match - - match = true; + errl = createErrl( PRDF_DETECTED_FAIL_HARDWARE, i_mba, + PRDFSIG_RdrRepairUnavail ); } - } - // call them out - - if(match) - { - ++calloutCount; - commitRestoreCallout( &addDimmCallout, *dit, i_mba ); + o_calloutMade = true; + errl->addHwCallout( *i, SRCI_PRIORITY_HIGH, HWAS::DELAYED_DECONFIG, + HWAS::GARD_Predictive ); } } - PRDF_DEXIT("processBadDimms: bad dimm count: %d", calloutCount); + // Commit the error log, if needed. + commitErrl( errl, i_mba ); - return 0 != calloutCount; + // Commit an additional error log indicating something failed in the + // analysis, if needed. + commitSoftError( PRDF_DETECTED_FAIL_SOFTWARE, i_mba, + PRDFSIG_RdrInternalFail, analysisErrors ); + + return o_calloutMade; + + #undef PRDF_FUNC } -bool processDq(TargetHandle_t i_mba) +//------------------------------------------------------------------------------ + +bool screenBadDqs( TargetHandle_t i_mba ) { - using namespace TARGETING; - using namespace PlatServices; + #define PRDF_FUNC "[screenBadDqs] " - PRDF_DENTER("processDq: %p", i_mba); + // Callout any attached DIMMs that have any bad DQs. - // callout any dimms on the argument MBA - // that have any bad dq + bool o_calloutMade = false; + bool analysisErrors = false; - uint64_t calloutCount = 0; + errlHndl_t errl = NULL; // Initially NULL, will create if needed. for ( uint32_t r = 0; r < MAX_RANKS_PER_MBA; r++ ) { @@ -293,6 +264,9 @@ bool processDq(TargetHandle_t i_mba) if ( SUCCESS != getBadDqBitmap(i_mba, rank, bitmap, true) ) { + PRDF_ERR( PRDF_FUNC"getBadDqBitmap() failed: MBA=0x%08x rank=%d", + getHuid(i_mba), rank.flatten() ); + analysisErrors = true; continue; // skip this rank } @@ -301,175 +275,170 @@ bool processDq(TargetHandle_t i_mba) bool badDqs = false; if ( SUCCESS != bitmap.badDqs(p, badDqs) ) { + PRDF_ERR( PRDF_FUNC"badDqs() failed: MBA=0x%08x rank=%d " + "port=%d", getHuid(i_mba), rank.flatten(), p ); + analysisErrors = true; continue; // skip this DIMM } if ( !badDqs ) { - continue; // skip this DIMM + continue; // nothing to do, skip this DIMM } TargetHandleList list = CalloutUtil::getConnectedDimms( i_mba, rank, p ); if ( 0 == list.size() ) { - PRDF_ERR( "[processDq] bad bits present but no connected " - "DIMM: MBA=0x%08x rank=%d port=%d", getHuid(i_mba), + PRDF_ERR( PRDF_FUNC"bad bits present but no connected DIMM: " + "MBA=0x%08x rank=%d port=%d", getHuid(i_mba), rank.flatten(), p ); - continue; + analysisErrors = true; + continue; // skip this DIMM } for ( TargetHandleList::iterator i = list.begin(); i < list.end(); i++ ) { - ++calloutCount; - commitRestoreCallout( &addDimmCallout, *i, i_mba ); + if ( NULL == errl ) + { + errl = createErrl( PRDF_DETECTED_FAIL_HARDWARE, i_mba, + PRDFSIG_RdrScreenBadDqs ); + } + + o_calloutMade = true; + errl->addHwCallout( *i, SRCI_PRIORITY_HIGH, + HWAS::DELAYED_DECONFIG, + HWAS::GARD_Predictive ); } } } - PRDF_DEXIT("processDq: bad dq dimm count: %d", calloutCount); + // Commit the error log, if needed. + commitErrl( errl, i_mba ); + + // Commit an additional error log indicating something failed in the + // analysis, if needed. + commitSoftError( PRDF_DETECTED_FAIL_SOFTWARE, i_mba, + PRDFSIG_RdrInternalFail, analysisErrors ); + + return o_calloutMade; - return 0 != calloutCount; + #undef PRDF_FUNC } -void deployDramSpares(TargetHandle_t i_mba) -{ - using namespace fapi; +//------------------------------------------------------------------------------ - bool x4 = PlatServices::isDramWidthX4(i_mba); +void deployDramSpares( TargetHandle_t i_mba ) +{ + bool x4 = isDramWidthX4(i_mba); for ( uint32_t r = 0; r < MAX_RANKS_PER_MBA; r++ ) { CenRank rank ( r ); - CenSymbol symbol = CenSymbol::fromSymbol( i_mba, rank, 0 ); - // ignore errors from putSteerMux + // Doesn't matter which DRAM is spared as long as they are all spared. + // Also, make sure the ECC spare is on a different DRAM than the spare + // DRAM. + CenSymbol symPort0 = CenSymbol::fromDimmDq( i_mba, rank, 0, 0 ); + CenSymbol symPort1 = CenSymbol::fromDimmDq( i_mba, rank, 0, 1 ); + CenSymbol symEccSp = CenSymbol::fromDimmDq( i_mba, rank, 8, 0 ); + + int32_t l_rc = SUCCESS; - static_cast<void>( - PlatServices::mssSetSteerMux(i_mba, rank, symbol, false) ); + l_rc = mssSetSteerMux( i_mba, rank, symPort0, false ); + l_rc |= mssSetSteerMux( i_mba, rank, symPort1, false ); - if( x4 ) + if ( x4 ) + l_rc |= mssSetSteerMux( i_mba, rank, symEccSp, true ); + + if ( SUCCESS != l_rc ) { - static_cast<void>( - PlatServices::mssSetSteerMux(i_mba, rank, symbol, true) ); + // mssSetSteerMux() will print a trace and commit the error log, + // however, we need to handle the return code or we get a compile + // warning in Hostboot. + continue; } } } +} // end namespace RDR + //------------------------------------------------------------------------------ // External functions - declared in prdfMain.H //------------------------------------------------------------------------------ int32_t restoreDramRepairs( TargetHandle_t i_mba ) { - PRDF_ENTER( "restoreDramRepairs(0x%08x)", PlatServices::getHuid(i_mba) ); - - bool calloutMade = false; + #define PRDF_FUNC "PRDF::restoreDramRepairs" - uint8_t repairedRankMask = 0, badDimmMask = 0; + PRDF_ENTER( PRDF_FUNC"(0x%08x)", getHuid(i_mba) ); - do { + bool calloutMade = false; - if(PlatServices::isMemoryPreservingIpl()) + do + { + if ( isMemoryPreservingIpl() ) { - // nothing to do in MPIPL - + // Power is preserved on a Centaur for a MPIPL. So the marks and + // spares will not need to be restored. break; } - bool spareDramDeploy = PlatServices::mnfgSpareDramDeploy(); + bool spareDramDeploy = mnfgSpareDramDeploy(); - if(spareDramDeploy) + if ( spareDramDeploy ) { - deployDramSpares(i_mba); + // Deploy all spares for MNFG corner tests. + RDR::deployDramSpares(i_mba); } - // in mfg mode, check dq and don't restore anything - - if(PlatServices::areDramRepairsDisabled()) + if ( areDramRepairsDisabled() ) { - if(processDq(i_mba)) - { - calloutMade = true; - } + // DRAM Repairs are disabled in MNFG mode, so screen all DIMMs with + // VPD information. + if ( RDR::screenBadDqs(i_mba) ) calloutMade = true; + // No need to continue because there will not be anything to + // restore. break; } - if(spareDramDeploy) + if ( spareDramDeploy ) { - // this is an error...the spare dram - // deploy bit was set but we weren't - // in mfg mode...log an error for MFG - - errlHndl_t err = NULL; - - PRDF_ERR( "[restoreDramRepairs] " - "The specified combination of mfg policy flags is invalid"); - - /*@ - * @errortype - * @reasoncode PRDF_INVALID_CONFIG - * @subsys EPUB_FIRMWARE_SUBSYS - * @moduleid PRDF_RESTORE_DRAM_REPAIR - * @devdesc The specified combination of policy flags is invalid. - */ - PRDF_CREATE_ERRL( - err, - ERRL_SEV_PREDICTIVE, - ERRL_ETYPE_NOT_APPLICABLE, - SRCI_MACH_CHECK, - SRCI_NO_ATTR, - PRDF_RESTORE_DRAM_REPAIR, - FSP_DEFAULT_REFCODE, - PRDF_INVALID_CONFIG, - 0, 0, 0, 0); - PRDF_COMMIT_ERRL(err, ERRL_ACTION_REPORT); - - // assume mfg mode (no repairs) ... + // This is an error. The MNFG spare DRAM deply bit is set, but DRAM + // Repairs have not been disabled. - break; - } - - if(SUCCESS != PlatServices::mssRestoreDramRepairs( - i_mba, - repairedRankMask, - badDimmMask)) - { - // can't check anything if - // this doesn't work + PRDF_ERR( "["PRDF_FUNC"] MNFG spare deploy enabled, but DRAM " + "repairs are not disabled" ); - PRDF_ERR( "[restoreDramRepairs] " - "PlatServices::mssRestoreDramRepairs failed" ); + RDR::commitSoftError( PRDF_INVALID_CONFIG, i_mba, + PRDFSIG_RdrInvalidConfig, true ); - break; + break; // Assume user meant to disable DRAM repairs. } - // callout bad dimms - - if(processBadDimms( - i_mba, - badDimmMask)) + uint8_t rankMask = 0, dimmMask = 0; + if ( SUCCESS != mssRestoreDramRepairs(i_mba, rankMask, dimmMask) ) { - calloutMade = true; + // Can't check anything if this doesn't work. + PRDF_ERR( "["PRDF_FUNC"] mssRestoreDramRepairs() failed" ); + break; } - // check repaired ranks for - // RAS policy violations + // Callout DIMMs with too many bad bits and not enough repairs available + if ( RDR::processBadDimms(i_mba, dimmMask) ) calloutMade = true; - if(processRepairedRanks( - i_mba, - repairedRankMask)) - { - calloutMade = true; - } + // Check repaired ranks for RAS policy violations. + if ( RDR::processRepairedRanks(i_mba, rankMask) ) calloutMade = true; } while(0); - PRDF_EXIT( "restoreDramRepairs(0x%08x)", PlatServices::getHuid(i_mba) ); + PRDF_EXIT( PRDF_FUNC"(0x%08x)", getHuid(i_mba) ); return calloutMade ? FAIL : SUCCESS; + + #undef PRDF_FUNC } } // end namespace PRDF diff --git a/src/usr/diag/prdf/plat/pegasus/prdfPlatCalloutUtil.C b/src/usr/diag/prdf/plat/pegasus/prdfPlatCalloutUtil.C new file mode 100644 index 000000000..a7efe46cb --- /dev/null +++ b/src/usr/diag/prdf/plat/pegasus/prdfPlatCalloutUtil.C @@ -0,0 +1,67 @@ +/* IBM_PROLOG_BEGIN_TAG */ +/* This is an automatically generated prolog. */ +/* */ +/* $Source: src/usr/diag/prdf/plat/pegasus/prdfPlatCalloutUtil.C $ */ +/* */ +/* IBM CONFIDENTIAL */ +/* */ +/* COPYRIGHT International Business Machines Corp. 2013 */ +/* */ +/* p1 */ +/* */ +/* Object Code Only (OCO) source materials */ +/* Licensed Internal Code Source Materials */ +/* IBM HostBoot Licensed Internal Code */ +/* */ +/* The source code for this program is not published or otherwise */ +/* divested of its trade secrets, irrespective of what has been */ +/* deposited with the U.S. Copyright Office. */ +/* */ +/* Origin: 30 */ +/* */ +/* IBM_PROLOG_END_TAG */ + +/** @file prdfPlatCalloutUtil.C */ + +#include <prdfPlatCalloutUtil.H> + +// Framework includes +#include <prdfErrlUtil.H> +#include <prdfPfa5Data.h> +#include <prdfPlatServices.H> + +// Pegasus includes +#include <prdfMemoryMru.H> + +using namespace TARGETING; + +namespace PRDF +{ + +using namespace PlatServices; + +namespace CalloutUtil +{ + +void calloutMemoryMru( errlHndl_t io_errl, const MemoryMru & i_memmru, + const HWAS::callOutPriority i_priority, + const HWAS::DeconfigEnum i_deconfigState, + const HWAS::GARD_ErrorType i_gardType ) +{ + // Add all parts to the error log. + TargetHandleList partList = i_memmru.getCalloutList(); + for ( TargetHandleList::iterator it = partList.begin(); + it != partList.end(); it++ ) + { + io_errl->addHwCallout( *it, i_priority, i_deconfigState, i_gardType ); + } + + // Add the MemoryMru to the capture data. + uint32_t tmpMru = i_memmru.toUint32(); + PRDF_ADD_FFDC( io_errl, &tmpMru, sizeof(tmpMru), ErrlVer1, ErrlMruData_1 ); +} + +} // end namespace CalloutUtil + +} // end namespace PRDF + diff --git a/src/usr/diag/prdf/plat/pegasus/prdfPlatCalloutUtil.H b/src/usr/diag/prdf/plat/pegasus/prdfPlatCalloutUtil.H new file mode 100644 index 000000000..17753da51 --- /dev/null +++ b/src/usr/diag/prdf/plat/pegasus/prdfPlatCalloutUtil.H @@ -0,0 +1,64 @@ +/* IBM_PROLOG_BEGIN_TAG */ +/* This is an automatically generated prolog. */ +/* */ +/* $Source: src/usr/diag/prdf/plat/pegasus/prdfPlatCalloutUtil.H $ */ +/* */ +/* IBM CONFIDENTIAL */ +/* */ +/* COPYRIGHT International Business Machines Corp. 2012,2013 */ +/* */ +/* p1 */ +/* */ +/* Object Code Only (OCO) source materials */ +/* Licensed Internal Code Source Materials */ +/* IBM HostBoot Licensed Internal Code */ +/* */ +/* The source code for this program is not published or otherwise */ +/* divested of its trade secrets, irrespective of what has been */ +/* deposited with the U.S. Copyright Office. */ +/* */ +/* Origin: 30 */ +/* */ +/* IBM_PROLOG_END_TAG */ + +#ifndef __prdfPlatCalloutUtil_H +#define __prdfPlatCalloutUtil_H + +/** @file prdfPlatCalloutUtil.H */ + +// Framework includes +#include <prdfEnums.H> + +namespace PRDF +{ + +class MemoryMru; + +namespace CalloutUtil +{ + +/** + * @brief Add all parts of a MemoryMru to the callout list of an error log. + * Also, adds the MemoryMru info to the capture data. + * + * This is only intended to be used by non-attention analysis code like Restore + * DRAM Repairs or MNFG IPL CE analysis. In these cases, there is no SDC to + * collect the callout info or capture data. + * + * @param io_errl The target error log. + * @param i_memmru The target MemoryMru. + * @param i_priority The callout priority + * @param i_deconfigState The deconfiguration state. + * @param i_gardType The GARD error type. + */ +void calloutMemoryMru( errlHndl_t io_errl, const MemoryMru & i_memmru, + const HWAS::callOutPriority i_priority, + const HWAS::DeconfigEnum i_deconfigState, + const HWAS::GARD_ErrorType i_gardType ); + +} // end namespace CalloutUtil + +} // end namespace PRDF + +#endif // __prdfPlatCalloutUtil_H + diff --git a/src/usr/diag/prdf/prdfErrlUtil.H b/src/usr/diag/prdf/prdfErrlUtil.H index 9563758df..8089cabfb 100644 --- a/src/usr/diag/prdf/prdfErrlUtil.H +++ b/src/usr/diag/prdf/prdfErrlUtil.H @@ -32,10 +32,6 @@ * related declarations specific to hostboot. */ -/*--------------------------------------------------------------------*/ -/* Includes */ -/*--------------------------------------------------------------------*/ -#include <errl/errlmanager.H> #include <prdfEnums.H> /** diff --git a/src/usr/diag/prdf/prdf_hb_only.mk b/src/usr/diag/prdf/prdf_hb_only.mk index acb774ebe..a6d44d64c 100644 --- a/src/usr/diag/prdf/prdf_hb_only.mk +++ b/src/usr/diag/prdf/prdf_hb_only.mk @@ -33,5 +33,6 @@ PRDF_RULE_PLUGINS_PEGASUS_HB = \ ################################################################################ prd_pegasus_specific_HB = \ - prdfDramRepairs.o + prdfDramRepairs.o \ + prdfPlatCalloutUtil.o |