diff options
-rw-r--r-- | src/include/usr/hwas/common/hwas.H | 11 | ||||
-rw-r--r-- | src/include/usr/hwas/common/hwasCommon.H | 11 | ||||
-rw-r--r-- | src/include/usr/hwas/common/hwas_reasoncodes.H | 3 | ||||
-rw-r--r-- | src/usr/hwas/common/hwas.C | 140 | ||||
-rw-r--r-- | src/usr/hwas/hwasPlatError.C | 14 |
5 files changed, 147 insertions, 32 deletions
diff --git a/src/include/usr/hwas/common/hwas.H b/src/include/usr/hwas/common/hwas.H index 138a51887..7289b8ecb 100644 --- a/src/include/usr/hwas/common/hwas.H +++ b/src/include/usr/hwas/common/hwas.H @@ -138,6 +138,17 @@ errlHndl_t checkMinimumHardware( bool *o_bootable = NULL); /** +* @brief Loop through processors, make sure all have the same EC level +* create an error log for any slave processor that does not match +* the master's EC level +* +* +* @return errlHndl_t Error returned will be a summary of all errors that +* occurred during the procedure, all PLIDs should match +*/ +errlHndl_t validateProcessorEcLevels(); + +/** * @brief Struct representing a particular target. Used by * invokePresentByAssoc to populate a vector of TargetInfo's for subsequent * use by deconfigPresentByAssoc diff --git a/src/include/usr/hwas/common/hwasCommon.H b/src/include/usr/hwas/common/hwasCommon.H index 7462571b4..946d2c7a8 100644 --- a/src/include/usr/hwas/common/hwasCommon.H +++ b/src/include/usr/hwas/common/hwasCommon.H @@ -348,6 +348,17 @@ void hwasErrorAddProcedureCallout(errlHndl_t & io_errl, const HWAS::callOutPriority i_priority); /** + * @brief wrapper function to add a hardware callout to an error log in a + * platform-specific manner. + * + */ +void platHwasErrorAddHWCallout(errlHndl_t & io_errl, + const TARGETING::ConstTargetHandle_t i_target, + const HWAS::callOutPriority i_priority, + const HWAS::DeconfigEnum i_deconfigState, + const HWAS::GARD_ErrorType i_gardErrorType); + +/** * @brief wrapper function to update the plid in a platform-specific manner. 
* * If io_plid is non-zero then io_errl is updated with io_plid diff --git a/src/include/usr/hwas/common/hwas_reasoncodes.H b/src/include/usr/hwas/common/hwas_reasoncodes.H index 729743162..09d90a796 100644 --- a/src/include/usr/hwas/common/hwas_reasoncodes.H +++ b/src/include/usr/hwas/common/hwas_reasoncodes.H @@ -33,6 +33,7 @@ namespace HWAS MOD_PROCESS_CALLOUT = 0x02, MOD_CHECK_MIN_HW = 0x03, MOD_DECONFIG_TARGETS_FROM_GARD = 0x04, + MOD_VALIDATE_EC_LEVELS = 0x05, }; enum HwasReasonCode @@ -55,6 +56,8 @@ namespace HWAS RC_SYSAVAIL_MISSING_CRITICAL_RESOURCE = HWAS_COMP_ID | 0x0B, RC_SYSAVAIL_NO_MCAS_FUNC = HWAS_COMP_ID | 0x0C, RC_SYSAVAIL_NO_NX_FUNC = HWAS_COMP_ID | 0x0E, + RC_EC_MISMATCH = HWAS_COMP_ID | 0x0F, + RC_FAILED_EC_VALIDATION = HWAS_COMP_ID | 0x10, }; }; diff --git a/src/usr/hwas/common/hwas.C b/src/usr/hwas/common/hwas.C index 30f3f8993..0ea59b4b4 100644 --- a/src/usr/hwas/common/hwas.C +++ b/src/usr/hwas/common/hwas.C @@ -445,9 +445,17 @@ errlHndl_t discoverTargets() } } - //Now that all proc's are created and functional, we need to - //calculate the system EFFECTIVE_EC - calculateEffectiveEC(); + //Check all of the slave processor's EC levels to ensure they match master + //processor's EC level. 
+ //function will return error log pointing to all error logs created + //by this function as this function could detect multiple procs w/ + //bad ECs and will make a log for each + errl = validateProcessorEcLevels(); + if (errl) + { + HWAS_ERR("discoverTargets: validateProcessorEcLevels failed"); + break; + } // Potentially reduce the number of ec/core units that are present // based on fused mode @@ -2856,50 +2864,120 @@ void setChipletGardsOnProc(TARGETING::Target * i_procTarget) i_procTarget->setAttr<TARGETING::ATTR_EC_GARD>(l_ecGard); }//setChipletGardsOnProc -void calculateEffectiveEC() +errlHndl_t validateProcessorEcLevels() { - HWAS_INF("calculateEffectiveEC entry"); - + HWAS_INF("validateProcessorEcLevels entry"); + errlHndl_t l_err = nullptr; + uint32_t l_commonPlid = 0; + TARGETING::ATTR_EC_type l_masterEc = 0; + TARGETING::ATTR_EC_type l_ecToCompare = 0; + TARGETING::ATTR_HUID_type l_masterHuid = 0; + TARGETING::TargetHandleList l_procChips; + Target* l_pMasterProc = NULL; do { - //true => FSP present. 
Only run this on non-FSP systems - TARGETING::Target * sys = NULL; - TARGETING::targetService().getTopLevelTarget( sys ); - TARGETING::SpFunctions spfuncs; - if( sys && - sys->tryGetAttr<TARGETING::ATTR_SP_FUNCTIONS>(spfuncs) && - spfuncs.baseServices ) + //Get all functional chips + getAllChips(l_procChips, TYPE_PROC); + + // check for functional Master Proc on this node + l_err = targetService().queryMasterProcChipTargetHandle(l_pMasterProc); + + //queryMasterProcChipTargetHandle will check for null, make sure + //there was no problem finding the master proc + if(l_err) { + HWAS_ERR( "validateProcessorEcLevels:: Unable to find master proc"); + //Don't commit the error just let it get returned from function break; } - //Get all functional chips - TARGETING::TargetHandleList l_procList; - getAllChips(l_procList, TYPE_PROC); + //Get master info and store it for comparing later + l_masterEc = l_pMasterProc->getAttr<TARGETING::ATTR_EC>(); + l_masterHuid = get_huid(l_pMasterProc); - //Assume lowest EC among all functional processor chips is 0xFF - TARGETING::ATTR_EC_type l_lowestEC = 0xFF; - - //Loop through all functional procs and find the lowest EC - for(TargetHandleList::const_iterator proc = l_procList.begin(); - proc != l_procList.end(); ++proc) + //Loop through all functional procs and create error logs + //for any processors whose EC does not match the master + for(const auto & l_chip : l_procChips) { - if((*proc)->getAttr<TARGETING::ATTR_EC>() < l_lowestEC) + l_ecToCompare = l_chip->getAttr<TARGETING::ATTR_EC>(); + if(l_ecToCompare != l_masterEc) { - l_lowestEC = (*proc)->getAttr<TARGETING::ATTR_EC>(); - } - } + HWAS_ERR("validateProcessorEcLevels:: Slave Proc EC level not does not match master, " + "this is an unrecoverable error.. 
 system will shut down"); - HWAS_INF("Lowest functional proc chip EC = 0x%llx",l_lowestEC); + /*@ + * @errortype + * @severity ERRL_SEV_UNRECOVERABLE + * @moduleid MOD_VALIDATE_EC_LEVELS + * @reasoncode RC_EC_MISMATCH + * @devdesc Found a slave processor whose EC level + * did not match the master + * @custdesc Incompatible Processor Chip Levels + * @userdata1[00:31] HUID of slave chip + * @userdata1[32:63] EC level of slave chip + * @userdata2[00:31] HUID of master chip + * @userdata2[32:63] EC level of master chip + */ + const uint64_t userdata1 = + (static_cast<uint64_t>(get_huid(l_chip)) << 32) | static_cast<uint64_t>(l_ecToCompare); + const uint64_t userdata2 = + (static_cast<uint64_t>(l_masterHuid) << 32) | static_cast<uint64_t>(l_masterEc); - sys->setAttr<TARGETING::ATTR_EFFECTIVE_EC>(l_lowestEC); + l_err = hwasError(ERRL_SEV_UNRECOVERABLE, + MOD_VALIDATE_EC_LEVELS, + RC_EC_MISMATCH, + userdata1, + userdata2); + // call out the procedure to find the deconfigured part. + //TODO SW410022 Add HWSV support for platHwasErrorAddHWCallout +// platHwasErrorAddHWCallout( l_err, +// l_chip, +// SRCI_PRIORITY_HIGH, +// NO_DECONFIG, +// GARD_NULL); + // if we already have an error, link this one to the earlier; + // if not, set the common plid + hwasErrorUpdatePlid(l_err, l_commonPlid); + errlCommit(l_err, HWAS_COMP_ID); + //Do not break, we want to find all mismatches + } + } }while(0); - HWAS_INF("calculateEffectiveEC exit"); - return; + if(l_commonPlid) + { + HWAS_ERR("validateProcessorEcLevels:: One or more slave processor's EC level did not match master, check error logs"); + /*@ + * @errortype + * @severity ERRL_SEV_UNRECOVERABLE + * @moduleid MOD_VALIDATE_EC_LEVELS + * @reasoncode RC_FAILED_EC_VALIDATION + * @devdesc Found one or more slave processor whose EC level + * did not match the master + * @custdesc Incompatible Processor Chip Levels + * @userdata1[00:63] Number of Procs + */ + const uint64_t userdata1 = + static_cast<uint64_t>(l_procChips.size()); + 
const uint64_t userdata2 = + (static_cast<uint64_t>(l_masterHuid) << 32) | static_cast<uint64_t>(l_masterEc); + + l_err = hwasError(ERRL_SEV_UNRECOVERABLE, + MOD_VALIDATE_EC_LEVELS, + RC_FAILED_EC_VALIDATION, + userdata1, + userdata2); + + // link this error to the earlier errors; + hwasErrorUpdatePlid(l_err, l_commonPlid); + } + + HWAS_INF("validateProcessorEcLevels exit"); + return l_err; -} //calculateEffectiveEC +} //validateProcessorEcLevels errlHndl_t markDisabledMcas() { diff --git a/src/usr/hwas/hwasPlatError.C b/src/usr/hwas/hwasPlatError.C index 716e9541b..8c53ccdb6 100644 --- a/src/usr/hwas/hwasPlatError.C +++ b/src/usr/hwas/hwasPlatError.C @@ -5,7 +5,9 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* COPYRIGHT International Business Machines Corp. 2012,2014 */ +/* Contributors Listed Below - COPYRIGHT 2012,2017 */ +/* [+] International Business Machines Corp. */ +/* */ /* */ /* Licensed under the Apache License, Version 2.0 (the "License"); */ /* you may not use this file except in compliance with the License. */ @@ -57,6 +59,16 @@ void hwasErrorAddProcedureCallout(errlHndl_t & io_errl, i_priority); } +void platHwasErrorAddHWCallout(errlHndl_t & io_errl, + const TARGETING::ConstTargetHandle_t i_target, + const HWAS::callOutPriority i_priority, + const HWAS::DeconfigEnum i_deconfigState, + const HWAS::GARD_ErrorType i_gardErrorType) +{ + io_errl->addHwCallout(i_target, i_priority, + i_deconfigState, i_gardErrorType); +} + void hwasErrorUpdatePlid(errlHndl_t & io_errl, uint32_t & io_plid) { |