diff options
author | Christian Geddes <crgeddes@us.ibm.com> | 2017-11-29 17:20:08 -0600 |
---|---|---|
committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2017-12-08 11:56:02 -0500 |
commit | 9f3429c045619b95782e01435c0ae760b6bfcf1b (patch) | |
tree | 7ba8f6b5f1a749994d1dfc4087dfaa5c08564321 | |
parent | ce376fc3f4c70df7dd804b860f339dc15792b01f (diff) | |
download | blackbird-hostboot-9f3429c045619b95782e01435c0ae760b6bfcf1b.tar.gz blackbird-hostboot-9f3429c045619b95782e01435c0ae760b6bfcf1b.zip |
TI w/ unrecoverable error if system boots w/ mismatching processors
We do not support processor modules w/ different EC levels
on the same system. The rule is that all processors must match
the master processor. If this is not true then hostboot will TI.
Technically we could probably still boot and just gard out the
non-matching procs, but the use case for this will be lab/manufacturing.
For those cases we want the technician to know right away that they
put a module w/ the wrong DD level in a system.
RTC: 183243
Change-Id: I09b30550edd30c8523fd2a709c7b7a83f597eab8
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/50167
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Martin Gloff <mgloff@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
-rw-r--r-- | src/include/usr/hwas/common/hwas.H | 11 | ||||
-rw-r--r-- | src/include/usr/hwas/common/hwasCommon.H | 11 | ||||
-rw-r--r-- | src/include/usr/hwas/common/hwas_reasoncodes.H | 3 | ||||
-rw-r--r-- | src/usr/hwas/common/hwas.C | 140 | ||||
-rw-r--r-- | src/usr/hwas/hwasPlatError.C | 14 |
5 files changed, 147 insertions, 32 deletions
diff --git a/src/include/usr/hwas/common/hwas.H b/src/include/usr/hwas/common/hwas.H index 138a51887..7289b8ecb 100644 --- a/src/include/usr/hwas/common/hwas.H +++ b/src/include/usr/hwas/common/hwas.H @@ -138,6 +138,17 @@ errlHndl_t checkMinimumHardware( bool *o_bootable = NULL); /** +* @brief Loop through processors, make sure all have the same EC level +* create an error log for any slave processor that does not match +* the master's EC level +* +* +* @return errlHndl_t Error returned will be a summary of all errors that +* occurred during the procedure, all PLIDs should match +*/ +errlHndl_t validateProcessorEcLevels(); + +/** * @brief Struct representing a particular target. Used by * invokePresentByAssoc to populate a vector of TargetInfo's for subsequent * use by deconfigPresentByAssoc diff --git a/src/include/usr/hwas/common/hwasCommon.H b/src/include/usr/hwas/common/hwasCommon.H index 7462571b4..946d2c7a8 100644 --- a/src/include/usr/hwas/common/hwasCommon.H +++ b/src/include/usr/hwas/common/hwasCommon.H @@ -348,6 +348,17 @@ void hwasErrorAddProcedureCallout(errlHndl_t & io_errl, const HWAS::callOutPriority i_priority); /** + * @brief wrapper function to add a procedure callout to an error log in a + * platform-specific manner. + * + */ +void platHwasErrorAddHWCallout(errlHndl_t & io_errl, + const TARGETING::ConstTargetHandle_t i_target, + const HWAS::callOutPriority i_priority, + const HWAS::DeconfigEnum i_deconfigState, + const HWAS::GARD_ErrorType i_gardErrorType); + +/** * @brief wrapper function to update the plid in a platform-specific manner. 
* * If io_plid is non-zero then io_errl is updated with io_plid diff --git a/src/include/usr/hwas/common/hwas_reasoncodes.H b/src/include/usr/hwas/common/hwas_reasoncodes.H index 729743162..09d90a796 100644 --- a/src/include/usr/hwas/common/hwas_reasoncodes.H +++ b/src/include/usr/hwas/common/hwas_reasoncodes.H @@ -33,6 +33,7 @@ namespace HWAS MOD_PROCESS_CALLOUT = 0x02, MOD_CHECK_MIN_HW = 0x03, MOD_DECONFIG_TARGETS_FROM_GARD = 0x04, + MOD_VALIDATE_EC_LEVELS = 0x05, }; enum HwasReasonCode @@ -55,6 +56,8 @@ namespace HWAS RC_SYSAVAIL_MISSING_CRITICAL_RESOURCE = HWAS_COMP_ID | 0x0B, RC_SYSAVAIL_NO_MCAS_FUNC = HWAS_COMP_ID | 0x0C, RC_SYSAVAIL_NO_NX_FUNC = HWAS_COMP_ID | 0x0E, + RC_EC_MISMATCH = HWAS_COMP_ID | 0x0F, + RC_FAILED_EC_VALIDATION = HWAS_COMP_ID | 0x10, }; }; diff --git a/src/usr/hwas/common/hwas.C b/src/usr/hwas/common/hwas.C index 30f3f8993..0ea59b4b4 100644 --- a/src/usr/hwas/common/hwas.C +++ b/src/usr/hwas/common/hwas.C @@ -445,9 +445,17 @@ errlHndl_t discoverTargets() } } - //Now that all proc's are created and functional, we need to - //calculate the system EFFECTIVE_EC - calculateEffectiveEC(); + //Check all of the slave processor's EC levels to ensure they match master + //processor's EC level. 
+ //function will return error log pointing to all error logs created + //by this function as this function could detect multiple procs w/ + //bad ECs and will make a log for each + errl = validateProcessorEcLevels(); + if (errl) + { + HWAS_ERR("discoverTargets: validateProcessorEcLevels failed"); + break; + } // Potentially reduce the number of ec/core units that are present // based on fused mode @@ -2856,50 +2864,120 @@ void setChipletGardsOnProc(TARGETING::Target * i_procTarget) i_procTarget->setAttr<TARGETING::ATTR_EC_GARD>(l_ecGard); }//setChipletGardsOnProc -void calculateEffectiveEC() +errlHndl_t validateProcessorEcLevels() { - HWAS_INF("calculateEffectiveEC entry"); - + HWAS_INF("validateProcessorEcLevels entry"); + errlHndl_t l_err = nullptr; + uint32_t l_commonPlid = 0; + TARGETING::ATTR_EC_type l_masterEc = 0; + TARGETING::ATTR_EC_type l_ecToCompare = 0; + TARGETING::ATTR_HUID_type l_masterHuid = 0; + TARGETING::TargetHandleList l_procChips; + Target* l_pMasterProc = NULL; do { - //true => FSP present. 
Only run this on non-FSP systems - TARGETING::Target * sys = NULL; - TARGETING::targetService().getTopLevelTarget( sys ); - TARGETING::SpFunctions spfuncs; - if( sys && - sys->tryGetAttr<TARGETING::ATTR_SP_FUNCTIONS>(spfuncs) && - spfuncs.baseServices ) + //Get all functional chips + getAllChips(l_procChips, TYPE_PROC); + + // check for functional Master Proc on this node + l_err = targetService().queryMasterProcChipTargetHandle(l_pMasterProc); + + //queryMasterProcChipTargetHandle will check for null, make sure + //there was no problem finding the master proc + if(l_err) { + HWAS_ERR( "validateProcessorEcLevels:: Unable to find master proc"); + //Don't commit the error just let it get returned from function break; } - //Get all functional chips - TARGETING::TargetHandleList l_procList; - getAllChips(l_procList, TYPE_PROC); + //Get master info and store it for comparing later + l_masterEc = l_pMasterProc->getAttr<TARGETING::ATTR_EC>(); + l_masterHuid = get_huid(l_pMasterProc); - //Assume lowest EC among all functional processor chips is 0xFF - TARGETING::ATTR_EC_type l_lowestEC = 0xFF; - - //Loop through all functional procs and find the lowest EC - for(TargetHandleList::const_iterator proc = l_procList.begin(); - proc != l_procList.end(); ++proc) + //Loop through all functional procs and create error logs + //for any processors whose EC does not match the master + for(const auto & l_chip : l_procChips) { - if((*proc)->getAttr<TARGETING::ATTR_EC>() < l_lowestEC) + l_ecToCompare = l_chip->getAttr<TARGETING::ATTR_EC>(); + if(l_ecToCompare != l_masterEc) { - l_lowestEC = (*proc)->getAttr<TARGETING::ATTR_EC>(); - } - } + HWAS_ERR("validateProcessorEcLevels:: Slave Proc EC level not does not match master, " + "this is an unrecoverable error.. 
system will shut down"); - HWAS_INF("Lowest functional proc chip EC = 0x%llx",l_lowestEC); + /*@ + * @errortype + * @severity ERRL_SEV_UNRECOVERABLE + * @moduleid MOD_VALIDATE_EC_LEVELS + * @reasoncode RC_EC_MISMATCH + * @devdesc Found a slave processor whose EC level + * did not match the master + * @custdesc Incompatible Processor Chip Levels + * @userdata1[00:31] HUID of slave chip + * @userdata1[32:63] EC level of slave chip + * @userdata2[00:31] HUID of master chip + * @userdata2[32:63] EC level of master chip + */ + const uint64_t userdata1 = + (static_cast<uint64_t>(get_huid(l_chip)) << 32) | static_cast<uint64_t>(l_ecToCompare); + const uint64_t userdata2 = + (static_cast<uint64_t>(l_masterHuid) << 32) | static_cast<uint64_t>(l_masterEc); - sys->setAttr<TARGETING::ATTR_EFFECTIVE_EC>(l_lowestEC); + l_err = hwasError(ERRL_SEV_UNRECOVERABLE, + MOD_VALIDATE_EC_LEVELS, + RC_EC_MISMATCH, + userdata1, + userdata2); + // call out the procedure to find the deconfigured part. + //TODO SW410022 Add HWSV support for platHwasErrorAddHWCallout +// platHwasErrorAddHWCallout( l_err, +// l_chip, +// SRCI_PRIORITY_HIGH, +// NO_DECONFIG, +// GARD_NULL); + // if we already have an error, link this one to the earlier; + // if not, set the common plid + hwasErrorUpdatePlid(l_err, l_commonPlid); + errlCommit(l_err, HWAS_COMP_ID); + //Do not break, we want to find all mismatches + } + } }while(0); - HWAS_INF("calculateEffectiveEC exit"); - return; + if(l_commonPlid) + { + HWAS_ERR("validateProcessorEcLevels:: One or more slave processor's EC level did not match master, check error logs"); + + /*@ + * @errortype + * @severity ERRL_SEV_UNRECOVERABLE + * @moduleid MOD_VALIDATE_EC_LEVELS + * @reasoncode RC_FAILED_EC_VALIDATION + * @devdesc Found one or more slave processor whose EC level + * did not match the master + * @custdesc Incompatible Processor Chip Levels + * @userdata1[00:64] Number of Procs + */ + const uint64_t userdata1 = + static_cast<uint64_t>(l_procChips.size()); + 
const uint64_t userdata2 = + (static_cast<uint64_t>(l_masterHuid) << 32) | static_cast<uint64_t>(l_masterEc); + + l_err = hwasError(ERRL_SEV_UNRECOVERABLE, + MOD_VALIDATE_EC_LEVELS, + RC_FAILED_EC_VALIDATION, + userdata1, + userdata2); + + // link this error to the earlier errors; + hwasErrorUpdatePlid(l_err, l_commonPlid); + } + + HWAS_INF("validateProcessorEcLevels exit"); + return l_err; -} //calculateEffectiveEC +} //validateProccesorEcLevels errlHndl_t markDisabledMcas() { diff --git a/src/usr/hwas/hwasPlatError.C b/src/usr/hwas/hwasPlatError.C index 716e9541b..8c53ccdb6 100644 --- a/src/usr/hwas/hwasPlatError.C +++ b/src/usr/hwas/hwasPlatError.C @@ -5,7 +5,9 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* COPYRIGHT International Business Machines Corp. 2012,2014 */ +/* Contributors Listed Below - COPYRIGHT 2012,2017 */ +/* [+] International Business Machines Corp. */ +/* */ /* */ /* Licensed under the Apache License, Version 2.0 (the "License"); */ /* you may not use this file except in compliance with the License. */ @@ -57,6 +59,16 @@ void hwasErrorAddProcedureCallout(errlHndl_t & io_errl, i_priority); } +void platHwasErrorAddHWCallout(errlHndl_t & io_errl, + const TARGETING::ConstTargetHandle_t i_target, + const HWAS::callOutPriority i_priority, + const HWAS::DeconfigEnum i_deconfigState, + const HWAS::GARD_ErrorType i_gardErrorType) +{ + io_errl->addHwCallout(i_target, i_priority, + i_deconfigState, i_gardErrorType); +} + void hwasErrorUpdatePlid(errlHndl_t & io_errl, uint32_t & io_plid) { |