summaryrefslogtreecommitdiffstats
path: root/src/usr/hwas/common
diff options
context:
space:
mode:
authorChristian Geddes <crgeddes@us.ibm.com>2017-11-29 17:20:08 -0600
committerDaniel M. Crowell <dcrowell@us.ibm.com>2017-12-08 11:56:02 -0500
commit9f3429c045619b95782e01435c0ae760b6bfcf1b (patch)
tree7ba8f6b5f1a749994d1dfc4087dfaa5c08564321 /src/usr/hwas/common
parentce376fc3f4c70df7dd804b860f339dc15792b01f (diff)
downloadtalos-hostboot-9f3429c045619b95782e01435c0ae760b6bfcf1b.tar.gz
talos-hostboot-9f3429c045619b95782e01435c0ae760b6bfcf1b.zip
TI w/ unrecoverable error if system boots w/ mismatching processors
We do not support processor modules w/ different EC levels on the same system. The rule is that all processors must match the master processor. If this is not true then hostboot will TI. Technically we could probably still boot and just gard out the non-matching procs, but the use case for this will be lab/manufacturing. For those cases we want the technician to know right away that they put a module w/ the wrong DD level in a system. RTC: 183243 Change-Id: I09b30550edd30c8523fd2a709c7b7a83f597eab8 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/50167 Reviewed-by: Matt Derksen <mderkse1@us.ibm.com> Reviewed-by: Martin Gloff <mgloff@us.ibm.com> Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/hwas/common')
-rw-r--r--src/usr/hwas/common/hwas.C140
1 files changed, 109 insertions, 31 deletions
diff --git a/src/usr/hwas/common/hwas.C b/src/usr/hwas/common/hwas.C
index 30f3f8993..0ea59b4b4 100644
--- a/src/usr/hwas/common/hwas.C
+++ b/src/usr/hwas/common/hwas.C
@@ -445,9 +445,17 @@ errlHndl_t discoverTargets()
}
}
- //Now that all proc's are created and functional, we need to
- //calculate the system EFFECTIVE_EC
- calculateEffectiveEC();
+ //Check all of the slave processor's EC levels to ensure they match master
+ //processor's EC level.
+ //function will return error log pointing to all error logs created
+ //by this function as this function could detect multiple procs w/
+ //bad ECs and will make a log for each
+ errl = validateProcessorEcLevels();
+ if (errl)
+ {
+ HWAS_ERR("discoverTargets: validateProcessorEcLevels failed");
+ break;
+ }
// Potentially reduce the number of ec/core units that are present
// based on fused mode
@@ -2856,50 +2864,120 @@ void setChipletGardsOnProc(TARGETING::Target * i_procTarget)
i_procTarget->setAttr<TARGETING::ATTR_EC_GARD>(l_ecGard);
}//setChipletGardsOnProc
-void calculateEffectiveEC()
+errlHndl_t validateProcessorEcLevels()
{
- HWAS_INF("calculateEffectiveEC entry");
-
+ HWAS_INF("validateProcessorEcLevels entry");
+ errlHndl_t l_err = nullptr;
+ uint32_t l_commonPlid = 0;
+ TARGETING::ATTR_EC_type l_masterEc = 0;
+ TARGETING::ATTR_EC_type l_ecToCompare = 0;
+ TARGETING::ATTR_HUID_type l_masterHuid = 0;
+ TARGETING::TargetHandleList l_procChips;
+ Target* l_pMasterProc = NULL;
do
{
- //true => FSP present. Only run this on non-FSP systems
- TARGETING::Target * sys = NULL;
- TARGETING::targetService().getTopLevelTarget( sys );
- TARGETING::SpFunctions spfuncs;
- if( sys &&
- sys->tryGetAttr<TARGETING::ATTR_SP_FUNCTIONS>(spfuncs) &&
- spfuncs.baseServices )
+ //Get all functional chips
+ getAllChips(l_procChips, TYPE_PROC);
+
+ // check for functional Master Proc on this node
+ l_err = targetService().queryMasterProcChipTargetHandle(l_pMasterProc);
+
+ //queryMasterProcChipTargetHandle will check for null, make sure
+ //there was no problem finding the master proc
+ if(l_err)
{
+ HWAS_ERR( "validateProcessorEcLevels:: Unable to find master proc");
+ //Don't commit the error just let it get returned from function
break;
}
- //Get all functional chips
- TARGETING::TargetHandleList l_procList;
- getAllChips(l_procList, TYPE_PROC);
+ //Get master info and store it for comparing later
+ l_masterEc = l_pMasterProc->getAttr<TARGETING::ATTR_EC>();
+ l_masterHuid = get_huid(l_pMasterProc);
- //Assume lowest EC among all functional processor chips is 0xFF
- TARGETING::ATTR_EC_type l_lowestEC = 0xFF;
-
- //Loop through all functional procs and find the lowest EC
- for(TargetHandleList::const_iterator proc = l_procList.begin();
- proc != l_procList.end(); ++proc)
+ //Loop through all functional procs and create error logs
+ //for any processors whose EC does not match the master
+ for(const auto & l_chip : l_procChips)
{
- if((*proc)->getAttr<TARGETING::ATTR_EC>() < l_lowestEC)
+ l_ecToCompare = l_chip->getAttr<TARGETING::ATTR_EC>();
+ if(l_ecToCompare != l_masterEc)
{
- l_lowestEC = (*proc)->getAttr<TARGETING::ATTR_EC>();
- }
- }
+ HWAS_ERR("validateProcessorEcLevels:: Slave Proc EC level not does not match master, "
+ "this is an unrecoverable error.. system will shut down");
- HWAS_INF("Lowest functional proc chip EC = 0x%llx",l_lowestEC);
+ /*@
+ * @errortype
+ * @severity ERRL_SEV_UNRECOVERABLE
+ * @moduleid MOD_VALIDATE_EC_LEVELS
+ * @reasoncode RC_EC_MISMATCH
+ * @devdesc Found a slave processor whose EC level
+ * did not match the master
+ * @custdesc Incompatible Processor Chip Levels
+ * @userdata1[00:31] HUID of slave chip
+ * @userdata1[32:63] EC level of slave chip
+ * @userdata2[00:31] HUID of master chip
+ * @userdata2[32:63] EC level of master chip
+ */
+ const uint64_t userdata1 =
+ (static_cast<uint64_t>(get_huid(l_chip)) << 32) | static_cast<uint64_t>(l_ecToCompare);
+ const uint64_t userdata2 =
+ (static_cast<uint64_t>(l_masterHuid) << 32) | static_cast<uint64_t>(l_masterEc);
- sys->setAttr<TARGETING::ATTR_EFFECTIVE_EC>(l_lowestEC);
+ l_err = hwasError(ERRL_SEV_UNRECOVERABLE,
+ MOD_VALIDATE_EC_LEVELS,
+ RC_EC_MISMATCH,
+ userdata1,
+ userdata2);
+ // call out the procedure to find the deconfigured part.
+ //TODO SW410022 Add HWSV support for platHwasErrorAddHWCallout
+// platHwasErrorAddHWCallout( l_err,
+// l_chip,
+// SRCI_PRIORITY_HIGH,
+// NO_DECONFIG,
+// GARD_NULL);
+ // if we already have an error, link this one to the earlier;
+ // if not, set the common plid
+ hwasErrorUpdatePlid(l_err, l_commonPlid);
+ errlCommit(l_err, HWAS_COMP_ID);
+ //Do not break, we want to find all mismatches
+ }
+ }
}while(0);
- HWAS_INF("calculateEffectiveEC exit");
- return;
+ if(l_commonPlid)
+ {
+ HWAS_ERR("validateProcessorEcLevels:: One or more slave processor's EC level did not match master, check error logs");
+
+ /*@
+ * @errortype
+ * @severity ERRL_SEV_UNRECOVERABLE
+ * @moduleid MOD_VALIDATE_EC_LEVELS
+ * @reasoncode RC_FAILED_EC_VALIDATION
+ * @devdesc Found one or more slave processor whose EC level
+ * did not match the master
+ * @custdesc Incompatible Processor Chip Levels
+ * @userdata1[00:64] Number of Procs
+ */
+ const uint64_t userdata1 =
+ static_cast<uint64_t>(l_procChips.size());
+ const uint64_t userdata2 =
+ (static_cast<uint64_t>(l_masterHuid) << 32) | static_cast<uint64_t>(l_masterEc);
+
+ l_err = hwasError(ERRL_SEV_UNRECOVERABLE,
+ MOD_VALIDATE_EC_LEVELS,
+ RC_FAILED_EC_VALIDATION,
+ userdata1,
+ userdata2);
+
+ // link this error to the earlier errors;
+ hwasErrorUpdatePlid(l_err, l_commonPlid);
+ }
+
+ HWAS_INF("validateProcessorEcLevels exit");
+ return l_err;
-} //calculateEffectiveEC
+} //validateProccesorEcLevels
errlHndl_t markDisabledMcas()
{
OpenPOWER on IntegriCloud