summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- src/include/usr/hwas/common/hwas.H 11
-rw-r--r-- src/include/usr/hwas/common/hwasCommon.H 11
-rw-r--r-- src/include/usr/hwas/common/hwas_reasoncodes.H 3
-rw-r--r-- src/usr/hwas/common/hwas.C 140
-rw-r--r-- src/usr/hwas/hwasPlatError.C 14
5 files changed, 147 insertions, 32 deletions
diff --git a/src/include/usr/hwas/common/hwas.H b/src/include/usr/hwas/common/hwas.H
index 138a51887..7289b8ecb 100644
--- a/src/include/usr/hwas/common/hwas.H
+++ b/src/include/usr/hwas/common/hwas.H
@@ -138,6 +138,17 @@ errlHndl_t checkMinimumHardware(
bool *o_bootable = NULL);
/**
+* @brief Loop through processors and make sure all have the same EC level;
+* create an error log for any slave processor that does not match
+* the master's EC level
+*
+*
+* @return errlHndl_t Error returned will be a summary of all errors that
+* occurred during the procedure, all PLIDs should match
+*/
+errlHndl_t validateProcessorEcLevels();
+
+/**
* @brief Struct representing a particular target. Used by
* invokePresentByAssoc to populate a vector of TargetInfo's for subsequent
* use by deconfigPresentByAssoc
diff --git a/src/include/usr/hwas/common/hwasCommon.H b/src/include/usr/hwas/common/hwasCommon.H
index 7462571b4..946d2c7a8 100644
--- a/src/include/usr/hwas/common/hwasCommon.H
+++ b/src/include/usr/hwas/common/hwasCommon.H
@@ -348,6 +348,17 @@ void hwasErrorAddProcedureCallout(errlHndl_t & io_errl,
const HWAS::callOutPriority i_priority);
/**
+ * @brief wrapper function to add a hardware callout to an error log in a
+ * platform-specific manner.
+ *
+ */
+void platHwasErrorAddHWCallout(errlHndl_t & io_errl,
+ const TARGETING::ConstTargetHandle_t i_target,
+ const HWAS::callOutPriority i_priority,
+ const HWAS::DeconfigEnum i_deconfigState,
+ const HWAS::GARD_ErrorType i_gardErrorType);
+
+/**
* @brief wrapper function to update the plid in a platform-specific manner.
*
* If io_plid is non-zero then io_errl is updated with io_plid
diff --git a/src/include/usr/hwas/common/hwas_reasoncodes.H b/src/include/usr/hwas/common/hwas_reasoncodes.H
index 729743162..09d90a796 100644
--- a/src/include/usr/hwas/common/hwas_reasoncodes.H
+++ b/src/include/usr/hwas/common/hwas_reasoncodes.H
@@ -33,6 +33,7 @@ namespace HWAS
MOD_PROCESS_CALLOUT = 0x02,
MOD_CHECK_MIN_HW = 0x03,
MOD_DECONFIG_TARGETS_FROM_GARD = 0x04,
+ MOD_VALIDATE_EC_LEVELS = 0x05,
};
enum HwasReasonCode
@@ -55,6 +56,8 @@ namespace HWAS
RC_SYSAVAIL_MISSING_CRITICAL_RESOURCE = HWAS_COMP_ID | 0x0B,
RC_SYSAVAIL_NO_MCAS_FUNC = HWAS_COMP_ID | 0x0C,
RC_SYSAVAIL_NO_NX_FUNC = HWAS_COMP_ID | 0x0E,
+ RC_EC_MISMATCH = HWAS_COMP_ID | 0x0F,
+ RC_FAILED_EC_VALIDATION = HWAS_COMP_ID | 0x10,
};
};
diff --git a/src/usr/hwas/common/hwas.C b/src/usr/hwas/common/hwas.C
index 30f3f8993..0ea59b4b4 100644
--- a/src/usr/hwas/common/hwas.C
+++ b/src/usr/hwas/common/hwas.C
@@ -445,9 +445,17 @@ errlHndl_t discoverTargets()
}
}
- //Now that all proc's are created and functional, we need to
- //calculate the system EFFECTIVE_EC
- calculateEffectiveEC();
+ //Check all of the slave processors' EC levels to ensure they match the
+ //master processor's EC level.
+ //The function returns an error log linked to all error logs created
+ //by this function, as it could detect multiple procs w/
+ //bad ECs and will make a log for each
+ errl = validateProcessorEcLevels();
+ if (errl)
+ {
+ HWAS_ERR("discoverTargets: validateProcessorEcLevels failed");
+ break;
+ }
// Potentially reduce the number of ec/core units that are present
// based on fused mode
@@ -2856,50 +2864,120 @@ void setChipletGardsOnProc(TARGETING::Target * i_procTarget)
i_procTarget->setAttr<TARGETING::ATTR_EC_GARD>(l_ecGard);
}//setChipletGardsOnProc
-void calculateEffectiveEC()
+errlHndl_t validateProcessorEcLevels()
{
- HWAS_INF("calculateEffectiveEC entry");
-
+ HWAS_INF("validateProcessorEcLevels entry");
+ errlHndl_t l_err = nullptr;
+ uint32_t l_commonPlid = 0;
+ TARGETING::ATTR_EC_type l_masterEc = 0;
+ TARGETING::ATTR_EC_type l_ecToCompare = 0;
+ TARGETING::ATTR_HUID_type l_masterHuid = 0;
+ TARGETING::TargetHandleList l_procChips;
+ Target* l_pMasterProc = NULL;
do
{
- //true => FSP present. Only run this on non-FSP systems
- TARGETING::Target * sys = NULL;
- TARGETING::targetService().getTopLevelTarget( sys );
- TARGETING::SpFunctions spfuncs;
- if( sys &&
- sys->tryGetAttr<TARGETING::ATTR_SP_FUNCTIONS>(spfuncs) &&
- spfuncs.baseServices )
+ //Get all functional chips
+ getAllChips(l_procChips, TYPE_PROC);
+
+ // check for functional Master Proc on this node
+ l_err = targetService().queryMasterProcChipTargetHandle(l_pMasterProc);
+
+ //queryMasterProcChipTargetHandle will check for null, make sure
+ //there was no problem finding the master proc
+ if(l_err)
{
+ HWAS_ERR( "validateProcessorEcLevels:: Unable to find master proc");
+ //Don't commit the error just let it get returned from function
break;
}
- //Get all functional chips
- TARGETING::TargetHandleList l_procList;
- getAllChips(l_procList, TYPE_PROC);
+ //Get master info and store it for comparing later
+ l_masterEc = l_pMasterProc->getAttr<TARGETING::ATTR_EC>();
+ l_masterHuid = get_huid(l_pMasterProc);
- //Assume lowest EC among all functional processor chips is 0xFF
- TARGETING::ATTR_EC_type l_lowestEC = 0xFF;
-
- //Loop through all functional procs and find the lowest EC
- for(TargetHandleList::const_iterator proc = l_procList.begin();
- proc != l_procList.end(); ++proc)
+ //Loop through all functional procs and create error logs
+ //for any processors whose EC does not match the master
+ for(const auto & l_chip : l_procChips)
{
- if((*proc)->getAttr<TARGETING::ATTR_EC>() < l_lowestEC)
+ l_ecToCompare = l_chip->getAttr<TARGETING::ATTR_EC>();
+ if(l_ecToCompare != l_masterEc)
{
- l_lowestEC = (*proc)->getAttr<TARGETING::ATTR_EC>();
- }
- }
+ HWAS_ERR("validateProcessorEcLevels:: Slave Proc EC level not does not match master, "
+ "this is an unrecoverable error.. system will shut down");
- HWAS_INF("Lowest functional proc chip EC = 0x%llx",l_lowestEC);
+ /*@
+ * @errortype
+ * @severity ERRL_SEV_UNRECOVERABLE
+ * @moduleid MOD_VALIDATE_EC_LEVELS
+ * @reasoncode RC_EC_MISMATCH
+ * @devdesc Found a slave processor whose EC level
+ * did not match the master
+ * @custdesc Incompatible Processor Chip Levels
+ * @userdata1[00:31] HUID of slave chip
+ * @userdata1[32:63] EC level of slave chip
+ * @userdata2[00:31] HUID of master chip
+ * @userdata2[32:63] EC level of master chip
+ */
+ const uint64_t userdata1 =
+ (static_cast<uint64_t>(get_huid(l_chip)) << 32) | static_cast<uint64_t>(l_ecToCompare);
+ const uint64_t userdata2 =
+ (static_cast<uint64_t>(l_masterHuid) << 32) | static_cast<uint64_t>(l_masterEc);
- sys->setAttr<TARGETING::ATTR_EFFECTIVE_EC>(l_lowestEC);
+ l_err = hwasError(ERRL_SEV_UNRECOVERABLE,
+ MOD_VALIDATE_EC_LEVELS,
+ RC_EC_MISMATCH,
+ userdata1,
+ userdata2);
+ // call out the mismatched processor chip as a hardware callout (disabled for now).
+ //TODO SW410022 Add HWSV support for platHwasErrorAddHWCallout
+// platHwasErrorAddHWCallout( l_err,
+// l_chip,
+// SRCI_PRIORITY_HIGH,
+// NO_DECONFIG,
+// GARD_NULL);
+ // if we already have an error, link this one to the earlier;
+ // if not, set the common plid
+ hwasErrorUpdatePlid(l_err, l_commonPlid);
+ errlCommit(l_err, HWAS_COMP_ID);
+ //Do not break, we want to find all mismatches
+ }
+ }
}while(0);
- HWAS_INF("calculateEffectiveEC exit");
- return;
+ if(l_commonPlid)
+ {
+ HWAS_ERR("validateProcessorEcLevels:: One or more slave processor's EC level did not match master, check error logs");
+
+ /*@
+ * @errortype
+ * @severity ERRL_SEV_UNRECOVERABLE
+ * @moduleid MOD_VALIDATE_EC_LEVELS
+ * @reasoncode RC_FAILED_EC_VALIDATION
+ * @devdesc Found one or more slave processor whose EC level
+ * did not match the master
+ * @custdesc Incompatible Processor Chip Levels
+ * @userdata1[00:63] Number of Procs
+ */
+ const uint64_t userdata1 =
+ static_cast<uint64_t>(l_procChips.size());
+ const uint64_t userdata2 =
+ (static_cast<uint64_t>(l_masterHuid) << 32) | static_cast<uint64_t>(l_masterEc);
+
+ l_err = hwasError(ERRL_SEV_UNRECOVERABLE,
+ MOD_VALIDATE_EC_LEVELS,
+ RC_FAILED_EC_VALIDATION,
+ userdata1,
+ userdata2);
+
+ // link this error to the earlier errors;
+ hwasErrorUpdatePlid(l_err, l_commonPlid);
+ }
+
+ HWAS_INF("validateProcessorEcLevels exit");
+ return l_err;
-} //calculateEffectiveEC
+} //validateProcessorEcLevels
errlHndl_t markDisabledMcas()
{
diff --git a/src/usr/hwas/hwasPlatError.C b/src/usr/hwas/hwasPlatError.C
index 716e9541b..8c53ccdb6 100644
--- a/src/usr/hwas/hwasPlatError.C
+++ b/src/usr/hwas/hwasPlatError.C
@@ -5,7 +5,9 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* COPYRIGHT International Business Machines Corp. 2012,2014 */
+/* Contributors Listed Below - COPYRIGHT 2012,2017 */
+/* [+] International Business Machines Corp. */
+/* */
/* */
/* Licensed under the Apache License, Version 2.0 (the "License"); */
/* you may not use this file except in compliance with the License. */
@@ -57,6 +59,16 @@ void hwasErrorAddProcedureCallout(errlHndl_t & io_errl,
i_priority);
}
+void platHwasErrorAddHWCallout(errlHndl_t & io_errl,
+ const TARGETING::ConstTargetHandle_t i_target,
+ const HWAS::callOutPriority i_priority,
+ const HWAS::DeconfigEnum i_deconfigState,
+ const HWAS::GARD_ErrorType i_gardErrorType)
+{
+ io_errl->addHwCallout(i_target, i_priority,
+ i_deconfigState, i_gardErrorType);
+}
+
void hwasErrorUpdatePlid(errlHndl_t & io_errl,
uint32_t & io_plid)
{
OpenPOWER on IntegriCloud