diff options
author | Prachi Gupta <pragupta@us.ibm.com> | 2018-05-21 15:29:40 -0500 |
---|---|---|
committer | William G. Hoffa <wghoffa@us.ibm.com> | 2018-06-02 11:59:58 -0400 |
commit | 5815703c3be9f8830011f573a719e69553cb1b94 (patch) | |
tree | 46a7c93cb82a1736d52c7bef338519a444724677 /src/usr | |
parent | c6916a42d34bdd1c9502056740ec3a819c082099 (diff) | |
download | talos-hostboot-5815703c3be9f8830011f573a719e69553cb1b94.tar.gz talos-hostboot-5815703c3be9f8830011f573a719e69553cb1b94.zip |
Add support for missing memory behind master proc
On a PHYP-based system, when we detect that memory is missing
behind the master processor, we find a proc that has memory and set
ATTR_PROC_MEM_TO_USE to its HRMOR (encoded as that proc's fabric
group/chip ID). This commit adds that support to the HWAS common code,
because HWSV will call this function, update the SBE mbox registers,
and then IPL the system.
Change-Id: I88a6cb69aa10147365c556f9cf31014066bd3d08
CQ:SW430015
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59159
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Reviewed-by: Sachin Gupta <sgupta2m@in.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William G. Hoffa <wghoffa@us.ibm.com>
Diffstat (limited to 'src/usr')
-rw-r--r-- | src/usr/hwas/common/hwas.C | 235 | ||||
-rw-r--r-- | src/usr/isteps/istep07/call_mss_attr_update.C | 222 |
2 files changed, 305 insertions, 152 deletions
diff --git a/src/usr/hwas/common/hwas.C b/src/usr/hwas/common/hwas.C index a6fa872b5..c00d8cf78 100644 --- a/src/usr/hwas/common/hwas.C +++ b/src/usr/hwas/common/hwas.C @@ -87,6 +87,24 @@ bool compareAffinity(const TargetInfo t1, const TargetInfo t2) return t1.affinityPath < t2.affinityPath; } +/* + * @brief This function takes in proc target and returns group/chip id + * in the following bit format: GGGG CCC + * where G = Group Id and C = Chip Id + * + * @param[in] i_proc: proc target + * @retval: chip info including group and chip id + */ +uint64_t getGroupChipIdInfo (TargetHandle_t i_proc) +{ + auto l_grp_id = i_proc->getAttr<ATTR_FABRIC_GROUP_ID>(); + auto l_chip_id = i_proc->getAttr<ATTR_FABRIC_CHIP_ID>(); + + //Chip ID is three bits long, therefore, shift group id + //by 3 and OR it with chip id + return ((l_grp_id << 3) | l_chip_id); +} + /** * @brief simple helper fn to get and set hwas state to poweredOn, * present, functional @@ -290,6 +308,223 @@ errlHndl_t disableOBUSes() return l_err; } +errlHndl_t update_proc_mem_to_use (const Target* i_node) +{ + errlHndl_t l_errl {nullptr}; + TargetHandle_t l_masterProcTarget {nullptr}; + + do + { + //Get master proc + l_errl = + targetService().queryMasterProcChipTargetHandle(l_masterProcTarget, + i_node); + if (l_errl) + { + HWAS_ERR("update_proc_mem_to_use: unable to get master proc"); + break; + } + + + //Check if this processor has missing memory + //If yes, then get the HRMOR of the proc we want to use the mem of + uint8_t l_proc_mem_to_use = l_masterProcTarget->getAttr + <ATTR_PROC_MEM_TO_USE>(); + uint8_t l_proc_mem_to_use_save = l_proc_mem_to_use; + bool l_found_missing_mem = false; + l_errl = check_for_missing_memory(i_node, l_proc_mem_to_use, + l_found_missing_mem); + if (l_errl) + { + HWAS_ERR("update_proc_mem_to_use: unable to check for missing mem"); + break; + } + + //We found missing memory behind master proc, but + //check_for_missing_memory didn't update proc_mem_to_use + //probably because there 
are no other procs with memory, + //create an error. + if (l_found_missing_mem && (l_proc_mem_to_use==l_proc_mem_to_use_save)) + { + HWAS_ERR("update_proc_mem_to_use: ATTR_PROC_MEM_TO_USE didn't get" + " updated even though we were missing mem behind master proc"); + + /*@ + * @errortype + * @severity ERRL_SEV_UNRECOVERABLE + * @moduleid MOD_UPDATE_PROC_MEM_TO_USE + * @reasoncode RC_NO_UPDATE_WHEN_MEM_MISSING + * @devdesc No procs found with valid memory + * @custdesc A problem occurred during the IPL of + * the system: No memory found + * @userdata1 Saved value of ATTR_PROC_MEM_TO_USE + * @userdata2 Updated value of ATTR_PROC_MEM_TO_USE + */ + l_errl = hwasError(ERRL_SEV_UNRECOVERABLE, + MOD_UPDATE_PROC_MEM_TO_USE, + RC_NO_UPDATE_WHEN_MEM_MISSING, + l_proc_mem_to_use_save, + l_proc_mem_to_use); + + hwasErrorAddProcedureCallout(l_errl, + EPUB_PRC_FIND_DECONFIGURED_PART, + SRCI_PRIORITY_HIGH); + break; + + } + + //set PROC_MEM_TO_USE to the group/chip id of the proc we want to + //use the mem of + //get all procs behind the input node + TargetHandleList l_procs; + getChildAffinityTargetsByState( l_procs, + i_node, + CLASS_CHIP, + TYPE_PROC, + UTIL_FILTER_ALL); + for (auto & l_proc : l_procs) + { + l_proc->setAttr<ATTR_PROC_MEM_TO_USE>(l_proc_mem_to_use); + } + + } while (0); + + return l_errl; +} + +errlHndl_t check_for_missing_memory (const Target* i_node, + uint8_t & io_proc_mem_to_use, + bool & o_found_missing_mem) +{ + + errlHndl_t l_errl {nullptr}; + o_found_missing_mem = true; + + do + { + ///////////////////////////////////////////////////////////// + //Step 1 -- Figure out the lowest group/chip id proc that has + // memory + ///////////////////////////////////////////////////////////// + //get all procs behind the input node + TargetHandleList l_procs; + getChildAffinityTargetsByState( l_procs, + i_node, + CLASS_CHIP, + TYPE_PROC, + UTIL_FILTER_FUNCTIONAL); + + //sort based on group/chip id. So, we can deterministically + //pick the processor with memory. 
This will also help guarantee + //that we will attempt to use master (or altmaster) proc's memory + //first before using slave proc's memory. + std::sort(l_procs.begin(), l_procs.end(), + [] (TargetHandle_t a, TargetHandle_t b) + { + return getGroupChipIdInfo(a) < getGroupChipIdInfo(b); + }); + + uint8_t l_temp_proc_mem_to_use = io_proc_mem_to_use; + + //find a processor that has dimms + for (auto & l_proc : l_procs) + { + TargetHandleList l_funcDimms; + getChildAffinityTargetsByState( l_funcDimms, + l_proc, + CLASS_LOGICAL_CARD, + TYPE_DIMM, + UTIL_FILTER_FUNCTIONAL); + + //Pick the first proc we find with dimms + if (l_funcDimms.size() > 0) + { + l_temp_proc_mem_to_use = getGroupChipIdInfo(l_proc); + break; + } + + } + + ///////////////////////////////////////////////////////////// + //Step 2 -- Get the proc we are currently using the memory of + // and check if it has memory + ///////////////////////////////////////////////////////////// + //get the proc pointed by PROC_MEM_TO_USE and check + //if there is memory behind that proc. We rely on the current + //value of PROC_MEM_TO_USE, so, we don't change our answer + //unnecessarily (in cases when both master proc and altmaster + //have memory) + auto l_grp = (io_proc_mem_to_use >> 3); + auto l_chip = (io_proc_mem_to_use & 0x07); // last three bits are chipId + PredicateAttrVal<ATTR_FABRIC_GROUP_ID> l_predGrp (l_grp); + PredicateAttrVal<ATTR_FABRIC_CHIP_ID> l_predChip (l_chip); + PredicateCTM l_predProc (CLASS_CHIP, TYPE_PROC); + PredicateIsFunctional l_isFunctional; + PredicatePostfixExpr l_procCheckExpr; + + l_procCheckExpr.push(&l_predProc).push(&l_isFunctional). 
+ push(&l_predGrp).push(&l_predChip).And().And().And(); + + TargetHandleList l_procMemUsedCurrently; + targetService().getAssociated(l_procMemUsedCurrently, + i_node, + TargetService::CHILD_BY_AFFINITY, + TargetService::IMMEDIATE, + &l_procCheckExpr); + + HWAS_INF("check_for_missing_memory: looking for a proc with " + "grp=0x%x chip=0x%x, found %d procs", + l_grp, l_chip, l_procMemUsedCurrently.size()); + + if (l_procMemUsedCurrently.size() >= 1) + { + //found proc + //Check if proc whose memory we are currently using has dimms + TargetHandleList l_funcDimms; + getChildAffinityTargetsByState( l_funcDimms, + l_procMemUsedCurrently[0], + CLASS_LOGICAL_CARD, + TYPE_DIMM, + UTIL_FILTER_FUNCTIONAL); + if (l_funcDimms.size() > 0) + { + //we found dimms behind the proc we are currently using + o_found_missing_mem = false; + } + } + + + ///////////////////////////////////////////////////////////// + //Step 3-- If a proc with lower group/chip id has memory or + // there is no memory behind the currently used proc, + // then we update the proc_mem_to_use + //NOTE: This ensures that if someone replaces the dimm on a lowered + // number proc, then we can fall back to that lowered number + // proc. Also, it makes sure that we are updating only when + // current proc_mem_to_use doesn't have memory or it's not + // pointing to a valid proc. 
+ ///////////////////////////////////////////////////////////// + if ((l_temp_proc_mem_to_use < io_proc_mem_to_use) + || (o_found_missing_mem)) + { + HWAS_INF("check_for_missing_memory: found a need to switch" + " PROC_MEM_TO_USE from 0x%x to 0x%x", + io_proc_mem_to_use, l_temp_proc_mem_to_use); + io_proc_mem_to_use = l_temp_proc_mem_to_use; + } + else + { + HWAS_INF("check_for_missing_memory: kept PROC_MEM_TO_USE same" + " 0x%x", io_proc_mem_to_use); + } + + + } while (0); + + return l_errl; +} + + errlHndl_t discoverTargets() { HWAS_DBG("discoverTargets entry"); diff --git a/src/usr/isteps/istep07/call_mss_attr_update.C b/src/usr/isteps/istep07/call_mss_attr_update.C index f4eb9fb1d..eae864a7f 100644 --- a/src/usr/isteps/istep07/call_mss_attr_update.C +++ b/src/usr/isteps/istep07/call_mss_attr_update.C @@ -52,6 +52,9 @@ #include <targeting/common/commontargeting.H> #include <targeting/common/utilFilter.H> +// HWAS +#include <hwas/common/hwas.H> + // fapi2 support #include <fapi2.H> #include <fapi2/target.H> @@ -119,9 +122,6 @@ errlHndl_t check_proc0_memory_config(IStepError & io_istepErr) TargetHandleList l_procsList; getAllChips(l_procsList, TYPE_PROC); - TARGETING::Target * l_sys = NULL; - TARGETING::targetService().getTopLevelTarget(l_sys); - // Loop through all procs getting IDs procIds_t l_procIds[l_procsList.size()]; uint8_t i = 0; @@ -190,9 +190,6 @@ errlHndl_t check_proc0_memory_config(IStepError & io_istepErr) TargetService::ALL, &l_checkExprFunctional); - TARGETING::ATTR_PAYLOAD_KIND_type payload_kind = - l_sys->getAttr<TARGETING::ATTR_PAYLOAD_KIND>(); - TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, "check_proc0_memory_config: %d functional dimms behind proc0 " "%.8X", @@ -232,42 +229,6 @@ errlHndl_t check_proc0_memory_config(IStepError & io_istepErr) continue; } - // If our master proc doesn't have memory, and we're on a phyp - // system, we want to use this proc's memory instead. -#if 0 - // TODO RTC: 181139. 
This support can not be put into place - // until we're able to use the Get Capabilities function - TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, - "check_proc0_memory_config: Payload kind is %llx", - payload_kind); - if(payload_kind == TARGETING::PAYLOAD_KIND_PHYP) - { - TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, - "check_proc0_memory_config: We are in a PHYP system, " - "setting master to use alt memory from proc %llx.", - get_huid(l_procIds[i].proc)); - - uint8_t l_chipID = l_procIds[i].chipId; - uint8_t l_groupID = l_procIds[i].groupId; - - TargetHandle_t l_masterProc = NULL; - targetService().masterProcChipTargetHandle(l_masterProc); - - uint8_t l_proc_memory = l_masterProc->getAttr< - TARGETING::ATTR_PROC_MEM_TO_USE>(); - - if( l_proc_memory != ((l_groupID <<3) | l_chipID)) - { - l_masterProc->setAttr<TARGETING::ATTR_PROC_MEM_TO_USE>( - ((l_groupID << 3) | l_chipID)); - - l_updateNeeded = true; - // Leave loop after switching memory - break; - } - }else - { -#endif // Use this proc for swapping memory with proc0 l_victim = i; @@ -293,106 +254,50 @@ errlHndl_t check_proc0_memory_config(IStepError & io_istepErr) // Leave loop after swapping memory break; -#if 0 - } -#endif - } - - if(payload_kind != TARGETING::PAYLOAD_KIND_PHYP) - { - // Check that a victim was found - assert( l_victim < l_procsList.size(), "No swap match found" ); } - } -#if 0 - // TODO RTC: 181139. 
This support can not be put into place - // until we're able to use the Get Capabilities function - else if( !(l_dimms.empty()) && - (payload_kind == TARGETING::PAYLOAD_KIND_PHYP) ) - { - // If the memory isn't empty, and we're on a phyp system, - // we want to verify that we're set up to use the correct memory - uint8_t l_chipID = l_procIds[i].chipId; - uint8_t l_groupID = l_procIds[i].groupId; - - TargetHandle_t l_masterProc = NULL; - targetService().masterProcChipTargetHandle(l_masterProc); - uint8_t l_proc_memory = - l_masterProc->getAttr<TARGETING::ATTR_PROC_MEM_TO_USE>(); - - if( l_proc_memory != ((l_groupID <<3) | l_chipID)) - { - l_masterProc->setAttr<TARGETING::ATTR_PROC_MEM_TO_USE>( - ((l_groupID << 3) | l_chipID)); - - l_updateNeeded = true; - } } -#endif - if(payload_kind != TARGETING::PAYLOAD_KIND_PHYP) + // Loop through all procs detecting that IDs are set correctly + for (i = 0; i < l_procsList.size(); i++) { -#if 0 - // TODO RTC: 181139. This support can not be put into place - // until we're able to use the Get Capabilities function - TargetHandle_t l_masterProc = NULL; - targetService().masterProcChipTargetHandle(l_masterProc); - - // Check the attribute, and default it to proc0 if - // it doesn't match. 
- uint8_t l_proc_memory = - l_masterProc->getAttr<TARGETING::ATTR_PROC_MEM_TO_USE>(); - - if( l_proc_memory != 0) + TRACDCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "check_proc0_memory_config: Compare settings for " + "Proc %.8X\n" + " groupIdEff = %d, groupId = %d\n" + " chipIdEff = %d, chipId = %d", + get_huid(l_procIds[i].proc), + l_procIds[i].groupIdEff, + l_procIds[i].groupId, + l_procIds[i].chipIdEff, + l_procIds[i].chipId); + + if((l_procIds[i].groupId != l_procIds[i].groupIdEff) || + (l_procIds[i].chipId != l_procIds[i].chipIdEff) ) { - l_masterProc->setAttr<TARGETING::ATTR_PROC_MEM_TO_USE>(0); + // Update attributes + (l_procIds[i].proc)-> + setAttr<ATTR_PROC_EFF_FABRIC_GROUP_ID>(l_procIds[i].groupId); + (l_procIds[i].proc)-> + setAttr<ATTR_PROC_EFF_FABRIC_CHIP_ID>(l_procIds[i].chipId); l_updateNeeded = true; } -#endif - // Loop through all procs detecting that IDs are set correctly - for (i = 0; i < l_procsList.size(); i++) - { - TRACDCOMP(ISTEPS_TRACE::g_trac_isteps_trace, - "check_proc0_memory_config: Compare settings for " - "Proc %.8X\n" - " groupIdEff = %d, groupId = %d\n" - " chipIdEff = %d, chipId = %d", - get_huid(l_procIds[i].proc), - l_procIds[i].groupIdEff, - l_procIds[i].groupId, - l_procIds[i].chipIdEff, - l_procIds[i].chipId); - - if((l_procIds[i].groupId != l_procIds[i].groupIdEff) || - (l_procIds[i].chipId != l_procIds[i].chipIdEff) ) - { - // Update attributes - (l_procIds[i].proc)-> - setAttr<ATTR_PROC_EFF_FABRIC_GROUP_ID>(l_procIds[i].groupId); - (l_procIds[i].proc)-> - setAttr<ATTR_PROC_EFF_FABRIC_CHIP_ID>(l_procIds[i].chipId); - - l_updateNeeded = true; - } - - TRACDCOMP(ISTEPS_TRACE::g_trac_isteps_trace, - "check_proc0_memory_config: Current attribute " - "settings for Proc %.8X\n" - " ATTR_PROC_EFF_FABRIC_GROUP_ID = %d\n" - " ATTR_FABRIC_GROUP_ID = %d\n" - " ATTR_PROC_EFF_FABRIC_CHIP_ID = %d\n" - " ATTR_FABRIC_CHIP_ID = %d", - get_huid(l_procIds[i].proc), - (l_procIds[i].proc)-> - getAttr<ATTR_PROC_EFF_FABRIC_GROUP_ID>(), - 
(l_procIds[i].proc)->getAttr<ATTR_FABRIC_GROUP_ID>(), - (l_procIds[i].proc)-> - getAttr<ATTR_PROC_EFF_FABRIC_CHIP_ID>(), - (l_procIds[i].proc)->getAttr<ATTR_FABRIC_CHIP_ID>()); - } + TRACDCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "check_proc0_memory_config: Current attribute " + "settings for Proc %.8X\n" + " ATTR_PROC_EFF_FABRIC_GROUP_ID = %d\n" + " ATTR_FABRIC_GROUP_ID = %d\n" + " ATTR_PROC_EFF_FABRIC_CHIP_ID = %d\n" + " ATTR_FABRIC_CHIP_ID = %d", + get_huid(l_procIds[i].proc), + (l_procIds[i].proc)-> + getAttr<ATTR_PROC_EFF_FABRIC_GROUP_ID>(), + (l_procIds[i].proc)->getAttr<ATTR_FABRIC_GROUP_ID>(), + (l_procIds[i].proc)-> + getAttr<ATTR_PROC_EFF_FABRIC_CHIP_ID>(), + (l_procIds[i].proc)->getAttr<ATTR_FABRIC_CHIP_ID>()); } if(l_updateNeeded) @@ -451,27 +356,41 @@ void* call_mss_attr_update( void *io_pArgs ) TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, "call_mss_attr_update entry"); errlHndl_t l_err = NULL; - // Check the memory on proc0 chip - l_err = check_proc0_memory_config(l_StepError); - - if (l_err) + do { - TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, - "ERROR 0x%.8X: check_proc0_memory_config", - l_err->reasonCode()); + bool l_isPhyp = TARGETING::is_phyp_load(); + bool l_spEnabled = INITSERVICE::spBaseServicesEnabled(); + + // Check the memory on master proc chip + // Use this mechanism for: + // non-phyp case or + // PHYP on OpenPower machine + if (!l_isPhyp || (l_isPhyp && !l_spEnabled)) + { + l_err = check_proc0_memory_config(l_StepError); - // Ensure istep error created and has same plid as this error - l_StepError.addErrorDetails( l_err ); - errlCommit( l_err, HWPF_COMP_ID ); - } - else - { - TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, - "SUCCESS: check_proc0_memory_config"); - } + if (l_err) + { + TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, + "ERROR 0x%.8X: check_proc0_memory_config", + l_err->reasonCode()); + + // Ensure istep error created and has same plid as this error + l_StepError.addErrorDetails( l_err ); + errlCommit( l_err, 
HWPF_COMP_ID ); + break; + } + else + { + TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, + "SUCCESS: check_proc0_memory_config"); + } + } + else + { + //TODO -- next commit adds the logic for this case + } - if (l_StepError.isNull()) - { // Get all functional MCS chiplets TARGETING::TargetHandleList l_mcsTargetList; getAllChiplets(l_mcsTargetList, TYPE_MCS); @@ -495,8 +414,7 @@ void* call_mss_attr_update( void *io_pArgs ) errlCommit( l_err, HWPF_COMP_ID ); } } - } - + } while (0); TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, "call_mss_attr_update exit" ); return l_StepError.getErrorHandle(); |