diff options
author | Prachi Gupta <pragupta@us.ibm.com> | 2018-05-31 16:31:01 -0500 |
---|---|---|
committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2018-06-06 13:59:44 -0400 |
commit | 27bf395be2cd9025abc1a48ad74a0b3bc5da97f8 (patch) | |
tree | 29bb599328e3ab98e1e105a285c9f213cf040871 | |
parent | 1db54dcc27d4061114c9466ec5fb72121420c5eb (diff) | |
download | talos-hostboot-27bf395be2cd9025abc1a48ad74a0b3bc5da97f8.tar.gz talos-hostboot-27bf395be2cd9025abc1a48ad74a0b3bc5da97f8.zip |
missing memory: istep 7 and 14 changes
There are two cases where hostboot's attention is required in istep7:
- If the HRMOR we booted with doesn't fall within the memory range of the
proc selected by PROC_MEM_TO_USE, then the SBE is old. HB will do an SBE
update and request a re-IPL
- If HB deconfigured a bunch of DIMMs in istep7 and the proc whose memory
is in use ran out of memory, then HB will request a reconfig loop
In istep14, we also added a sanity check to make sure memory is still
configured as expected, to prevent an unexpected failure after exiting
cache-contained mode.
Change-Id: I018f4ce862cc79b5d7bacbe01cc28d1d2b4fc788
CQ:SW430015
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/59696
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
-rw-r--r-- | src/include/usr/hwas/common/hwas.H | 33 | ||||
-rw-r--r-- | src/include/usr/isteps/istep_reasoncodes.H | 1 | ||||
-rw-r--r-- | src/include/usr/targeting/common/util.H | 18 | ||||
-rw-r--r-- | src/usr/hwas/common/hwas.C | 72 | ||||
-rw-r--r-- | src/usr/isteps/istep07/call_mss_attr_update.C | 132 | ||||
-rw-r--r-- | src/usr/isteps/istep14/call_proc_exit_cache_contained.C | 40 | ||||
-rw-r--r-- | src/usr/targeting/common/util.C | 92 |
7 files changed, 348 insertions, 40 deletions
diff --git a/src/include/usr/hwas/common/hwas.H b/src/include/usr/hwas/common/hwas.H index 06304c29c..120ba0b7d 100644 --- a/src/include/usr/hwas/common/hwas.H +++ b/src/include/usr/hwas/common/hwas.H @@ -258,6 +258,39 @@ errlHndl_t check_for_missing_memory (const TARGETING::Target* i_node, uint8_t & io_proc_mem_to_use, bool & o_found_missing_mem); +/* + * @brief This function takes in proc target and returns group/chip id + * in the following bit format: GGGG CCC + * where G = Group Id and C = Chip Id + * + * @param[in] i_proc: proc target + * @retval: chip info including group and chip id + */ +uint64_t getGroupChipIdInfo (TARGETING::TargetHandle_t i_proc); + +/* + * @brief This function takes in the value of ATTR_PROC_MEM_TO_USE + * and extract out group and chip id + * in the following bit format: GGGG CCC + * where G = Group Id and C = Chip Id + * + * @param[in] i_proc_mem_to_use: Value of ATTR_PROC_MEM_TO_USE + * @param[out] o_grp_id: group id + * @param[out] o_chip_id: chip id + */ +void parseProcMemToUseIntoGrpChipId (uint8_t i_proc_mem_to_use, + uint8_t & o_grp_id, + uint8_t & o_chip_id); + +/* + * @brief This function computes whether current value of + * PROC_MEM_TO_USE matches with the expected value. + * The expected value can change through the IPL because + * we might end up deconfiguring dimms. 
+ * + * @param[out] o_valid: true, if current and expected values are the same + */ +errlHndl_t check_current_proc_mem_to_use_is_still_valid (bool o_valid); }; // end namespace diff --git a/src/include/usr/isteps/istep_reasoncodes.H b/src/include/usr/isteps/istep_reasoncodes.H index aa63ce0d8..31bb8a7bb 100644 --- a/src/include/usr/isteps/istep_reasoncodes.H +++ b/src/include/usr/isteps/istep_reasoncodes.H @@ -126,6 +126,7 @@ namespace ISTEP RC_TURBO_FREQ_MISMATCH = ISTEP_COMP_ID | 0x44, RC_ULTRA_TURBO_FREQ_MISMATCH = ISTEP_COMP_ID | 0x45, RC_NEST_FREQ_MISMATCH = ISTEP_COMP_ID | 0x46, + RC_NO_VALID_MEM_CONFIG = ISTEP_COMP_ID | 0x47, }; }; diff --git a/src/include/usr/targeting/common/util.H b/src/include/usr/targeting/common/util.H index edc67b79e..479cdfa75 100644 --- a/src/include/usr/targeting/common/util.H +++ b/src/include/usr/targeting/common/util.H @@ -158,12 +158,26 @@ bool is_avp_load(void); /** * @brief Utility function to obtain the highest known address in the system */ -uint64_t get_top_mem_addr(void); +uint64_t get_top_mem_addr(); + +/** + * @brief Utility function to obtain the highest known address in a given proc + * + * @param[in] i_proc: Proc that we want to calculate the top address for + */ +uint64_t get_top_mem_addr(Target* i_proc); /** * @brief Utility function to obtain the lowest known address in the system */ -uint64_t get_bottom_mem_addr(void); +uint64_t get_bottom_mem_addr(); + +/** + * @brief Utility function to obtain the lowest known address in a given proc + * + * @param[in] i_proc: Proc that we want to calculate the bottom address for + */ +uint64_t get_bottom_mem_addr(Target* i_proc); /** * Order two processor targets by NODE_ID then CHIP_ID. 
diff --git a/src/usr/hwas/common/hwas.C b/src/usr/hwas/common/hwas.C index c00d8cf78..e9dabaae5 100644 --- a/src/usr/hwas/common/hwas.C +++ b/src/usr/hwas/common/hwas.C @@ -105,6 +105,24 @@ uint64_t getGroupChipIdInfo (TargetHandle_t i_proc) return ((l_grp_id << 3) | l_chip_id); } +/* + * @brief This function takes in the value of ATTR_PROC_MEM_TO_USE + * and extract out group and chip id + * in the following bit format: GGGG CCC + * where G = Group Id and C = Chip Id + * + * @param[in] i_proc_mem_to_use: Value of ATTR_PROC_MEM_TO_USE + * @param[out] o_grp_id: groupd id + * @param[out] o_chip_id: chip id + */ +void parseProcMemToUseIntoGrpChipId (uint8_t i_proc_mem_to_use, + uint8_t & o_grp_id, + uint8_t & o_chip_id) +{ + o_grp_id = (i_proc_mem_to_use >> 3) & 0x0F; + o_chip_id = i_proc_mem_to_use & 0x07; +} + /** * @brief simple helper fn to get and set hwas state to poweredOn, * present, functional @@ -454,8 +472,9 @@ errlHndl_t check_for_missing_memory (const Target* i_node, //value of PROC_MEM_TO_USE, so, we don't change our answer //unnecessarily (in cases when both master proc and altmaster //have memory) - auto l_grp = (io_proc_mem_to_use >> 3); - auto l_chip = (io_proc_mem_to_use & 0x07); // last three bits are chipId + uint8_t l_grp = 0; + uint8_t l_chip = 0; + parseProcMemToUseIntoGrpChipId(io_proc_mem_to_use, l_grp, l_chip); PredicateAttrVal<ATTR_FABRIC_GROUP_ID> l_predGrp (l_grp); PredicateAttrVal<ATTR_FABRIC_CHIP_ID> l_predChip (l_chip); PredicateCTM l_predProc (CLASS_CHIP, TYPE_PROC); @@ -495,7 +514,8 @@ errlHndl_t check_for_missing_memory (const Target* i_node, ///////////////////////////////////////////////////////////// - //Step 3-- If a proc with lower group/chip id has memory or + //Step 3-- If proc picked in Step1 has lower group/chip id + // than current proc_mem_to_use value or // there is no memory behind the currently used proc, // then we update the proc_mem_to_use //NOTE: This ensures that if someone replaces the dimm on a lowered @@ -524,6 
+544,52 @@ errlHndl_t check_for_missing_memory (const Target* i_node, return l_errl; } +errlHndl_t check_current_proc_mem_to_use_is_still_valid (bool o_match) +{ + errlHndl_t l_err {nullptr}; + o_match = true; + do + { + //Get the master proc to get the current value of PROC_MEM_TO_USE + TargetHandle_t l_mProc; + l_err = targetService().queryMasterProcChipTargetHandle(l_mProc); + if (l_err) + { + HWAS_ERR("ERROR: getting master proc"); + break; + } + + auto l_proc_mem_to_use = l_mProc->getAttr<ATTR_PROC_MEM_TO_USE>(); + + //Get the node target to pass to check_for_missing_memory + TargetHandleList l_nodes; + getEncResources(l_nodes, TYPE_NODE, UTIL_FILTER_FUNCTIONAL); + HWAS_ASSERT((l_nodes.size() == 1), "Only expecting 1 functional node"); + + auto l_curr_proc_mem_to_use = l_proc_mem_to_use; + bool l_found_missing_mem {false}; + l_err = HWAS::check_for_missing_memory(l_nodes[0], + l_proc_mem_to_use, + l_found_missing_mem); + if (l_err) + { + HWAS_ERR("ERROR: check_for_missing_memory"); + break; + } + + HWAS_INF("PROC_MEM_TO_USE currentVal=0x%x reComputedVal=0x%x", + l_curr_proc_mem_to_use, l_proc_mem_to_use); + + if (l_curr_proc_mem_to_use != l_proc_mem_to_use) + { + HWAS_INF("check_current_proc_mem_to_use_is_still_valid: " + "currentVal and reComputerVal don't match"); + o_match = false; + } + } while (0); + + return l_err; +} errlHndl_t discoverTargets() { diff --git a/src/usr/isteps/istep07/call_mss_attr_update.C b/src/usr/isteps/istep07/call_mss_attr_update.C index eae864a7f..e05a2b3c4 100644 --- a/src/usr/isteps/istep07/call_mss_attr_update.C +++ b/src/usr/isteps/istep07/call_mss_attr_update.C @@ -44,6 +44,7 @@ #include <errl/errludtarget.H> #include <initservice/isteps_trace.H> #include <initservice/initserviceif.H> +#include <initservice/initsvcreasoncodes.H> // SBE #include <sbeif.H> @@ -65,6 +66,9 @@ // HWP #include <p9_mss_attr_update.H> +//HRMOR +#include <sys/misc.h> + namespace ISTEP_07 { @@ -345,6 +349,82 @@ errlHndl_t 
check_proc0_memory_config(IStepError & io_istepErr) return l_err; } // end check_proc0_memory_config() +void check_hrmor_within_range (ATTR_PROC_MEM_TO_USE_type i_proc_mem_to_use, + IStepError & io_StepError) +{ + errlHndl_t l_err {nullptr}; + + //extract group and chip id from PROC_MEM_TO_USE attribute + uint8_t l_grp {0}; + uint8_t l_chip {0}; + HWAS::parseProcMemToUseIntoGrpChipId(i_proc_mem_to_use, l_grp, l_chip); + + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "check_hrmor_within_range: PROC_MEM_TO_USE=0x%x,Grp=0x%x,Chip=0x%x", + i_proc_mem_to_use, l_grp, l_chip); + + //Find a proc that matches current proc_mem_to_use's group/chip id + TargetHandleList l_procs; + getAllChips(l_procs, TYPE_PROC); + + TargetHandle_t l_procTgtMemUsed {nullptr}; + for (auto & l_proc : l_procs) + { + auto l_proc_grp = l_proc->getAttr<ATTR_FABRIC_GROUP_ID>(); + auto l_proc_chip = l_proc->getAttr<ATTR_FABRIC_CHIP_ID>(); + + if ((l_proc_grp == l_grp) && (l_proc_chip == l_chip)) + { + l_procTgtMemUsed = l_proc; + break; + } + } + + + //if we find it, then we check that the hrmor in within + //range of configured mem. 
+ // + //Otherwise, we want to go down the sbe upate and TI path + bool l_sbeUpdateTIRequired = true; + if (l_procTgtMemUsed) + { + auto l_lowest_mem_addr = get_bottom_mem_addr(l_procTgtMemUsed); + auto l_highest_mem_addr = get_top_mem_addr(l_procTgtMemUsed); + auto l_hrmor = cpu_spr_value(CPU_SPR_HRMOR); + + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "check_hrmor_within_range: proc picked: 0x%x, lowest addr=0x%x, " + "highest addr=0x%x HRMOR=0x%x", get_huid(l_procTgtMemUsed), + l_lowest_mem_addr, l_highest_mem_addr, l_hrmor); + + if ((l_lowest_mem_addr <= l_hrmor) && (l_hrmor < l_highest_mem_addr)) + { + //we are good -- no need for TI + l_sbeUpdateTIRequired = false; + } + } + + if (l_sbeUpdateTIRequired) + { + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "check_hrmor_within_range: sbe is downleveled - update required"); + + // Rebuild SBE image and trigger reconfig loop + l_err = SBE::updateProcessorSbeSeeproms(); + if( l_err ) + { + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "ERROR: updateProcessorSbeSeeproms"); + io_StepError.addErrorDetails(l_err); + errlCommit(l_err, HWPF_COMP_ID); + } + } + else + { + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "check_hrmor_within_range: sbe update is NOT required"); + } +} // // Wrapper function to call mss_attr_update @@ -386,9 +466,59 @@ void* call_mss_attr_update( void *io_pArgs ) "SUCCESS: check_proc0_memory_config"); } } + // For phyp based systems on FSP, HWSV will call + // HWAS::update_proc_mem_to_use function to determine the new + // proc to use for memory and update SBE scratch registers as + // necessary. HB just needs to tell HWSV to do that. There are + // only two cases where HB will want HWSV to attempt the above + // logic. + // 1) HRMOR that we booted doesn't match the current value + // of PROC_MEM_TO_USE attribute. This can only happen if the + // SBE is really old. So, force an sbe update and TI. + // 2) HB deconfigured a bunch of dimms in istep7. 
In this case, + // HB computes new value of PROC_MEM_TO_USE and checks it + // against current value of PROC_MEM_TO_USE. If they don't + // match, HB will force a reconfig loop TI else { - //TODO -- next commit adds the logic for this case + + ////////////////////////////////////////////////////////////////// + //Case1 from above, where HRMOR doesn't fall in configured mem range + //of proc pointed by ATTR_PROC_MEM_TO_USE + ////////////////////////////////////////////////////////////////// + + //Get the master proc to get the current value of PROC_MEM_TO_USE + TargetHandle_t l_mProc; + l_err = targetService().queryMasterProcChipTargetHandle(l_mProc); + if (l_err) + { + TRACFCOMP (ISTEPS_TRACE::g_trac_isteps_trace, + "ERROR: getting master proc"); + l_StepError.addErrorDetails(l_err); + errlCommit( l_err, HWPF_COMP_ID ); + break; + } + + auto l_proc_mem_to_use = l_mProc->getAttr<ATTR_PROC_MEM_TO_USE>(); + check_hrmor_within_range(l_proc_mem_to_use, l_StepError); + + ////////////////////////////////////////////////////////////////// + //Case2 from above, where HB deconfigured dimms, so, we need to + //recompute PROC_MEM_TO_USE and if it is not the same TI + ////////////////////////////////////////////////////////////////// + bool l_valid {true}; + l_err=HWAS::check_current_proc_mem_to_use_is_still_valid (l_valid); + if (l_err || !l_valid) + { + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "ERROR: check_current_proc_mem_to_use_is_still_valid" + " going down for a reconfig loop"); + //We deconfigured a bunch of dimms and the answer + //changed for which proc's memory to use. 
Trigger + //reconfig loop TI + INITSERVICE::doShutdown(INITSERVICE::SHUTDOWN_DO_RECONFIG_LOOP); + + } } // Get all functional MCS chiplets diff --git a/src/usr/isteps/istep14/call_proc_exit_cache_contained.C b/src/usr/isteps/istep14/call_proc_exit_cache_contained.C index 0d7699179..999b4bb2d 100644 --- a/src/usr/isteps/istep14/call_proc_exit_cache_contained.C +++ b/src/usr/isteps/istep14/call_proc_exit_cache_contained.C @@ -29,6 +29,7 @@ #include <isteps/hwpisteperror.H> #include <initservice/isteps_trace.H> #include <initservice/taskargs.H> +#include <initservice/initserviceif.H> // targeting support #include <targeting/common/commontargeting.H> @@ -128,6 +129,45 @@ void* call_proc_exit_cache_contained (void *io_pArgs) ISTEP::RC_MIN_HW_CHECK_FAILED); } + if (!l_errl) + { + bool l_valid {true}; + l_errl = HWAS::check_current_proc_mem_to_use_is_still_valid (l_valid); + if (l_errl || !l_valid) + { + //We deconfigured a bunch of dimms and the answer + //changed for which proc's memory to use. 
Give up TI + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "ERROR: check_current_proc_mem_to_use_is_still_valid" + " going to TI"); + + if (!l_errl) + { + /*@ + * @errortype ERRL_SEV_UNRECOVERABLE + * @moduleid ISTEP::MOD_PROC_EXIT_CACHE_CONTAINED + * @reasoncode ISTEP::RC_NO_VALID_MEM_CONFIG + * @devdesc call_proc_exit_cache_contained: did not + * find valid memory configuration + * @custdesc Host firmware did not find valid + * hardware to continue the boot + */ + l_errl = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_UNRECOVERABLE, + ISTEP::MOD_PROC_EXIT_CACHE_CONTAINED, + ISTEP::RC_NO_VALID_MEM_CONFIG); + + l_errl->addProcedureCallout( + HWAS::EPUB_PRC_SP_CODE, + HWAS::SRCI_PRIORITY_HIGH ); + + l_errl->addProcedureCallout( + HWAS::EPUB_PRC_FIND_DECONFIGURED_PART, + HWAS::SRCI_PRIORITY_HIGH ); + } + } + } + uint8_t l_mpipl = 0; TARGETING::TargetHandleList l_procList; if (!l_errl) diff --git a/src/usr/targeting/common/util.C b/src/usr/targeting/common/util.C index c1be41203..26b63966e 100644 --- a/src/usr/targeting/common/util.C +++ b/src/usr/targeting/common/util.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2017 */ +/* Contributors Listed Below - COPYRIGHT 2012,2018 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -36,6 +36,7 @@ #include <targeting/common/attributes.H> #include <targeting/common/targetservice.H> #include <targeting/common/utilFilter.H> +#include <targeting/common/trace.H> namespace TARGETING { @@ -216,24 +217,36 @@ uint64_t get_top_mem_addr(void) for ( size_t proc = 0; proc < l_cpuTargetList.size(); proc++ ) { TARGETING::Target * l_pProc = l_cpuTargetList[proc]; + top_addr = std::max(top_addr,get_top_mem_addr(l_pProc)); + } - //Not checking success here as fail results in no change to - // top_addr - uint64_t l_mem_bases[8] = {0,}; - uint64_t l_mem_sizes[8] = {0,}; - l_pProc->tryGetAttr<TARGETING::ATTR_PROC_MEM_BASES>(l_mem_bases); - l_pProc->tryGetAttr<TARGETING::ATTR_PROC_MEM_SIZES>(l_mem_sizes); + } while(0); - for (size_t i=0; i< 8; i++) - { - if(l_mem_sizes[i]) //non zero means that there is memory present - { - top_addr = std::max(top_addr, - l_mem_bases[i] + l_mem_sizes[i]); - } - } + return top_addr; +} + +/** + * @brief Utility function to obtain the highest known address in a given proc + */ +uint64_t get_top_mem_addr(TargetHandle_t i_proc) +{ + uint64_t top_addr = 0; + + //Not checking success here as fail results in no change to + // top_addr + uint64_t l_mem_bases[8] = {0,}; + uint64_t l_mem_sizes[8] = {0,}; + i_proc->tryGetAttr<TARGETING::ATTR_PROC_MEM_BASES>(l_mem_bases); + i_proc->tryGetAttr<TARGETING::ATTR_PROC_MEM_SIZES>(l_mem_sizes); + + for (size_t i=0; i< 8; i++) + { + if(l_mem_sizes[i]) //non zero means that there is memory present + { + top_addr = std::max(top_addr, + l_mem_bases[i] + l_mem_sizes[i]); } - }while(0); + } return top_addr; } @@ -254,24 +267,7 @@ uint64_t get_bottom_mem_addr(void) for ( size_t proc = 0; proc < l_cpuTargetList.size(); proc++ ) { TARGETING::Target * l_pProc = l_cpuTargetList[proc]; - - uint64_t l_mem_bases[8] = {}; - uint64_t l_mem_sizes[8] = {}; - TARG_ASSERT( - l_pProc->tryGetAttr<TARGETING::ATTR_PROC_MEM_BASES>(l_mem_bases), - "Unable to get ATTR_PROC_MEM_BASES attribute"); - - 
TARG_ASSERT( - l_pProc->tryGetAttr<TARGETING::ATTR_PROC_MEM_SIZES>(l_mem_sizes), - "Unable to get ATTR_PROC_MEM_SIZES attribute"); - - for (size_t i=0; i< 8; i++) - { - if(l_mem_sizes[i]) //non zero means that there is memory present - { - bottom_addr = std::min(bottom_addr, l_mem_bases[i]); - } - } + bottom_addr = std::min(bottom_addr, get_bottom_mem_addr(l_pProc)); } }while(0); @@ -285,6 +281,34 @@ uint64_t get_bottom_mem_addr(void) } +/** + * @brief Utility function to obtain the lowest known address in a given proc + */ +uint64_t get_bottom_mem_addr(TargetHandle_t i_proc) +{ + uint64_t bottom_addr = UINT64_MAX; + + uint64_t l_mem_bases[8] = {}; + uint64_t l_mem_sizes[8] = {}; + TARG_ASSERT( + i_proc->tryGetAttr<TARGETING::ATTR_PROC_MEM_BASES>(l_mem_bases), + "Unable to get ATTR_PROC_MEM_BASES attribute"); + + TARG_ASSERT( + i_proc->tryGetAttr<TARGETING::ATTR_PROC_MEM_SIZES>(l_mem_sizes), + "Unable to get ATTR_PROC_MEM_SIZES attribute"); + + for (size_t i=0; i< 8; i++) + { + if(l_mem_sizes[i]) //non zero means that there is memory present + { + bottom_addr = std::min(bottom_addr, l_mem_bases[i]); + } + } + + return bottom_addr; +} + bool orderByNodeAndPosition( Target* i_firstProc, Target* i_secondProc) { |