From 22d6da8c6be6f5174cf9a923a80784ef1794429f Mon Sep 17 00:00:00 2001 From: Matt Derksen Date: Fri, 5 Apr 2019 16:50:18 -0500 Subject: NVDIMM info logs around update Information error logs around the start/end of individual NVDIMM updates. Also add in a 30 second retry window to nvdimm reads. Change-Id: I8f3e76e7dc811aad2817fa8c0d210c283ac61bcf Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/75678 Tested-by: Jenkins Server Tested-by: Jenkins OP Build CI Tested-by: Jenkins OP HW Tested-by: FSP CI Jenkins Reviewed-by: Christian R. Geddes --- src/usr/isteps/nvdimm/nvdimm_update.C | 122 ++++++++++++++++++++++++++++++---- src/usr/isteps/nvdimm/nvdimmdd.C | 82 ++++++++--------------- 2 files changed, 136 insertions(+), 68 deletions(-) diff --git a/src/usr/isteps/nvdimm/nvdimm_update.C b/src/usr/isteps/nvdimm/nvdimm_update.C index ee9a731e2..4f235aa6c 100644 --- a/src/usr/isteps/nvdimm/nvdimm_update.C +++ b/src/usr/isteps/nvdimm/nvdimm_update.C @@ -815,8 +815,11 @@ errlHndl_t NvdimmInstalledImage::updateImageData(NvdimmLidImage * i_lidImage) uint16_t region = 0; while (region < fw_img_total_regions) { - if (region % 10 == 0) + if (region % 100 == 0) { + TRACFCOMP(g_trac_nvdimm_upd, + "updateImage: progress code for sending region %d", + region); INITSERVICE::sendProgressCode(); } TRACUCOMP(g_trac_nvdimm_upd, "updateImage: step 10.a - region 0x%04X", @@ -1708,22 +1711,15 @@ errlHndl_t NvdimmInstalledImage::resetController() } else { - TRACUCOMP(g_trac_nvdimm_upd,"resetController: waiting 5 seconds after controller 0x%.8X reset", - TARGETING::get_huid(iv_dimm)); - - // sleep 5 seconds to allow for i2c controller to come back online - nanosleep(5,0); - - TRACUCOMP(g_trac_nvdimm_upd,"resetController: now check if NV controller is ready again", - TARGETING::get_huid(iv_dimm)); - // Now wait until NV controller is ready again after reset + // nvdimmReady will retry NACK calls l_err = nvdimmReady(iv_dimm); if (l_err) { TRACFCOMP(g_trac_nvdimm_upd,ERR_MRK"resetController: NV controller for " "NVDIMM 0x%.8X is not reporting as ready after reset", TARGETING::get_huid(iv_dimm)); + } } return l_err; @@ -1785,14 +1781,71 @@ bool NvdimmsUpdate::runUpdateUsingLid(NvdimmLidImage * i_lidImage, } else if (updateNeeded) { + // shared trace variables + const TARGETING::Target * l_nvdimm = pInstalledImage->getNvdimmTarget(); + uint64_t l_nvdimm_huid = TARGETING::get_huid(l_nvdimm); + + uint32_t l_installed_type = INVALID_TYPE; + l_err = pInstalledImage->getType(l_installed_type); + if (l_err) + { + // Continue updating other dimms + TRACFCOMP(g_trac_nvdimm_upd, + ERR_MRK"NvdimmsUpdate::runUpdateUsingLid() - " + "Unable to get nvdimm[0x%.8X] installed image type. " + "RC=0x%X, PLID=0x%.8X", l_nvdimm_huid, + ERRL_GETRC_SAFE(l_err), ERRL_GETPLID_SAFE(l_err)); + commitPredictiveNvdimmError(l_err); + l_err = nullptr; + } + + uint16_t l_oldVersion = INVALID_VERSION; + l_err = pInstalledImage->getVersion(l_oldVersion); + if (l_err) + { + // This shouldn't happen as getVersion should return a + // cached version + TRACFCOMP(g_trac_nvdimm_upd, + ERR_MRK"NvdimmsUpdate::runUpdateUsingLid() - " + "Failed to find current NVDIMM level of %.8X. " + "RC=0x%X, PLID=0x%.8X", l_nvdimm_huid, + ERRL_GETRC_SAFE(l_err), ERRL_GETPLID_SAFE(l_err)); + commitPredictiveNvdimmError(l_err); + l_err = nullptr; + o_no_error_found = false; + continue; + } + // perform update for this DIMM with the current LID image TRACFCOMP(g_trac_nvdimm_upd, "NvdimmsUpdate::runUpdateUsingLid() - " - "now update nvdimm[0x%.8X]", - TARGETING::get_huid(pInstalledImage->getNvdimmTarget())); + "now update nvdimm[0x%.8X]", l_nvdimm_huid); TRACFCOMP(g_trac_nvdimm_upd,"Updating with flash size: 0x%08X", i_lidImage->getFlashImageSize()); + /* + *@errortype INFORMATIONAL + *@reasoncode NVDIMM_START_UPDATE + *@moduleid NVDIMM_RUN_UPDATE_USING_LID + *@userdata1 NVDIMM Target Huid + *@userdata2[0:15] Old level (current) + *@userdata2[16:31] Update image level (new) + *@userdata2[32:63] Installed type (manufacturer and product) + *@devdesc Start of the NVDIMM update of this controller + *@custdesc NVDIMM update started + */ + l_err = new ERRORLOG::ErrlEntry( ERRORLOG::ERRL_SEV_INFORMATIONAL, + NVDIMM_RUN_UPDATE_USING_LID, + NVDIMM_START_UPDATE, + l_nvdimm_huid, + TWO_UINT16_ONE_UINT32_TO_UINT64( + l_oldVersion, i_lidImage->getVersion(), + l_installed_type), + ERRORLOG::ErrlEntry::ADD_SW_CALLOUT); + l_err->collectTrace(NVDIMM_UPD, 256); + ERRORLOG::errlCommit(l_err, NVDIMM_COMP_ID); + l_err = nullptr; + l_err = pInstalledImage->updateImage(i_lidImage); if (l_err) { @@ -1803,13 +1856,54 @@ bool NvdimmsUpdate::runUpdateUsingLid(NvdimmLidImage * i_lidImage, TRACFCOMP(g_trac_nvdimm_upd, ERR_MRK"NvdimmsUpdate::runUpdateUsingLid() - " "NVDIMM 0x%.8X NV controller update failed. " - "RC=0x%X, PLID=0x%.8X", - TARGETING::get_huid(pInstalledImage->getNvdimmTarget()), + "RC=0x%X, PLID=0x%.8X", l_nvdimm_huid, ERRL_GETRC_SAFE(l_err), ERRL_GETPLID_SAFE(l_err)); commitPredictiveNvdimmError(l_err); l_err = nullptr; o_no_error_found = false; } + else + { + // successfully updated this NVDIMM + + // Note: call for version should just return a saved value + uint16_t curVersion = INVALID_VERSION; + l_err = pInstalledImage->getVersion(curVersion); + if (l_err) + { + TRACFCOMP(g_trac_nvdimm_upd, + ERR_MRK"NvdimmsUpdate::runUpdateUsingLid() - " + "Failed to find current NVDIMM level of %.8X after " + "successful update. RC=0x%X, PLID=0x%.8X", + l_nvdimm_huid, + ERRL_GETRC_SAFE(l_err), ERRL_GETPLID_SAFE(l_err)); + commitPredictiveNvdimmError(l_err); + l_err = nullptr; + } + + /* + *@errortype INFORMATIONAL + *@reasoncode NVDIMM_UPDATE_COMPLETE + *@moduleid NVDIMM_RUN_UPDATE_USING_LID + *@userdata1 NVDIMM Target Huid + *@userdata2[0:15] Previous level + *@userdata2[16:31] Current updated level + *@userdata2[32:63] Installed type (manufacturer and product) + *@devdesc Successful update of NVDIMM code + *@custdesc NVDIMM was successfully updated + */ + l_err = new ERRORLOG::ErrlEntry( + ERRORLOG::ERRL_SEV_INFORMATIONAL, + NVDIMM_RUN_UPDATE_USING_LID, + NVDIMM_UPDATE_COMPLETE, + l_nvdimm_huid, + TWO_UINT16_ONE_UINT32_TO_UINT64( + l_oldVersion, curVersion, + l_installed_type), + ERRORLOG::ErrlEntry::ADD_SW_CALLOUT ); + l_err->collectTrace(NVDIMM_UPD, 512); + ERRORLOG::errlCommit(l_err, NVDIMM_COMP_ID); + } } // end of updateNeeded } return o_no_error_found; diff --git a/src/usr/isteps/nvdimm/nvdimmdd.C b/src/usr/isteps/nvdimm/nvdimmdd.C index 730fe0271..84869266a 100755 --- a/src/usr/isteps/nvdimm/nvdimmdd.C +++ b/src/usr/isteps/nvdimm/nvdimmdd.C @@ -35,6 +35,7 @@ // Includes // ---------------------------------------------- #include +#include #include #include #include @@ -75,6 +76,7 @@ TRAC_INIT( & g_trac_nvdimmr, "NVDIMMR", KILOBYTE ); // ---------------------------------------------- #define MAX_BYTE_ADDR 2 #define NVDIMM_MAX_RETRIES 2 +#define MAX_READ_RETRY_SECS 30 // ---------------------------------------------- @@ -462,12 +464,13 @@ errlHndl_t nvdimmReadData( TARGETING::Target * i_target, ENTER_MRK"nvdimmReadData()"); do { + timespec_t l_CurTime, l_PrevTime; + clock_gettime(CLOCK_MONOTONIC, &l_PrevTime); + int retry = 0; /************************************************************/ /* Attempt read multiple times ONLY on retryable fails */ /************************************************************/ - for (uint8_t retry = 0; - retry <= NVDIMM_MAX_RETRIES; - retry++) + do { // Only write the byte address if we have data to write if( 0 != i_byteAddressSize ) @@ -567,64 +570,35 @@ errlHndl_t nvdimmReadData( TARGETING::Target * i_target, else // Handle retryable error { // If op will be attempted again: save log and continue - if ( retry < NVDIMM_MAX_RETRIES ) - { - // Only save original retryable error - if ( err_retryable == nullptr ) - { - // Save original retryable error - err_retryable = l_err; - - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvdimmReadData(): " - "Retryable Error rc=0x%X, eid=0x%X, tgt=0x%X, " - "retry/MAX=%d/%d. Save error and retry", - err_retryable->reasonCode(), - err_retryable->eid(), - TARGETING::get_huid(i_target), - retry, NVDIMM_MAX_RETRIES); - - err_retryable->collectTrace(NVDIMM_COMP_NAME); - } - else - { - // Add data to original retryable error - TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvdimmReadData(): " - "Another Retryable Error rc=0x%X, eid=0x%X " - "plid=0x%X, tgt=0x%X, retry/MAX=%d/%d. " - "Delete error and retry", - l_err->reasonCode(), l_err->eid(), l_err->plid(), - TARGETING::get_huid(i_target), - retry, NVDIMM_MAX_RETRIES); - - ERRORLOG::ErrlUserDetailsString( - "Another Retryable ERROR found") - .addToLog(err_retryable); - - // Delete this new retryable error - delete l_err; - l_err = nullptr; - } - - // continue to retry - continue; - } - else // no more retries: trace and break + // Only save original retryable error + if ( err_retryable == nullptr ) { + // Save original retryable error + err_retryable = l_err; + TRACFCOMP( g_trac_nvdimm, ERR_MRK"nvdimmReadData(): " - "Error rc=0x%X, eid=%d, tgt=0x%X. No More " - "Retries (retry/MAX=%d/%d). Returning Error", - l_err->reasonCode(), l_err->eid(), + "Retryable Error rc=0x%X, eid=0x%X, tgt=0x%X, " + "retry/MAX=%d/%d. Save error and retry", + err_retryable->reasonCode(), + err_retryable->eid(), TARGETING::get_huid(i_target), - retry, NVDIMM_MAX_RETRIES); - - l_err->collectTrace(NVDIMM_COMP_NAME); + retry); - // break from retry loop - break; + err_retryable->collectTrace(NVDIMM_COMP_NAME); } - } + else + { + // Delete this new retryable error + delete l_err; + l_err = nullptr; + } + } // retryable error + // update current time + clock_gettime(CLOCK_MONOTONIC, &l_CurTime); + retry++; } // end of retry loop + while( (l_CurTime.tv_sec - l_PrevTime.tv_sec) < MAX_READ_RETRY_SECS ); // Handle saved retryable error, if any if (err_retryable) -- cgit v1.2.1