/* IBM_PROLOG_BEGIN_TAG */ /* This is an automatically generated prolog. */ /* */ /* $Source: src/usr/diag/prdf/plat/mem/prdfMemTdCtlr_rt.C $ */ /* */ /* OpenPOWER HostBoot Project */ /* */ /* Contributors Listed Below - COPYRIGHT 2016,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ /* Licensed under the Apache License, Version 2.0 (the "License"); */ /* you may not use this file except in compliance with the License. */ /* You may obtain a copy of the License at */ /* */ /* http://www.apache.org/licenses/LICENSE-2.0 */ /* */ /* Unless required by applicable law or agreed to in writing, software */ /* distributed under the License is distributed on an "AS IS" BASIS, */ /* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */ /* implied. See the License for the specific language governing */ /* permissions and limitations under the License. */ /* */ /* IBM_PROLOG_END_TAG */ /** @file prdfMemTdCtlr_rt.C * @brief A state machine for memory Targeted Diagnostics (runtime only). */ #include // Framework includes #include #include // Platform includes #include #include #include #include #include #include #include #include #include using namespace TARGETING; namespace PRDF { using namespace PlatServices; //------------------------------------------------------------------------------ template void __recaptureRegs( STEP_CODE_DATA_STRUCT & io_sc, ExtensibleChip * i_chip ); template<> void __recaptureRegs( STEP_CODE_DATA_STRUCT & io_sc, ExtensibleChip * i_chip ) { #define PRDF_FUNC "[__recaptureRegs] " RegDataCache & cache = RegDataCache::getCachedRegisters(); CaptureData & cd = io_sc.service_data->GetCaptureData(); // refresh and recapture the mcb registers const char * mcbRegs[] = { "MCBISTFIR", "MBSEC0", "MBSEC1", "MCB_MBSSYMEC0", "MCB_MBSSYMEC1", "MCB_MBSSYMEC2", "MCB_MBSSYMEC3", "MCB_MBSSYMEC4", "MCB_MBSSYMEC5", "MCB_MBSSYMEC6", "MCB_MBSSYMEC7", "MCB_MBSSYMEC8", "MBSMSEC", "MCBMCAT", }; for ( uint32_t i = 0; i < sizeof(mcbRegs)/sizeof(char*); i++ ) { SCAN_COMM_REGISTER_CLASS * reg = i_chip->getRegister( mcbRegs[i] ); cache.flush( i_chip, reg ); } i_chip->CaptureErrorData( cd, Util::hashString("MaintCmdRegs_mcb") ); // refresh and recapture the mca registers const char * mcaRegs[] = { "MCAECCFIR", }; ExtensibleChipList mcaList = getConnected( i_chip, TYPE_MCA ); for ( auto & mca : mcaList ) { for ( uint32_t i = 0; i < sizeof(mcaRegs)/sizeof(char*); i++ ) { SCAN_COMM_REGISTER_CLASS * reg = mca->getRegister( mcaRegs[i] ); cache.flush( mca, reg ); } mca->CaptureErrorData( cd, Util::hashString("MaintCmdRegs_mca") ); } #undef PRDF_FUNC } template<> void __recaptureRegs( STEP_CODE_DATA_STRUCT & io_sc, ExtensibleChip * i_chip ) { #define PRDF_FUNC "[__recaptureRegs] " RegDataCache & cache = RegDataCache::getCachedRegisters(); ExtensibleChip * membChip = getConnectedParent( i_chip, TYPE_MEMBUF ); TargetHandle_t mbaTrgt = i_chip->GetChipHandle(); uint32_t mbaPos = getTargetPosition( mbaTrgt ); const char * membRegs[2][15] = { { "MBSECCFIR_0", "MBA0_MBSECCERRPT_0","MBA0_MBSECCERRPT_1", "MBA0_MBSEC0", "MBA0_MBSEC1", "MBSTR_0", "MBA0_MBSSYMEC0", "MBA0_MBSSYMEC1", "MBA0_MBSSYMEC2", "MBA0_MBSSYMEC3", "MBA0_MBSSYMEC4", "MBA0_MBSSYMEC5", "MBA0_MBSSYMEC6", "MBA0_MBSSYMEC7", "MBA0_MBSSYMEC8", }, { "MBSECCFIR_1", "MBA1_MBSECCERRPT_0","MBA1_MBSECCERRPT_1", "MBA1_MBSEC0", "MBA1_MBSEC1", "MBSTR_1", "MBA1_MBSSYMEC0", "MBA1_MBSSYMEC1", "MBA1_MBSSYMEC2", "MBA1_MBSSYMEC3", "MBA1_MBSSYMEC4", "MBA1_MBSSYMEC5", "MBA1_MBSSYMEC6", "MBA1_MBSSYMEC7", "MBA1_MBSSYMEC8", }, }; for ( uint32_t i = 0; i < 15; i++ ) { SCAN_COMM_REGISTER_CLASS * reg = membChip->getRegister( membRegs[mbaPos][i] ); cache.flush( membChip, reg ); } const char * mbaRegs[] = { "MBASPA", "MBMCT", "MBMSR", "MBMACA", "MBMEA", "MBASCTL", "MBAECTL", }; for ( uint32_t i = 0; i < sizeof(mbaRegs)/sizeof(char*); i++ ) { SCAN_COMM_REGISTER_CLASS * reg = i_chip->getRegister( mbaRegs[i] ); cache.flush( i_chip, reg ); } // Now recapture those registers. CaptureData & cd = io_sc.service_data->GetCaptureData(); if ( 0 == mbaPos ) { membChip->CaptureErrorData(cd, Util::hashString("MaintCmdRegs_mba0") ); } else { membChip->CaptureErrorData(cd, Util::hashString("MaintCmdRegs_mba1") ); } i_chip->CaptureErrorData(cd, Util::hashString("MaintCmdRegs")); #undef PRDF_FUNC } //------------------------------------------------------------------------------ template uint32_t MemTdCtlr::handleTdEvent( STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[MemTdCtlr::handleTdEvent] " uint32_t o_rc = SUCCESS; do { // Make sure the TD controller is initialized. o_rc = initialize(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "initialize() failed on 0x%08x", iv_chip->getHuid() ); break; } // Don't interrupt a TD procedure if one is already in progress. if ( nullptr != iv_curProcedure ) break; // If the queue is empty, there is nothing to do. So there is no point // to stopping background scrub. This could have happen if TPS was // banned on a rank and the TPS request was never added to the queue. In // that case, mask fetch attentions temporarily to prevent flooding. if ( iv_queue.empty() ) { o_rc = maskEccAttns(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "maskEccAttns() failed" ); break; } break; // Don't stop background scrub. } // Stop background scrubbing. o_rc = stopBgScrub( iv_chip ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "stopBgScrub(0x%08x) failed", iv_chip->getHuid() ); break; } // Since we had to manually stop the maintenance command, refresh all // relevant registers that may have changed since the initial capture. __recaptureRegs( io_sc, iv_chip ); collectStateCaptureData( io_sc, TD_CTLR_DATA::START ); // It is possible that background scrub could have found an ECC error // before we had a chance to stop the command. Therefore, we need to // call analyzeCmdComplete() first so that any ECC errors found can be // handled. Also, analyzeCmdComplete() will initialize the variables // needed so we know where to restart background scrubbing. bool junk = false; o_rc = analyzeCmdComplete( junk, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "analyzeCmdComplete(0x%08x) failed", iv_chip->getHuid() ); break; } // Move onto the next step in the state machine. o_rc = nextStep( io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "nextStep() failed on 0x%08x", iv_chip->getHuid() ); break; } } while (0); return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template <> uint32_t MemTdCtlr::initialize() { #define PRDF_FUNC "[MemTdCtlr::initialize] " uint32_t o_rc = SUCCESS; do { if ( iv_initialized ) break; // nothing to do // Add any unverified chip marks to the TD queue. std::vector vectorList = iv_rankList.getList(); for ( auto & entry : vectorList ) { ExtensibleChip * mbaChip = entry.getChip(); MemRank rank = entry.getRank(); // Call readChipMark to get MemMark. MemMark chipMark; o_rc = MarkStore::readChipMark( mbaChip, rank, chipMark ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x,%d) " "failed", mbaChip->getHuid(), rank.getMaster() ); break; } if ( !chipMark.isValid() ) continue; // no chip mark present // Get the DQ Bitmap data. TargetHandle_t mbaTrgt = mbaChip->GetChipHandle(); MemDqBitmap dqBitmap; o_rc = getBadDqBitmap(mbaTrgt, rank, dqBitmap); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getBadDqBitmap" "(0x%08x, %d)", getHuid(mbaTrgt), rank.getMaster() ); break; } // Check if the chip mark is verified or not. bool cmVerified = false; o_rc = dqBitmap.isChipMark( chipMark.getSymbol(), cmVerified ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "dqBitmap.isChipMark() failed." ); break; } // If the chip mark is unverified, add a VcmEvent to the TD queue if ( !cmVerified ) { TdEntry * vcmEntry = new VcmEvent( mbaChip, rank, chipMark ); iv_queue.push( vcmEntry ); } } // At this point, the TD controller is initialized. iv_initialized = true; }while(0); return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template <> uint32_t MemTdCtlr::initialize() { #define PRDF_FUNC "[MemTdCtlr::initialize] " uint32_t o_rc = SUCCESS; do { if ( iv_initialized ) break; // nothing to do // Add any unverified chip marks to the TD queue. std::vector vectorList = iv_rankList.getList(); for ( auto & entry : vectorList ) { ExtensibleChip * mcaChip = entry.getChip(); MemRank rank = entry.getRank(); // Call readChipMark to get MemMark. MemMark chipMark; o_rc = MarkStore::readChipMark( mcaChip, rank, chipMark ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x,%d) " "failed", mcaChip->getHuid(), rank.getMaster() ); break; } if ( !chipMark.isValid() ) continue; // no chip mark present // Get the DQ Bitmap data. TargetHandle_t mcaTrgt = mcaChip->GetChipHandle(); MemDqBitmap dqBitmap; o_rc = getBadDqBitmap(mcaTrgt, rank, dqBitmap); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getBadDqBitmap" "(0x%08x, %d)", getHuid(mcaTrgt), rank.getMaster() ); break; } // Check if the chip mark is verified or not. bool cmVerified = false; o_rc = dqBitmap.isChipMark( chipMark.getSymbol(), cmVerified ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "dqBitmap.isChipMark() failed." ); break; } // If the chip mark is unverified, add a VcmEvent to the TD queue if ( !cmVerified ) { TdEntry * vcmEntry = new VcmEvent( mcaChip, rank, chipMark ); iv_queue.push( vcmEntry ); } } // At this point, the TD controller is initialized. iv_initialized = true; }while(0); return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template uint32_t MemTdCtlr::defaultStep( STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[MemTdCtlr::defaultStep] " uint32_t o_rc = SUCCESS; if ( iv_resumeBgScrub ) { // Background scrubbing paused for FFDC collection only. Resume the // current command. iv_resumeBgScrub = false; PRDF_TRAC( PRDF_FUNC "Calling resumeBgScrub(0x%08x)", iv_chip->getHuid() ); o_rc = resumeBgScrub( iv_chip ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "resumeBgScrub(0x%08x) failed", iv_chip->getHuid() ); } } else { // Unmask the ECC attentions that were explicitly masked during the // TD procedure. o_rc = unmaskEccAttns(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "unmaskEccAttns() failed" ); } // A TD procedure has completed. Restart background scrubbing on the // next rank. TdRankListEntry nextRank = iv_rankList.getNext( iv_stoppedRank ); PRDF_TRAC( PRDF_FUNC "Calling startBgScrub(0x%08x, m%ds%d)", nextRank.getChip()->getHuid(), nextRank.getRank().getMaster(), nextRank.getRank().getSlave() ); o_rc = startBgScrub( nextRank.getChip(), nextRank.getRank() ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "startBgScrub(0x%08x,m%ds%d) failed", nextRank.getChip()->getHuid(), nextRank.getRank().getMaster(), nextRank.getRank().getSlave() ); } } return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template uint32_t __handleNceEte( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, STEP_CODE_DATA_STRUCT & io_sc, bool i_isHard = false ) { #define PRDF_FUNC "[__handleNceEte] " uint32_t o_rc = SUCCESS; MemRank rank = i_addr.getRank(); do { // Query the per-symbol counters for the CE symbol(s). MemUtils::MaintSymbols symData; MemSymbol junk; o_rc = MemUtils::collectCeStats( i_chip, rank, symData, junk ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "MemUtils::collectCeStats(0x%08x,m%ds%d) " "failed", i_chip->getHuid(), rank.getMaster(), rank.getSlave() ); break; } // Make sure the list size is correct. Note that Nimbus has two symbol // correction. So it is possible to have two symbols in the counters // even though the threshold is set to 1. uint32_t count = symData.size(); switch ( T ) { case TYPE_MCA: PRDF_ASSERT( 1 <= count && count <= 2 ); break; case TYPE_MBA: PRDF_ASSERT( 1 == count ); break; default: PRDF_ASSERT( false ); } for ( auto & d : symData ) { // Add the symbol(s) to the callout list and CE table. bool doTps; o_rc = MemEcc::handleMemCe( i_chip, i_addr, d.symbol, doTps, io_sc, i_isHard ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemCe(0x%08x) failed", i_chip->getHuid() ); break; } // Add a TPS procedure to the queue, if needed. if ( doTps ) { TdEntry * e = new TpsEvent{ i_chip, rank }; io_queue.push( e ); } } if ( SUCCESS != o_rc ) break; } while (0); return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template uint32_t __handleRceEte( ExtensibleChip * i_chip, TdQueue & io_queue, const MemRank & i_rank, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); template<> uint32_t __handleRceEte( ExtensibleChip * i_chip, TdQueue & io_queue, const MemRank & i_rank, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[__handleRceEte] " uint32_t o_rc = SUCCESS; // Should only get this attention in MNFG mode. PRDF_ASSERT( mfgMode() ); do { // The RCE ETE attention could be from IUE, IMPE, or IRCD. Need to check // MCAECCFIR[37] to determine if there was at least one IUE. SCAN_COMM_REGISTER_CLASS * fir = i_chip->getRegister( "MCAECCFIR" ); o_rc = fir->Read(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Read() failed on MCAECCFIR: i_chip=0x%08x", i_chip->getHuid() ); break; } if ( !fir->IsBitSet(37) ) break; // nothing else to do // Handle the IUE. o_errorsFound = true; io_sc.service_data->AddSignatureList( i_chip->getTrgt(), PRDFSIG_MaintIUE ); o_rc = MemEcc::handleMemIue( i_chip, i_rank, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "analyzeMaintIue(0x%08x) failed", i_chip->getHuid() ); break; } } while (0); return o_rc; #undef PRDF_FUNC } template<> uint32_t __handleRceEte( ExtensibleChip * i_chip, TdQueue & io_queue, const MemRank & i_rank, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[__handleRceEte] " uint32_t o_rc = SUCCESS; TargetHandle_t trgt = i_chip->getTrgt(); o_errorsFound = true; io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintRETRY_CTE ); // Add the rank to the callout list. MemoryMru mm { trgt, i_rank, MemoryMruData::CALLOUT_RANK }; io_sc.service_data->SetCallout( mm ); do { bool doTps = true; if ( mfgMode() ) { ExtensibleChip * membChip = getConnectedParent(i_chip, TYPE_MEMBUF); // Get the current RCE count from hardware. const char * reg_str = (0 == i_chip->getPos()) ? "MBA0_MBSEC1" : "MBA1_MBSEC1"; SCAN_COMM_REGISTER_CLASS * reg = membChip->getRegister( reg_str ); o_rc = reg->Read(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Read() failed on %s", reg_str ); break; } uint16_t count = reg->GetBitFieldJustified( 0, 12 ); // Add the count to RCE table. doTps = getMbaDataBundle(i_chip)->iv_rceTable.addEntry( i_rank, io_sc, count ); } else { // The RCE threshold was set to the maximum. If we hit this then // there is definitely a problem. io_sc.service_data->setServiceCall(); } // Add a TPS procedure to the queue, if needed. if ( doTps ) { io_queue.push( new TpsEvent(i_chip, i_rank) ); } } while (0); return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[__checkEcc] " PRDF_ASSERT( nullptr != i_chip ); PRDF_ASSERT( T == i_chip->getType() ); uint32_t o_rc = SUCCESS; o_errorsFound = false; TargetHandle_t trgt = i_chip->getTrgt(); HUID huid = i_chip->getHuid(); MemRank rank = i_addr.getRank(); do { // Check for ECC errors. uint32_t eccAttns = 0; o_rc = checkEccFirs( i_chip, eccAttns ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "checkEccFirs(0x%08x) failed", huid ); break; } if ( 0 != (eccAttns & MAINT_INT_NCE_ETE) ) { o_errorsFound = true; io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintINTER_CTE); o_rc = __handleNceEte( i_chip, io_queue, i_addr, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "__handleNceEte(0x%08x) failed", huid ); break; } } if ( 0 != (eccAttns & MAINT_SOFT_NCE_ETE) ) { o_errorsFound = true; io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintSOFT_CTE ); o_rc = __handleNceEte( i_chip, io_queue, i_addr, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "__handleNceEte(0x%08x) failed", huid ); break; } } if ( 0 != (eccAttns & MAINT_HARD_NCE_ETE) ) { o_errorsFound = true; io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintHARD_CTE ); o_rc = __handleNceEte( i_chip, io_queue, i_addr, io_sc, true ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "__handleNceEte(0x%08x) failed", huid ); break; } // Any hard CEs in MNFG should be immediately reported. // NOTE: We will only use the MNFG thresholds if DRAM repairs is // disabled. This is for a Nimbus DD2.0.1 workaround, but the // change will be permanent for all P9 DD levels. if ( areDramRepairsDisabled() ) { io_sc.service_data->setSignature( huid, PRDFSIG_MaintHARD_CTE ); io_sc.service_data->setServiceCall(); } } if ( 0 != (eccAttns & MAINT_MPE) ) { o_errorsFound = true; io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintMPE ); o_rc = MemEcc::handleMpe( i_chip, i_addr, UE_TABLE::SCRUB_MPE, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMpe(0x%08x, 0x%02x) failed", i_chip->getHuid(), rank.getKey() ); break; } } if ( 0 != (eccAttns & MAINT_RCE_ETE) ) { o_rc = __handleRceEte( i_chip, io_queue, rank, o_errorsFound, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "__handleRceEte(0x%08x) failed", huid ); break; } } if ( 0 != (eccAttns & MAINT_UE) ) { o_errorsFound = true; io_sc.service_data->AddSignatureList( trgt, PRDFSIG_MaintUE ); // Since this will be a predictive callout, change the primary // signature as well. io_sc.service_data->setSignature( huid, PRDFSIG_MaintUE ); // Add the rank to the callout list. o_rc = MemEcc::handleMemUe( i_chip, i_addr, UE_TABLE::SCRUB_UE, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemUe(0x%08x) failed", i_chip->getHuid() ); break; } // Add a TPS procedure to the queue. TdEntry * e = new TpsEvent{ i_chip, rank, true }; io_queue.push( e ); } } while (0); return o_rc; #undef PRDF_FUNC } template uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ); template<> uint32_t __checkEcc( ExtensibleChip * i_chip, TdQueue & io_queue, const MemAddr & i_addr, bool & o_errorsFound, STEP_CODE_DATA_STRUCT & io_sc ) { // TODO: remove this once runtime support is abled for MBA. return SUCCESS; } //------------------------------------------------------------------------------ template<> uint32_t MemTdCtlr::maskEccAttns() { #define PRDF_FUNC "[MemTdCtlr::maskEccAttns] " uint32_t o_rc = SUCCESS; // Loop through all MCAs. for ( auto mcaChip : getConnected(iv_chip, TYPE_MCA) ) { SCAN_COMM_REGISTER_CLASS * mask = mcaChip->getRegister( "MCAECCFIR_MASK_OR" ); mask->clearAllBits(); mask->SetBit(8); // Mainline read NCE mask->SetBit(9); // Mainline read TCE o_rc = mask->Write(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Write() failed on MCAECCFIR_MASK_OR" ); break; } } return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template<> uint32_t MemTdCtlr::unmaskEccAttns() { #define PRDF_FUNC "[MemTdCtlr::unmaskEccAttns] " uint32_t o_rc = SUCCESS; // Memory CEs were masked at the beginning of the TD procedure, so // clear and unmask them. Also, it is possible that memory UEs have // thresholded so clear and unmask them as well. // Loop through all MCAs. for ( auto mcaChip : getConnected(iv_chip, TYPE_MCA) ) { SCAN_COMM_REGISTER_CLASS * fir = mcaChip->getRegister( "MCAECCFIR_AND" ); SCAN_COMM_REGISTER_CLASS * mask = mcaChip->getRegister( "MCAECCFIR_MASK_AND" ); fir->setAllBits(); mask->setAllBits(); // Do not unmask NCE and TCE attentions if they have been permanently // masked due to certain TPS conditions. if ( !(getMcaDataBundle(mcaChip)->iv_maskMainlineNceTce) ) { fir->ClearBit(8); mask->ClearBit(8); // Mainline read NCE fir->ClearBit(9); mask->ClearBit(9); // Mainline read TCE } fir->ClearBit(14); mask->ClearBit(14); // Mainline read UE o_rc = fir->Write(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Write() failed on MCAECCFIR_AND" ); break; } o_rc = mask->Write(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Write() failed on MCAECCFIR_MASK_AND" ); break; } } return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template<> uint32_t MemTdCtlr::maskEccAttns() { #define PRDF_FUNC "[MemTdCtlr::maskEccAttns] " uint32_t o_rc = SUCCESS; do { // Don't want to handle memory CEs during any TD procedures, so // mask them. const char * reg_str = (0 == iv_chip->getPos()) ? "MBSECCFIR_0_MASK_OR" : "MBSECCFIR_1_MASK_OR"; ExtensibleChip * membChip = getConnectedParent( iv_chip, TYPE_MEMBUF ); SCAN_COMM_REGISTER_CLASS * reg = membChip->getRegister(reg_str); reg->clearAllBits(); reg->SetBit(16); // fetch NCE reg->SetBit(17); // fetch RCE reg->SetBit(43); // prefetch UE o_rc = reg->Write(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Write() failed on %s", reg_str ); break; } } while (0); return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template<> uint32_t MemTdCtlr::unmaskEccAttns() { #define PRDF_FUNC "[MemTdCtlr::unmaskEccAttns] " uint32_t o_rc = SUCCESS; do { // Memory CEs where masked at the beginning of the TD procedure, so // clear and unmask them. Also, it is possible that memory UEs have // thresholded so clear and unmask them as well. const char * fir_str = (0 == iv_chip->getPos()) ? "MBSECCFIR_0_AND" : "MBSECCFIR_1_AND"; const char * msk_str = (0 == iv_chip->getPos()) ? "MBSECCFIR_0_MASK_AND" : "MBSECCFIR_1_MASK_AND"; ExtensibleChip * membChip = getConnectedParent( iv_chip, TYPE_MEMBUF ); SCAN_COMM_REGISTER_CLASS * fir = membChip->getRegister( fir_str ); SCAN_COMM_REGISTER_CLASS * msk = membChip->getRegister( msk_str ); fir->setAllBits(); msk->setAllBits(); fir->ClearBit(16); msk->ClearBit(16); // fetch NCE fir->ClearBit(17); msk->ClearBit(17); // fetch RCE fir->ClearBit(19); msk->ClearBit(19); // fetch UE fir->ClearBit(43); msk->ClearBit(43); // prefetch UE o_rc = fir->Write(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Write() failed on %s", fir_str ); break; } o_rc = msk->Write(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Write() failed on %s", msk_str ); break; } } while (0); return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template<> uint32_t MemTdCtlr::handleRrFo() { #define PRDF_FUNC "[MemTdCtlr::handleRrFo] " uint32_t o_rc = SUCCESS; do { // Check if maintenance command complete attention is set. SCAN_COMM_REGISTER_CLASS * mcbistfir = iv_chip->getRegister("MCBISTFIR"); o_rc = mcbistfir->Read(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Read() failed on MCBISTFIR"); break; } // If there is a command complete attention, nothing to do, break out. if ( mcbistfir->IsBitSet(10) || mcbistfir->IsBitSet(12) ) break; // Check if a command is not running. // If bit 0 of MCB_CNTLSTAT is on, a mcbist run is in progress. SCAN_COMM_REGISTER_CLASS * mcb_cntlstat = iv_chip->getRegister("MCB_CNTLSTAT"); o_rc = mcb_cntlstat->Read(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Read() failed on MCB_CNTLSTAT" ); break; } // If a command is not running, set command complete attn, break. if ( !mcb_cntlstat->IsBitSet(0) ) { SCAN_COMM_REGISTER_CLASS * mcbistfir_or = iv_chip->getRegister("MCBISTFIR_OR"); mcbistfir_or->SetBit( 10 ); mcbistfir_or->Write(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Write() failed on MCBISTFIR_OR" ); } break; } // Check if there are unverified chip marks. std::vector vectorList = iv_rankList.getList(); for ( auto & entry : vectorList ) { ExtensibleChip * mcaChip = entry.getChip(); MemRank rank = entry.getRank(); // Get the chip mark MemMark chipMark; o_rc = MarkStore::readChipMark( mcaChip, rank, chipMark ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x,%d) " "failed", mcaChip->getHuid(), rank.getMaster() ); break; } if ( !chipMark.isValid() ) continue; // no chip mark present // Get the DQ Bitmap data. TargetHandle_t mcaTrgt = mcaChip->GetChipHandle(); MemDqBitmap dqBitmap; o_rc = getBadDqBitmap(mcaTrgt, rank, dqBitmap); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getBadDqBitmap" "(0x%08x, %d)", getHuid(mcaTrgt), rank.getMaster() ); break; } // Check if the chip mark is verified or not. bool cmVerified = false; o_rc = dqBitmap.isChipMark( chipMark.getSymbol(), cmVerified ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "dqBitmap.isChipMark failed." ); break; } // If there are any unverified chip marks, stop the command, break. if ( !cmVerified ) { o_rc = stopBgScrub( iv_chip ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "stopBgScrub(0x%08x) " "failed", iv_chip->getHuid() ); } break; } } } while (0); return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template<> uint32_t MemTdCtlr::handleRrFo() { #define PRDF_FUNC "[MemTdCtlr::handleRrFo] " uint32_t o_rc = SUCCESS; do { // Check if maintenance command complete attention is set. SCAN_COMM_REGISTER_CLASS * mbaspa = iv_chip->getRegister("MBASPA"); o_rc = mbaspa->Read(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Read() failed on MBASPA"); break; } // If there is a command complete attention, nothing to do, break out. if ( mbaspa->IsBitSet(0) || mbaspa->IsBitSet(8) ) break; // Check if a maintenance command is running currently. SCAN_COMM_REGISTER_CLASS * mbmsr = iv_chip->getRegister("MBMSR"); o_rc = mbmsr->Read(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Read() failed on MBMSR"); break; } // If a command is not running, set command complete attn, break. if ( !mbmsr->IsBitSet(0) ) { SCAN_COMM_REGISTER_CLASS * mbaspa_or = iv_chip->getRegister("MBASPA_OR"); mbaspa_or->SetBit( 0 ); mbaspa_or->Write(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Write() failed on MBASPA_OR" ); } break; } // Check if there are unverified chip marks. std::vector vectorList = iv_rankList.getList(); for ( auto & entry : vectorList ) { ExtensibleChip * mbaChip = entry.getChip(); MemRank rank = entry.getRank(); // Get the chip mark MemMark chipMark; o_rc = MarkStore::readChipMark( mbaChip, rank, chipMark ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "readChipMark(0x%08x,%d) " "failed", mbaChip->getHuid(), rank.getMaster() ); break; } if ( !chipMark.isValid() ) continue; // no chip mark present // Get the DQ Bitmap data. TargetHandle_t mbaTrgt = mbaChip->GetChipHandle(); MemDqBitmap dqBitmap; o_rc = getBadDqBitmap(mbaTrgt, rank, dqBitmap); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getBadDqBitmap" "(0x%08x, %d)", getHuid(mbaTrgt), rank.getMaster() ); break; } // Check if the chip mark is verified or not. bool cmVerified = false; o_rc = dqBitmap.isChipMark( chipMark.getSymbol(), cmVerified ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "dqBitmap.isChipMark failed." ); break; } // If there are any unverified chip marks, stop the command, break. if ( !cmVerified ) { o_rc = stopBgScrub( iv_chip ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "stopBgScrub(0x%08x) failed", iv_chip->getHuid() ); } break; } } } while (0); return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ template<> uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) { #define PRDF_FUNC "[MemTdCtlr::canResumeBgScrub] " uint32_t o_rc = SUCCESS; o_canResume = false; // It is possible that we were running a TD procedure and the PRD service // was reset. Therefore, we must check if background scrubbing was actually // configured. There really is not a good way of doing this. A scrub command // is a scrub command the only difference is the speed. Unfortunately, that // speed can change depending on how the hardware team tunes it. For now, we // can use the stop conditions, which should be unique for background scrub, // to determine if it has been configured. SCAN_COMM_REGISTER_CLASS * reg = iv_chip->getRegister( "MBSTR" ); o_rc = reg->Read(); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "Read() failed on MBSTR: iv_chip=0x%08x", iv_chip->getHuid() ); } else if ( 0xf != reg->GetBitFieldJustified(0,4) && // NCE int TH 0xf != reg->GetBitFieldJustified(4,4) && // NCE soft TH 0xf != reg->GetBitFieldJustified(8,4) && // NCE hard TH reg->IsBitSet(34) && // pause on MPE reg->IsBitSet(35) ) // pause on UE { o_canResume = true; } return o_rc; #undef PRDF_FUNC } template<> uint32_t MemTdCtlr::canResumeBgScrub( bool & o_canResume ) { #define PRDF_FUNC "[MemTdCtlr::canResumeBgScrub] " uint32_t o_rc = SUCCESS; o_canResume = false; // It is possible that we were running a TD procedure and the PRD service // was reset. Assuming the command did not stop on the last address of the // current slave rank, we will simply "resume" the command from the next // address to the end of the rank. The MBA resume actually starts a new // command, unlike MCBIST. Therefore, we can get away with blindly starting // the command without trying to assess what type of command was actually // running. bool lastAddr = false; o_rc = didCmdStopOnLastAddr( iv_chip, SLAVE_RANK, lastAddr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "didCmdStopOnLastAddr(0x%08x) failed", iv_chip->getHuid() ); } else { o_canResume = !lastAddr; } return o_rc; #undef PRDF_FUNC } //------------------------------------------------------------------------------ // Avoid linker errors with the template. template class MemTdCtlr; template class MemTdCtlr; //------------------------------------------------------------------------------ } // end namespace PRDF