diff options
Diffstat (limited to 'src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C')
-rw-r--r-- | src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C | 452 |
1 files changed, 440 insertions, 12 deletions
diff --git a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C index 9869a8c08..f206a074e 100644 --- a/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C +++ b/src/usr/diag/prdf/common/plat/mem/prdfMemEccAnalysis.C @@ -127,6 +127,87 @@ uint32_t handleMemUe<TYPE_MCA>( ExtensibleChip * i_chip, const MemAddr & i_addr, i_chip->getHuid(), i_type ); break; } + + #ifdef __HOSTBOOT_RUNTIME + // Increment the UE counter and store the rank we're on, resetting + // the UE and CE counts if we have stopped on a new rank. + ExtensibleChip * mcb = getConnectedParent( i_chip, TYPE_MCBIST ); + McbistDataBundle * mcbdb = getMcbistDataBundle(mcb); + if ( mcbdb->iv_ceUeRank != i_addr.getRank() ) + { + mcbdb->iv_ceStopCounter.reset(); + mcbdb->iv_ueStopCounter.reset(); + } + mcbdb->iv_ueStopCounter.inc( io_sc ); + mcbdb->iv_ceUeRank = i_addr.getRank(); + #endif + } + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + +template<> +uint32_t handleMemUe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemAddr & i_addr, + UE_TABLE::Type i_type, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + #define PRDF_FUNC "[MemEcc::handleMemUe<TYPE_OCMB_CHIP>] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + do + { + // First check to see if this is a side-effect UE. + SCAN_COMM_REGISTER_CLASS * fir = i_chip->getRegister("OCMB_LFIR"); + o_rc = fir->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on OCMB_LFIR: i_chip=0x%08x", + i_chip->getHuid() ); + break; + } + + // Check OCMB_LFIR[38] to determine if this is a side-effect. + if ( fir->IsBitSet(38) ) + { + // This is a side-effect. Callout the OCMB. + PRDF_TRAC( PRDF_FUNC "Memory UE is side-effect of DDRPHY error" ); + io_sc.service_data->SetCallout( i_chip->getTrgt() ); + io_sc.service_data->setServiceCall(); + } + else + { + // Handle the memory UE. + o_rc = __handleMemUe<TYPE_OCMB_CHIP>( i_chip, i_addr, i_type, + io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "__handleMemUe(0x%08x,%d) failed", + i_chip->getHuid(), i_type ); + break; + } + + #ifdef __HOSTBOOT_RUNTIME + // Increment the UE counter and store the rank we're on, resetting + // the UE and CE counts if we have stopped on a new rank. + OcmbDataBundle * ocmbdb = getOcmbDataBundle(i_chip); + if ( ocmbdb->iv_ceUeRank != i_addr.getRank() ) + { + ocmbdb->iv_ceStopCounter.reset(); + ocmbdb->iv_ueStopCounter.reset(); + } + ocmbdb->iv_ueStopCounter.inc( io_sc ); + ocmbdb->iv_ceUeRank = i_addr.getRank(); + #endif + } } while (0); @@ -328,6 +409,52 @@ uint32_t maskMemPort<TYPE_MCA>( ExtensibleChip * i_chip ) #undef PRDF_FUNC } +template<> +uint32_t maskMemPort<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip ) +{ + #define PRDF_FUNC "[MemEcc::maskMemPort<TYPE_OCMB_CHIP>] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + do + { + // Mask all FIRs on the OCMB in the chiplet FIRs. + SCAN_COMM_REGISTER_CLASS * chipletMask = + i_chip->getRegister("OCMB_CHIPLET_FIR_MASK"); + SCAN_COMM_REGISTER_CLASS * chipletSpaMask = + i_chip->getRegister("OCMB_CHIPLET_SPA_FIR_MASK"); + + chipletMask->setAllBits(); + chipletSpaMask->setAllBits(); + + o_rc = chipletMask->Write() | chipletSpaMask->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on 0x%08x", i_chip->getHuid() ); + break; + } + + #ifdef __HOSTBOOT_RUNTIME + + // Dynamically deallocate the port. + if ( SUCCESS != MemDealloc::port<TYPE_OCMB_CHIP>( i_chip ) ) + { + PRDF_ERR( PRDF_FUNC "MemDealloc::port<TYPE_OCMB_CHIP>(0x%08x) " + "failed", i_chip->getHuid() ); + } + + #endif + + } while (0); + + return o_rc; + + #undef PRDF_FUNC +} + #endif // __HOSTBOOT_MODULE //------------------------------------------------------------------------------ @@ -390,6 +517,62 @@ uint32_t triggerPortFail<TYPE_MCA>( ExtensibleChip * i_chip ) #undef PRDF_FUNC } +template<> +uint32_t triggerPortFail<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip ) +{ + #define PRDF_FUNC "[MemEcc::triggerPortFail<TYPE_OCMB_CHIP>] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + OcmbDataBundle * db = getOcmbDataBundle( i_chip ); + + do + { + // trigger a port fail + // set FARB0[59] - MBA_FARB0Q_CFG_INJECT_PARITY_ERR_CONSTANT and + // FARB0[40] - MBA_FARB0Q_CFG_INJECT_PARITY_ERR_ADDR5 + SCAN_COMM_REGISTER_CLASS * farb0 = i_chip->getRegister("FARB0"); + + o_rc = farb0->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() FARB0 failed: i_chip=0x%08x", + i_chip->getHuid() ); + break; + } + + farb0->SetBit(59); + farb0->SetBit(40); + + o_rc = farb0->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() FARB0 failed: i_chip=0x%08x", + i_chip->getHuid() ); + break; + } + + // reset thresholds to prevent issuing multiple port failures on + // the same port + for ( auto & resetTh : db->iv_iueTh ) + { + resetTh.second.reset(); + } + + db->iv_iuePortFail = true; + + break; + }while(0); + + + return o_rc; + + #undef PRDF_FUNC +} + #endif // __HOSTBOOT_RUNTIME //------------------------------------------------------------------------------ @@ -420,6 +603,30 @@ bool queryIueTh<TYPE_MCA>( ExtensibleChip * i_chip, return iueAtTh; } +template<> +bool queryIueTh<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); + + bool iueAtTh = false; + + OcmbDataBundle * db = getOcmbDataBundle( i_chip ); + + // Loop through all our thresholds + for ( auto & th : db->iv_iueTh ) + { + // If threshold reached + if ( th.second.thReached(io_sc) ) + { + iueAtTh = true; + } + } + + return iueAtTh; +} + #endif //------------------------------------------------------------------------------ @@ -493,6 +700,11 @@ template uint32_t handleMpe<TYPE_MBA>( ExtensibleChip * i_chip, const MemAddr & i_addr, UE_TABLE::Type i_type, STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t handleMpe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemAddr & i_addr, + UE_TABLE::Type i_type, + STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ @@ -581,6 +793,10 @@ template uint32_t analyzeFetchMpe<TYPE_MBA>( ExtensibleChip * i_chip, const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t analyzeFetchMpe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemRank & i_rank, + STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ @@ -794,6 +1010,9 @@ uint32_t analyzeFetchNceTce<TYPE_MCA>( ExtensibleChip * i_chip, template uint32_t analyzeFetchNceTce<TYPE_MBA>( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t analyzeFetchNceTce<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ @@ -871,6 +1090,9 @@ uint32_t analyzeFetchUe<TYPE_MCA>( ExtensibleChip * i_chip, template uint32_t analyzeFetchUe<TYPE_MBA>( ExtensibleChip * i_chip, STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t analyzeFetchUe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); //------------------------------------------------------------------------------ @@ -955,16 +1177,97 @@ uint32_t handleMemIue<TYPE_MCA>( ExtensibleChip * i_chip, #undef PRDF_FUNC } -//------------------------------------------------------------------------------ - template<> -uint32_t analyzeMainlineIue<TYPE_MCA>( ExtensibleChip * i_chip, +uint32_t handleMemIue<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + const MemRank & i_rank, STEP_CODE_DATA_STRUCT & io_sc ) { + #define PRDF_FUNC "[MemEcc::handleMemIue] " + + PRDF_ASSERT( nullptr != i_chip ); + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + // Add the DIMM to the callout list. + MemoryMru mm { i_chip->getTrgt(), i_rank, MemoryMruData::CALLOUT_RANK }; + io_sc.service_data->SetCallout( mm ); + + #ifdef __HOSTBOOT_MODULE + + do + { + // Nothing else to do if handling a system checkstop. + if ( CHECK_STOP == io_sc.service_data->getPrimaryAttnType() ) break; + + // Get the data bundle from chip. + OcmbDataBundle * db = getOcmbDataBundle( i_chip ); + + // If we have already caused a port fail, mask the IUE bits. + if ( true == db->iv_iuePortFail ) + { + SCAN_COMM_REGISTER_CLASS * mask_or = + i_chip->getRegister("RDFFIR_MASK_OR"); + + mask_or->SetBit(17); + mask_or->SetBit(37); + + o_rc = mask_or->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on 0x%08x", + i_chip->getHuid() ); + break; + } + } + + // Get the DIMM select. + uint8_t ds = i_rank.getDimmSlct(); + + // Initialize threshold if it doesn't exist yet. + if ( 0 == db->iv_iueTh.count(ds) ) + { + db->iv_iueTh[ds] = TimeBasedThreshold( getIueTh() ); + } + + // Increment the count and check if at threshold. + if ( db->iv_iueTh[ds].inc(io_sc) ) + { + // Make the error log predictive. + io_sc.service_data->setServiceCall(); + + // The port fail will be triggered in the PostAnalysis plugin after + // the error log has been committed. + + // Mask off the entire port to avoid collateral. + o_rc = MemEcc::maskMemPort<TYPE_OCMB_CHIP>( i_chip ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "MemEcc::maskMemPort(0x%08x) failed", + i_chip->getHuid() ); + break; + } + } + + } while (0); + + #endif // __HOSTBOOT_MODULE + + return o_rc; + + #undef PRDF_FUNC +} + +//------------------------------------------------------------------------------ + +template<TARGETING::TYPE T> +uint32_t analyzeMainlineIue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ #define PRDF_FUNC "[MemEcc::analyzeMainlineIue] " PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MCA == i_chip->getType() ); + PRDF_ASSERT( T == i_chip->getType() ); uint32_t o_rc = SUCCESS; @@ -974,7 +1277,7 @@ uint32_t analyzeMainlineIue<TYPE_MCA>( ExtensibleChip * i_chip, // not likely that we will have two independent failure modes at the // same time. So we just assume the address is correct. MemAddr addr; - o_rc = getMemReadAddr<TYPE_MCA>( i_chip, MemAddr::READ_RCE_ADDR, addr ); + o_rc = getMemReadAddr<T>( i_chip, MemAddr::READ_RCE_ADDR, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemReadAddr(0x%08x, READ_RCE_ADDR) failed", @@ -983,7 +1286,7 @@ uint32_t analyzeMainlineIue<TYPE_MCA>( ExtensibleChip * i_chip, } MemRank rank = addr.getRank(); - o_rc = handleMemIue<TYPE_MCA>( i_chip, rank, io_sc ); + o_rc = handleMemIue<T>( i_chip, rank, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,m%ds%d) failed", @@ -998,16 +1301,23 @@ uint32_t analyzeMainlineIue<TYPE_MCA>( ExtensibleChip * i_chip, #undef PRDF_FUNC } +template +uint32_t analyzeMainlineIue<TYPE_MCA>( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t analyzeMainlineIue<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); + //------------------------------------------------------------------------------ -template<> -uint32_t analyzeMaintIue<TYPE_MCA>( ExtensibleChip * i_chip, - STEP_CODE_DATA_STRUCT & io_sc ) +template<TARGETING::TYPE T> +uint32_t analyzeMaintIue( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) { #define PRDF_FUNC "[MemEcc::analyzeMaintIue] " PRDF_ASSERT( nullptr != i_chip ); - PRDF_ASSERT( TYPE_MCA == i_chip->getType() ); + PRDF_ASSERT( T == i_chip->getType() ); uint32_t o_rc = SUCCESS; @@ -1015,7 +1325,7 @@ uint32_t analyzeMaintIue<TYPE_MCA>( ExtensibleChip * i_chip, { // Use the current address in the MCBMCAT. MemAddr addr; - o_rc = getMemMaintAddr<TYPE_MCA>( i_chip, addr ); + o_rc = getMemMaintAddr<T>( i_chip, addr ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "getMemMaintAddr(0x%08x) failed", @@ -1024,7 +1334,7 @@ uint32_t analyzeMaintIue<TYPE_MCA>( ExtensibleChip * i_chip, } MemRank rank = addr.getRank(); - o_rc = handleMemIue<TYPE_MCA>( i_chip, rank, io_sc ); + o_rc = handleMemIue<T>( i_chip, rank, io_sc ); if ( SUCCESS != o_rc ) { PRDF_ERR( PRDF_FUNC "handleMemIue(0x%08x,m%ds%d) failed", @@ -1039,6 +1349,13 @@ uint32_t analyzeMaintIue<TYPE_MCA>( ExtensibleChip * i_chip, #undef PRDF_FUNC } +template +uint32_t analyzeMaintIue<TYPE_MCA>( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); +template +uint32_t analyzeMaintIue<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ); + //------------------------------------------------------------------------------ template<> @@ -1152,6 +1469,117 @@ uint32_t analyzeImpe<TYPE_MCA>( ExtensibleChip * i_chip, #undef PRDF_FUNC } +template<> +uint32_t analyzeImpe<TYPE_OCMB_CHIP>( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & io_sc ) +{ + + #define PRDF_FUNC "[MemEcc::analyzeImpe] " + + PRDF_ASSERT( TYPE_OCMB_CHIP == i_chip->getType() ); + + uint32_t o_rc = SUCCESS; + + do + { + // get the mark shadow register + SCAN_COMM_REGISTER_CLASS * msr = i_chip->getRegister("EXP_MSR"); + + o_rc = msr->Read(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Read() failed on EXP_MSR: i_chip=0x%08x", + i_chip->getHuid() ); + break; + } + + TargetHandle_t trgt = i_chip->getTrgt(); + + // get galois field code - bits 8:15 of MSR + uint8_t galois = msr->GetBitFieldJustified( 8, 8 ); + + // get rank - bits 16:18 of MSR + uint8_t mrnk = msr->GetBitFieldJustified( 16, 3 ); + MemRank rank( mrnk ); + + // get symbol and DRAM + MemSymbol symbol = MemSymbol::fromGalois( trgt, rank, galois ); + if ( !symbol.isValid() ) + { + PRDF_ERR( PRDF_FUNC "Galois 0x%02x from EXP_MSR is invalid: 0x%08x," + "0x%02x", galois, i_chip->getHuid(), rank.getKey() ); + o_rc = FAIL; + break; + } + + // Add the DIMM to the callout list + MemoryMru memmru( trgt, rank, MemoryMruData::CALLOUT_RANK ); + io_sc.service_data->SetCallout( memmru ); + + #ifdef __HOSTBOOT_MODULE + // get data bundle from chip + OcmbDataBundle * db = getOcmbDataBundle( i_chip ); + uint8_t dram = symbol.getDram(); + + // Increment the count and check threshold. + if ( db->getImpeThresholdCounter()->inc(rank, dram, io_sc) ) + { + // Make the error log predictive if DRAM Repairs are disabled or if + // the number of DRAMs on this rank with IMPEs has reached threshold + if ( areDramRepairsDisabled() || + db->getImpeThresholdCounter()->queryDrams(rank, dram, io_sc) ) + { + io_sc.service_data->setServiceCall(); + } + else // Otherwise, place a chip mark on the failing DRAM. + { + MemMark chipMark( trgt, rank, galois ); + o_rc = MarkStore::writeChipMark<TYPE_OCMB_CHIP>( i_chip, rank, + chipMark ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "writeChipMark(0x%08x,0x%02x) failed", + i_chip->getHuid(), rank.getKey() ); + break; + } + + o_rc = MarkStore::chipMarkCleanup<TYPE_OCMB_CHIP>( i_chip, rank, + io_sc ); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "chipMarkCleanup(0x%08x,0x%02x) failed", + i_chip->getHuid(), rank.getKey() ); + break; + } + } + } + + // If a predictive callout is made, mask both mainline and maintenance + // attentions. + if ( io_sc.service_data->queryServiceCall() ) + { + SCAN_COMM_REGISTER_CLASS * mask + = i_chip->getRegister( "RDFFIR_MASK_OR" ); + mask->SetBit(19); // mainline + mask->SetBit(39); // maintenance + o_rc = mask->Write(); + if ( SUCCESS != o_rc ) + { + PRDF_ERR( PRDF_FUNC "Write() failed on RDFFIR_MASK_OR: " + "0x%08x", i_chip->getHuid() ); + break; + } + } + #endif // __HOSTBOOT_MODULE + + } while (0); + + + return o_rc; + + #undef PRDF_FUNC +} + //------------------------------------------------------------------------------ template<> |