diff options
author | Dan Crowell <dcrowell@us.ibm.com> | 2017-02-22 14:50:21 -0600 |
---|---|---|
committer | Daniel M. Crowell <dcrowell@us.ibm.com> | 2017-03-13 16:11:42 -0400 |
commit | 21cd4b1c1d176338fac5015c235566e3ff10ab2b (patch) | |
tree | a7d1d2d051b5db861a56777f44378e67939c7e88 /src/usr/scom | |
parent | 86bae1c698cfef64d050b915654d9f3e03fb9ae3 (diff) | |
download | talos-hostboot-21cd4b1c1d176338fac5015c235566e3ff10ab2b.tar.gz talos-hostboot-21cd4b1c1d176338fac5015c235566e3ff10ab2b.zip |
Fill in P9-specific scom error handling
Adjusted address-specific register gathering to reflect P9
chip logic
Fixed some error handling bugs in the testcases
Added verbosity to error log parser
Change-Id: Iad274e8333adb32deacffd3cb92e40f11c48f884
RTC: 158541
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/37122
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
Diffstat (limited to 'src/usr/scom')
-rw-r--r-- | src/usr/scom/plugins/errludP_scom.H | 35 | ||||
-rw-r--r-- | src/usr/scom/scom.C | 212 | ||||
-rw-r--r-- | src/usr/scom/test/scomtest.H | 26 |
3 files changed, 198 insertions, 75 deletions
diff --git a/src/usr/scom/plugins/errludP_scom.H b/src/usr/scom/plugins/errludP_scom.H index 6a819a4ec..f3a704cf6 100644 --- a/src/usr/scom/plugins/errludP_scom.H +++ b/src/usr/scom/plugins/errludP_scom.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2016 */ +/* Contributors Listed Below - COPYRIGHT 2016,2017 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -73,17 +73,30 @@ namespace SCOM ErrlUsrParser & i_parser, void * i_pBuffer, const uint32_t i_buflen) const - { - char* l_databuf = static_cast<char*>(i_pBuffer); - i_parser.PrintHeading("SCOM PIB ERR"); + { + char* l_databuf = static_cast<char*>(i_pBuffer); + i_parser.PrintHeading("SCOM PIB ERR"); - //***** Memory Layout ***** - // 1 bytes : Pib Error + //***** Memory Layout ***** + // 1 bytes : Pib Error - i_parser.PrintNumber("Pib Err","%.2lX", - TO_UINT8(l_databuf)); - - } + const char* l_decodeStr = "Unknown"; + uint8_t l_piberr = TO_UINT8(l_databuf); + switch(l_piberr) + { + case(0): l_decodeStr = "None"; break; + case(1): l_decodeStr = "Resource Occupied"; break; + case(2): l_decodeStr = "Chiplet Offline"; break; + case(3): l_decodeStr = "Partial Good"; break; + case(4): l_decodeStr = "Invalid Address"; break; + case(5): l_decodeStr = "Clock Error"; break; + case(6): l_decodeStr = "Parity Error"; break; + case(7): l_decodeStr = "Timeout"; break; + } + char l_outputStr[30]; + sprintf( l_outputStr, "%d (%s)", l_piberr, l_decodeStr ); + i_parser.PrintString("Pib Err", l_outputStr ); + } // Disabled UdParserPib(const UdParserPib&) = delete; @@ -92,4 +105,4 @@ namespace SCOM } -#endif
\ No newline at end of file +#endif diff --git a/src/usr/scom/scom.C b/src/usr/scom/scom.C index fb766f50d..b52c33ba0 100644 --- a/src/usr/scom/scom.C +++ b/src/usr/scom/scom.C @@ -803,7 +803,7 @@ errlHndl_t doScomOp(DeviceFW::OperationType i_opType, //Add some additional FFDC based on the specific operation if( l_err ) { - //TODO for P9 RTC 167311 addScomFailFFDC( l_err, i_target, i_addr ); + addScomFailFFDC( l_err, i_target, i_addr ); } return l_err; @@ -813,104 +813,216 @@ errlHndl_t doScomOp(DeviceFW::OperationType i_opType, /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// void addScomFailFFDC( errlHndl_t i_err, - TARGETING::Target* i_target, + TARGETING::Target* i_chipTarg, uint64_t i_addr ) { // Read some error regs from scom - ERRORLOG::ErrlUserDetailsLogRegister l_scom_data(i_target); + ERRORLOG::ErrlUserDetailsLogRegister l_scom_data(i_chipTarg); bool addit = false; TARGETING::TYPE l_type = TARGETING::TYPE_NA; - if( i_target == TARGETING::MASTER_PROCESSOR_CHIP_TARGET_SENTINEL ) + uint32_t l_badChiplet = 0x00; + + static bool l_insideFFDC = false; + if( l_insideFFDC ) + { + TRACDCOMP( g_trac_scom, "Already gathering FFDC..." ); + return; + } + l_insideFFDC = true; + + if( i_chipTarg == TARGETING::MASTER_PROCESSOR_CHIP_TARGET_SENTINEL ) { l_type = TARGETING::TYPE_PROC; } else { - l_type = i_target->getAttr<TARGETING::ATTR_TYPE>(); + l_type = i_chipTarg->getAttr<TARGETING::ATTR_TYPE>(); } - //PBA scoms on the processor - if( ((i_addr & 0xFFFFF000) == 0x00064000) + //Multicast scoms on the processor + if( p9_scom_addr(i_addr).is_multicast() && (TARGETING::TYPE_PROC == l_type) ) { addit = true; - //look for hung operations on the PBA - uint64_t ffdc_regs[] = { - //grab the PBA buffers in case something is hung - 0x02010850, //PBARBUFVAL0 - 0x02010851, //PBARBUFVAL1 - 0x02010852, //PBARBUFVAL2 - 0x02010858, //PBAWBUFVAL0 - 0x02010859, //PBAWBUFVAL1 + uint64_t ffdc_regs1[] = { + 0x000F001E, // PCBMS.FIRST_ERR_REG + 0x000F001F, // PCBMS.ERROR_REG + }; + for( size_t x = 0; + x < (sizeof(ffdc_regs1)/sizeof(ffdc_regs1[0])); + x++ ) + { + l_scom_data.addData(DEVICE_SCOM_ADDRESS(ffdc_regs1[x])); + } - 0x020F0012, //PB_GP3 (has fence information) + uint64_t ffdc_regs2[] = { + 0x000F0011, // PCBMS.REC_ERR_REG0 + 0x000F0012, // PCBMS.REC_ERR_REG1 + 0x000F0013, // PCBMS.REC_ERR_REG2 + 0x000F0014, // PCBMS.REC_ERR_REG3 }; - for( size_t x = 0; x < (sizeof(ffdc_regs)/sizeof(ffdc_regs[0])); x++ ) + + // save off the responses to figure out which chiplet failed + uint8_t l_responses[(sizeof(ffdc_regs2)/sizeof(ffdc_regs2[0])) + *sizeof(uint64_t)]; + uint8_t* l_respPtr = l_responses; + + for( size_t x = 0; + x < (sizeof(ffdc_regs2)/sizeof(ffdc_regs2[0])); + x++ ) { - l_scom_data.addData(DEVICE_SCOM_ADDRESS(ffdc_regs[x])); + // going to read these manually because we want to look at the data + uint64_t l_scomdata = 0; + size_t l_scomsize = sizeof(l_scomdata); + errlHndl_t l_ignored = doScomOp( DeviceFW::READ, + i_chipTarg, + &l_scomdata, + l_scomsize, + DeviceFW::SCOM, + ffdc_regs2[x] ); + if( l_ignored ) + { + delete l_ignored; + l_scomdata = 0; + } + else + { + l_scom_data.addDataBuffer( &l_scomdata, + l_scomsize, + DEVICE_SCOM_ADDRESS(ffdc_regs2[x]) ); + } + + // copy the error data into our big buffer + memcpy( l_respPtr, &l_scomdata, l_scomsize ); + l_respPtr += l_scomsize; // move to the next chunk } - } - //EX scoms on the processor (not including PCB slave regs) - else if( ((i_addr & 0xF0000000) == 0x10000000) - && ((i_addr & 0x00FF0000) != 0x000F0000) - && (TARGETING::TYPE_PROC == l_type) ) - { - addit = true; - uint64_t ex_offset = 0xFF000000 & i_addr; - //grab some data related to the PCB slave state - uint64_t ffdc_regs[] = { - 0x0F010B, //Special Wakeup - 0x0F0012, //GP3 - 0x0F0100, //PowerManagement GP0 - 0x0F0106, //PFET Status Core - 0x0F010E, //PFET Status ECO - 0x0F0111, //PM State History + + // find the bad chiplet + // 4-bits per chiplet : 1-bit response, 3-bit error code + for( size_t x = 0; x < sizeof(l_responses); x++ ) + { + // look for the first non-zero pib error code + if( l_responses[x] & 0x70 ) //front nibble + { + l_badChiplet = x*2; + } + else if( l_responses[x] & 0x07 ) //back nibble + { + l_badChiplet = x*2 + 1; + } + } + + uint64_t ffdc_regs3[] = { + 0x0F0001, // multicast group1 + 0x0F0002, // multicast group2 + 0x0F0003, // multicast group3 + 0x0F0004, // multicast group4 }; - for( size_t x = 0; x < (sizeof(ffdc_regs)/sizeof(ffdc_regs[0])); x++ ) + for( size_t x = 0; + x < (sizeof(ffdc_regs3)/sizeof(ffdc_regs3[0])); + x++ ) { - l_scom_data.addData(DEVICE_SCOM_ADDRESS(ex_offset|ffdc_regs[x])); + p9_scom_addr l_scom(ffdc_regs3[x]); + l_scom.set_chiplet_id(l_badChiplet); + l_scom_data.addData(DEVICE_SCOM_ADDRESS(l_scom.get_addr())); } } //Any non-PCB Slave and non TP reg on the processor - if( ((i_addr & 0x00FF0000) != 0x000F0000) - && ((i_addr & 0xFF000000) != 0x00000000) + if( ((i_addr & 0x00FF0000) != 0x000F0000) //PCB slave + && (p9_scom_addr(i_addr).get_chiplet_id() != 0x00) //TP && (TARGETING::TYPE_PROC == l_type) ) { addit = true; - uint64_t chiplet_offset = 0xFF000000 & i_addr; + if( l_badChiplet == 0x00 ) + { + l_badChiplet = p9_scom_addr(i_addr).get_chiplet_id(); + } //grab some data related to the PCB slave state uint64_t ffdc_regs[] = { - 0x0F0012, //GP3 - 0x0F001F, //Error capture reg + 0x0F001F, // PCBSL<cplt>.ERROR_REG + 0x03000F, // CC.<chiplet>.ERROR_STATUS + 0x010001, // <chiplet>.PSC.PSCOM_STATUS_ERROR_REG + 0x010002, // <chiplet>.PSC.PSCOM_ERROR_MASK }; for( size_t x = 0; x < (sizeof(ffdc_regs)/sizeof(ffdc_regs[0])); x++ ) { - l_scom_data.addData( DEVICE_SCOM_ADDRESS( - chiplet_offset|ffdc_regs[x]) ); + p9_scom_addr l_scom(ffdc_regs[x]); + l_scom.set_chiplet_id(l_badChiplet); + l_scom_data.addData(DEVICE_SCOM_ADDRESS(l_scom.get_addr())); } - //grab the clock/osc regs - l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x00050019)); - l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x0005001A)); + //Osc Switch Sense 1 register + l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x0005001D)); + //Osc Switch Sense 2 register + l_scom_data.addData(DEVICE_SCOM_ADDRESS(0x0005001E)); //grab the clock regs via FSI too, just in case - if (i_target != TARGETING::MASTER_PROCESSOR_CHIP_TARGET_SENTINEL) + if (i_chipTarg != TARGETING::MASTER_PROCESSOR_CHIP_TARGET_SENTINEL) { TARGETING::Target* mproc = NULL; TARGETING::targetService().masterProcChipTargetHandle(mproc); - if (i_target != mproc) + if (i_chipTarg != mproc) { - l_scom_data.addData(DEVICE_FSI_ADDRESS(0x2864));//==2819 - l_scom_data.addData(DEVICE_FSI_ADDRESS(0x2868));//==281A + l_scom_data.addData(DEVICE_FSI_ADDRESS(0x2874));//==281D + l_scom_data.addData(DEVICE_FSI_ADDRESS(0x2878));//==281E } } } + //PBA scoms on the processor + if( ((i_addr & 0xFFFFF000) == 0x00068000) + && (TARGETING::TYPE_PROC == l_type) ) + { + addit = true; + //look for hung operations on the PBA + uint64_t ffdc_regs[] = { + //grab the PBA buffers in case something is hung + 0x05012850, //PBARBUFVAL0 + 0x05012851, //PBARBUFVAL1 + 0x05012852, //PBARBUFVAL2 + 0x05012853, //PBARBUFVAL3 + 0x05012854, //PBARBUFVAL4 + 0x05012855, //PBARBUFVAL5 + 0x05012858, //PBAWBUFVAL0 + 0x05012859, //PBAWBUFVAL1 + }; + for( size_t x = 0; x < (sizeof(ffdc_regs)/sizeof(ffdc_regs[0])); x++ ) + { + l_scom_data.addData(DEVICE_SCOM_ADDRESS(ffdc_regs[x])); + } + } + //Core/EX/EQ scoms on the processor (not including PCB slave regs) + else if( (((i_addr & 0xF0000000) == 0x10000000) //CACHE + || ((i_addr & 0xF0000000) == 0x20000000)) //CORE + && ((i_addr & 0x00FF0000) != 0x000F0000) //PCB slave + && (TARGETING::TYPE_PROC == l_type) ) + { + addit = true; + uint8_t l_badChiplet = p9_scom_addr(i_addr).get_chiplet_id(); + //grab some data related to the PCB slave state + uint64_t ffdc_regs[] = { + 0x0F010A, //Special Wakeup Other + 0x0F010B, //Special Wakeup FSP + 0x0F010C, //Special Wakeup OCC + 0x0F010D, //Special Wakeup HYP + 0x0F0111, //PM State History FSP + }; + for( size_t x = 0; x < (sizeof(ffdc_regs)/sizeof(ffdc_regs[0])); x++ ) + { + p9_scom_addr l_scom(ffdc_regs[x]); + l_scom.set_chiplet_id(l_badChiplet); + l_scom_data.addData(DEVICE_SCOM_ADDRESS(l_scom.get_addr())); + } + } + + if( addit ) { l_scom_data.addToLog(i_err); } + + l_insideFFDC = false; } diff --git a/src/usr/scom/test/scomtest.H b/src/usr/scom/test/scomtest.H index 1358a5f4a..eb712665e 100644 --- a/src/usr/scom/test/scomtest.H +++ b/src/usr/scom/test/scomtest.H @@ -472,16 +472,16 @@ public: DEVICE_SCOM_ADDRESS(test_data[x].addr) ); if(!test_data[x].isFail && l_err ) { - TRACFCOMP(g_trac_scom, "ScomTest::test_IndirectScom_proc> [%d] Write: Error from device : addr=0x%X, RC=%X", x, test_data[x].addr, l_err->reasonCode() ); - TS_FAIL( "ScomTest::test_IndirectScom_proc> ERROR : Unexpected error log from device write: addr=0x%X, RC=%X ", test_data[x].addr, l_err->reasonCode() ); + TRACFCOMP(g_trac_scom, "ScomTest::test_IndirectScom> [%d] Write: Error from device : addr=0x%X, RC=%X", x, test_data[x].addr, l_err->reasonCode() ); + TS_FAIL( "ScomTest::test_IndirectScom> ERROR : Unexpected error log from device write: addr=0x%X, RC=%X ", test_data[x].addr, l_err->reasonCode() ); fails++; errlCommit(l_err,SCOM_COMP_ID); l_err = NULL; } else if(test_data[x].isFail && !l_err ) { - TRACFCOMP(g_trac_scom, "ScomTest::test_IndirectScom_proc> [%d] Write: Expected an Error from device write: addr=0x%X", x, test_data[x].addr ); - TS_FAIL( "ScomTest::test_IndirectScom_proc> ERROR : Expected an error log from device write and did not get one : addr=0x%X", test_data[x].addr ); + TRACFCOMP(g_trac_scom, "ScomTest::test_IndirectScom> [%d] Write: Expected an Error from device write: addr=0x%X", x, test_data[x].addr ); + TS_FAIL( "ScomTest::test_IndirectScom> ERROR : Expected an error log from device write and did not get one : addr=0x%X", test_data[x].addr ); fails++; } else if(l_err) @@ -515,16 +515,16 @@ public: if(!test_data[x].isFail && l_err ) { - TRACFCOMP(g_trac_scom, "ScomTest::test_IndirectScomreadWrite_proc> [%d] Read: Error from device : addr=0x%X, RC=%X", x, test_data[x].addr, l_err->reasonCode() ); - TS_FAIL( "ScomTest::test_IndirectScomreadWrite_proc> ERROR : Unexpected error log from read device : addr=0x%X, RC=%X", test_data[x].addr, l_err->reasonCode() ); + TRACFCOMP(g_trac_scom, "ScomTest::test_IndirectScomreadWrite> [%d] Read: Error from device : addr=0x%X, RC=%X", x, test_data[x].addr, l_err->reasonCode() ); + TS_FAIL( "ScomTest::test_IndirectScomreadWrite> ERROR : Unexpected error log from read device : addr=0x%X, RC=%X", test_data[x].addr, l_err->reasonCode() ); fails++; errlCommit(l_err,SCOM_COMP_ID); l_err = NULL; } else if(test_data[x].isFail && !l_err ) { - TRACFCOMP(g_trac_scom, "ScomTest::test_IndirectScom_proc> [%d] Read: Expected an Error from device read : addr=0x%X", x, test_data[x].addr ); - TS_FAIL( "ScomTest::test_IndirectScom_proc> ERROR : Expected an error log from device read and did not get one : addr=0x%X", test_data[x].addr ); + TRACFCOMP(g_trac_scom, "ScomTest::test_IndirectScom> [%d] Read: Expected an Error from device read : addr=0x%X", x, test_data[x].addr ); + TS_FAIL( "ScomTest::test_IndirectScom> ERROR : Expected an error log from device read and did not get one : addr=0x%X", test_data[x].addr ); fails++; } else if(!test_data[x].isFail && @@ -532,8 +532,8 @@ public: (test_data[x].data & 0x000000000000FFFF)) ) { - TRACFCOMP(g_trac_scom, "ScomTest::test_IndirectScomreadWrite_proc> [%d] Read: Data miss-match : addr=0x%X, read_data=0x%llx, write_data=0x%llx", x, test_data[x].addr, read_data[x], test_data[x].data); - TS_FAIL( "ScomTest::test_IndirectScomreadWrite_proc> ERROR : Data miss-match between read and expected data read_data" ); + TRACFCOMP(g_trac_scom, "ScomTest::test_IndirectScomreadWrite> [%d] Read: Data miss-match : addr=0x%X, read_data=0x%llx, write_data=0x%llx", x, test_data[x].addr, read_data[x], test_data[x].data); + TS_FAIL( "ScomTest::test_IndirectScomreadWrite> ERROR : Data miss-match between read and expected data read_data" ); fails++; } else if(l_err) @@ -544,7 +544,7 @@ public: } - TRACFCOMP( g_trac_scom, "ScomTest::test_IndirectScomreadWrite_proc> %d/%d fails", fails, total ); + TRACFCOMP( g_trac_scom, "ScomTest::test_IndirectScomreadWrite> %d/%d fails", fails, total ); } /** @@ -1117,8 +1117,6 @@ public: TS_FAIL( "ScomTest::test_P9_translate> ERROR : Unexpected error log from write1" ); fails++; errlCommit(l_err,SCOM_COMP_ID); - - delete l_err; } else if(l_err == NULL && test_data[i].expectError) { @@ -1139,7 +1137,7 @@ public: else if(l_err && test_data[i].expectError) { delete l_err; - TRACFCOMP(g_trac_scom, "ScomTest::test_P9_translate_scom> Previous error expected"); + TRACFCOMP(g_trac_scom, "ScomTest::test_P9_translate_scom> Previous error expected"); } } else |