diff options
author | Benjamin Weisenbeck <bweisenb@us.ibm.com> | 2018-03-23 09:59:08 -0500 |
---|---|---|
committer | Zane C. Shelley <zshelle@us.ibm.com> | 2018-04-20 10:32:50 -0400 |
commit | f852aab7838c9402a32b09905851381e5b08f1ae (patch) | |
tree | 4086e52af5b9f95feb914f353d4f328c763ddbd4 /src/usr/diag/prdf/common | |
parent | c06a321964200ef77090b1b0e4e11ee903088508 (diff) | |
download | talos-hostboot-f852aab7838c9402a32b09905851381e5b08f1ae.tar.gz talos-hostboot-f852aab7838c9402a32b09905851381e5b08f1ae.zip |
PRD: TOD fault analysis
Change-Id: Iebc9e781756bb321f660fcbc1d20bbad4a1f4f61
RTC: 145750
CQ: SW423770
Backport: release-fips910
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/56673
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com>
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57501
CI-Ready: Zane C. Shelley <zshelle@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Diffstat (limited to 'src/usr/diag/prdf/common')
10 files changed, 1641 insertions, 162 deletions
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule b/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule index 128fdf716..669d3e5b5 100644 --- a/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule +++ b/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule @@ -151,6 +151,12 @@ actionclass self_th_1 threshold1; }; +actionclass self_H_th_1 +{ + calloutSelfHigh; + threshold1; +}; + actionclass self_th_5perHour { calloutSelfMed; diff --git a/src/usr/diag/prdf/common/plat/p9/p9_cumulus.rule b/src/usr/diag/prdf/common/plat/p9/p9_cumulus.rule index f8c36f52e..ea5139e97 100644 --- a/src/usr/diag/prdf/common/plat/p9/p9_cumulus.rule +++ b/src/usr/diag/prdf/common/plat/p9/p9_cumulus.rule @@ -2132,17 +2132,17 @@ group gTP_LFIR filter singlebit, cs_root_cause /** TP_LFIR[18] * TOD Logic: Summerized internal errors */ - (rTP_LFIR, bit(18)) ? defaultMaskedError; + (rTP_LFIR, bit(18)) ? analyzeTodBackupTopology; /** TP_LFIR[19] * TOD Logic: PIB Slave access errors */ - (rTP_LFIR, bit(19)) ? defaultMaskedError; + (rTP_LFIR, bit(19)) ? analyzePibError; /** TP_LFIR[20] - * TOD Logic: UNUSED in P9 + * TOD Logic: Error report from PHYP */ - (rTP_LFIR, bit(20)) ? defaultMaskedError; + (rTP_LFIR, bit(20)) ? analyzePhypTodError; /** TP_LFIR[21] * PCB slave Unmasked err summary diff --git a/src/usr/diag/prdf/common/plat/p9/p9_nimbus.rule b/src/usr/diag/prdf/common/plat/p9/p9_nimbus.rule index b06d9ff40..175dd66d3 100644 --- a/src/usr/diag/prdf/common/plat/p9/p9_nimbus.rule +++ b/src/usr/diag/prdf/common/plat/p9/p9_nimbus.rule @@ -2112,17 +2112,17 @@ group gTP_LFIR filter singlebit, cs_root_cause /** TP_LFIR[18] * TOD Logic: Summerized internal errors */ - (rTP_LFIR, bit(18)) ? defaultMaskedError; + (rTP_LFIR, bit(18)) ? analyzeTodBackupTopology; /** TP_LFIR[19] * TOD Logic: PIB Slave access errors */ - (rTP_LFIR, bit(19)) ? defaultMaskedError; + (rTP_LFIR, bit(19)) ? 
analyzePibError; /** TP_LFIR[20] - * TOD Logic: UNUSED in P9 + * TOD Logic: Error report from PHYP */ - (rTP_LFIR, bit(20)) ? defaultMaskedError; + (rTP_LFIR, bit(20)) ? analyzePhypTodError; /** TP_LFIR[21] * PCB slave Unmasked err summary diff --git a/src/usr/diag/prdf/common/plat/p9/p9_proc_common_actions.rule b/src/usr/diag/prdf/common/plat/p9/p9_proc_common_actions.rule index 815a299fb..6c5f6d0e6 100644 --- a/src/usr/diag/prdf/common/plat/p9/p9_proc_common_actions.rule +++ b/src/usr/diag/prdf/common/plat/p9/p9_proc_common_actions.rule @@ -23,6 +23,294 @@ # # IBM_PROLOG_END_TAG +######################################################################## +# +# TOD Rules and Groups +# +######################################################################## + +rule TodErrors +{ + TOD_ERRORREGISTER & (~TOD_ERRORMASK) & (~TOD_ERRORACTION); +}; + + +group gTodErrors filter singlebit +{ + /** TOD_ERRORREGISTER[0] + * M_PATH_CONTROL_REG_DATA_PARITY_ERROR + */ + (TodErrors,bit(0)) ? defaultMaskedError; + + /** TOD_ERRORREGISTER[1] + * M_PATH_0_PARITY_ERROR + */ + (TodErrors,bit(1)) ? selfCapThr32TopReConfig; + + /** TOD_ERRORREGISTER[2] + * M_PATH_1_PARITY_ERROR + */ + (TodErrors,bit(2)) ? selfCapThr32TopReConfig; + + /** TOD_ERRORREGISTER[3] + * PCRP0_DATA_PARITY_ERROR + */ + (TodErrors,bit(3)) ? defaultMaskedError; + + /** TOD_ERRORREGISTER[4] + * PCRP1_DATA_PARITY_ERROR + */ + (TodErrors,bit(4)) ? defaultMaskedError; + + /** TOD_ERRORREGISTER[5] + * SCRP0_DATA_PARITY_ERROR + */ + (TodErrors,bit(5)) ? defaultMaskedError; + + /** TOD_ERRORREGISTER[6] + * SCRP1_DATA_PARITY_ERROR + */ + (TodErrors,bit(6)) ? defaultMaskedError; + + /** TOD_ERRORREGISTER[7] + * SPCR_DATA_PARITY_ERROR + */ + (TodErrors,bit(7)) ? defaultMaskedError; + + /** TOD_ERRORREGISTER[8] + * IPCR_DATA_PARITY_ERROR + */ + (TodErrors,bit(8)) ? defaultMaskedError; + + /** TOD_ERRORREGISTER[9] + * PSMSCR_DATA_PARITY_ERROR + */ + (TodErrors,bit(9)) ? 
defaultMaskedError; + + /** TOD_ERRORREGISTER[10] + * S_PATH_0_PARITY_ERROr + */ + (TodErrors,bit(10)) ? selfCapThr32TopReConfig; + + /** TOD_ERRORREGISTER[11] + * REG_0X08_DATA_PARITY_ERROR + */ + (TodErrors,bit(11)) ? selfCaptThr32; + + + /** TOD_ERRORREGISTER[12] + * M_PATH_STATUS_REG_DATA_PARITY_ERROR + */ + (TodErrors,bit(12)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[13] + * S_PATH_STATUS_REG_DATA_PARITY_ERROR + */ + (TodErrors,bit(13)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[14] + * M_PATH_0_STEP_CHECK_ERROR + */ + (TodErrors,bit(14)) ? analyzeStepCheckErr; + + /** TOD_ERRORREGISTER[15] + * M_PATH_1_STEP_CHECK_ERROR + */ + (TodErrors,bit(15)) ? analyzeStepCheckErr; + + /** TOD_ERRORREGISTER[16] + * S_PATH_0_STEP_CHECK_ERROR + */ + (TodErrors,bit(16)) ? analyzeStepCheckErr; + + /** TOD_ERRORREGISTER[17] + * I_PATH_STEP_CHECK_ERROR + */ + (TodErrors,bit(17)) ? analyzeStepCheckErr; + + /** TOD_ERRORREGISTER[18] + * PSS HAMMING DISTANCE + */ + (TodErrors,bit(18)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[19] + * MISC_RESET_REG_DATA_PARITY_ERROR + */ + (TodErrors,bit(19)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[20] + * S_PATH_0_PARITY_ERROR + */ + (TodErrors,bit(20)) ? selfCapThr32TopReConfig; + + /** TOD_ERRORREGISTER[21] + * S_PATH_1_STEP_CHECK_ERROR + */ + (TodErrors,bit(21)) ? analyzeStepCheckErr; + + /** TOD_ERRORREGISTER[22] + * I_PATH_DELAY_STEP_CHECK_PARITY_ERROR + */ + (TodErrors,bit(22)) ? selfCapThr32TopReConfig; + + /** TOD_ERRORREGISTER[23] + * REG_0X0C DATA_PARITY ERROR + */ + (TodErrors,bit(23)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[24] + * REG_0X11_0X12_0X13_0X14_0X15_0X16_DATA_PARITY_ERROR + */ + (TodErrors,bit(24)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[25] + * REG_0X17_0X18_0X21_0X22_DATA_PARITY_ERROR + */ + (TodErrors,bit(25)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[26] + * REG_0X1D_0X1E_0X1F_DATA_PARITY_ERROR + */ + (TodErrors,bit(26)) ? 
selfCaptThr32; + + /** TOD_ERRORREGISTER[27] + * TIMER_VALUE_REG_DATA_PARITY_ERROR + */ + (TodErrors,bit(27)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[28] + * LOW_ORDER_STEP_REG_DATA_PARITY_ERROR + */ + (TodErrors,bit(28)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[29] + * FSM_REG_DATA_PARITY_ERROR + */ + (TodErrors,bit(29)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[30] + * RX_TTYPE_CONTROL_REG_DATA_PARITY_ERROR + */ + (TodErrors,bit(30)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[31] + * REG_0X30_0X31_0X32_0X33_DATA_PARITY_ERROR + */ + (TodErrors,bit(31)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[32] + * CHIP_CONTROL_REG_DATA_PARITY_ERROR + */ + (TodErrors,bit(32)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[33] + * I_PATH_SYNC_CHECK_ERROR + */ + (TodErrors,bit(33)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[34] + * I_PATH_FSM_STATE_PARITY_ERROR + */ + (TodErrors,bit(34)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[35] + * I_PATH_TIME_REG_PARITY_ERROR + */ + (TodErrors,bit(35)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[36] + * I_PATH_TIME_REG_OVERFLOW + */ + (TodErrors,bit(36)) ? maskTodError; + + /** TOD_ERRORREGISTER[37] + * WOF_LOW_ORDER_STEP_COUNTER_PARITY_ERROR + */ + (TodErrors,bit(37)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[38|39|40|41|42|43] + * RX_TTYPE_1 + */ + (TodErrors,bit(38|39|40|41|42|43)) ? defaultMaskedError; + + #Note: For firmware all the TOD-PIB errors are informational by nature.So, + # not doing any special analysis. + /** TOD_ERRORREGISTER[44] + * PIB_SLAVE_ADDR_INVALID_ERROR + */ + (TodErrors,bit(44)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[45] + * PIB_SLAVE_WRITE_INVALID_ERROR + */ + (TodErrors,bit(45)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[46] + * PIB_SLAVE_READ_INVALID_ERROR + */ + (TodErrors,bit(46)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[47] + * PIB_SLAVE_ADDR_PARITY_ERROR + */ + (TodErrors,bit(47)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[48] + * PIB_SLAVE_DATA_PARITY_ERROR + */ + (TodErrors,bit(48)) ? 
selfCaptThr32; + + /** TOD_ERRORREGISTER[49] + * TTYPE_CONTROL_REG_DATA_PARITY_ERROR + */ + #Note: Based on discussion with with Hardware Team and PHYP, this error + #shall be routed to PHYP instead of FSP + (TodErrors,bit(49)) ? defaultMaskedError; + + /** TOD_ERRORREGISTER[50|51|52] + * PIB_MASTER_RSP_INFO_ERROR + */ + #ignoring TOD-PIB errors for any special analysis.Since errors are + #informational by nature. + (TodErrors,bit( 50|51|52 )) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[53] + * RX_TTYPE_INVALID_ERROR + */ + (TodErrors,bit(53 )) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[54] + * RX_TTYPE_4_DATA_PARITY_ERROR + */ + (TodErrors,bit(54)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[55] + * PIB_MASTER_REQUEST_ERROR + */ + (TodErrors,bit(55)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[56] + * PIB_RESET_DURING_PIB_ACCESS_ERROR + */ + (TodErrors,bit(56)) ? selfCaptThr32; + + /** TOD_ERRORREGISTER[57] + * EXTERNAL_XSTOP_ERROR + */ + #bit tells us that TOD has received an external check stop + #purpose is to merely provide an information. Hence not doing any + #analysis. + (TodErrors,bit(57)) ? defaultMaskedError; + + #bit[58:63] not implemented + /** TOD_ERRORREGISTER[58|59|60|61|62|63] + * SPARE_ERROR + */ + (TodErrors,bit(58|59|60|61|62|63)) ? defaultMaskedError; + +}; + + actionclass level2_M_self_L_th_32perDay { callout2ndLvlMed; @@ -192,3 +480,97 @@ actionclass calloutBusInterface_obus3_th_1 threshold1; }; +# TOD Actions: +# * Capture at least this chip TOD registers. +# +# * Threshold normal TOD errors (TOD error register) at 32/day. +# +# * Network Errors : Step Check Fault or "PHYP Failed Topology" +# - PHYP Failed Topology must be visible and "Request new Topology". +# - May have PHYP failure on another chip. +# - Capture TOD registers for whole system. +# - Isolate both topologies and make callout. + +/** + * Analyze TOD Register. 
+ */ +actionclass TodReportByRegister +{ + try(analyze(gTodErrors),TodRegisterAnalyzeFail); +}; + +actionclass TodRegisterAnalyzeFail +{ + capture(TODReg); + self_H_th_1; +}; + +/** + * PHYP Network fault. + */ +actionclass TodReportByPHYP +{ + threshold1; + funccall("todStepCheckFault"); +}; + +/** + * TOD Step Check Fault - Isolate topology. + */ +actionclass analyzeStepCheckErr +{ + threshold32pday; + funccall("todStepCheckFault"); +}; + +/** action for tod errors which do not need any specific analysis */ + +actionclass selfCaptThr32 +{ + calloutSelfHigh; + capture(TODReg); + threshold32pday; +}; + +/** + * Mask indication from PHYP due to all cores evacuated. + * - Mask TOD errors from this chip. + * - Not visible unless xstp. + * - Request new topology if chip is MDMT. + */ +actionclass maskTodError +{ + self_H_th_1; + capture(TODReg); + funccall("ClearServiceCallFlag"); + funccall("todNewTopologyIfBackupMDMT"); +}; + +/** callout Proc reporting error. If threshold reaches 32 per day, request + * reconfiguration of topology.
+ */ +actionclass selfCapThr32TopReConfig +{ + selfCaptThr32; + funccall("requestTopologySwitch"); +}; + +/** analyzes backup topology if TOD error analysis is enabled */ +actionclass analyzeTodBackupTopology +{ + try( funccall("isTodDisabled"), TodReportByRegister ); +}; + +/** callout and gard self if TOD error analysis is enabled */ +actionclass analyzePibError +{ + capture(TODReg); + try( funccall("isTodDisabled"), self_H_th_1 ); +}; + +/** analyzes active topology if TOD error analysis is enabled */ +actionclass analyzePhypTodError +{ + try( funccall("isTodDisabled"), TodReportByPHYP ); +}; + diff --git a/src/usr/diag/prdf/common/plat/p9/p9_proc_common_regs.rule b/src/usr/diag/prdf/common/plat/p9/p9_proc_common_regs.rule index e35f1c778..34be4c09e 100644 --- a/src/usr/diag/prdf/common/plat/p9/p9_proc_common_regs.rule +++ b/src/usr/diag/prdf/common/plat/p9/p9_proc_common_regs.rule @@ -118,209 +118,262 @@ }; ############################################################################ - # P9 PROC target HDCT additions (open power chkstop analysis) + # TOD Registers ############################################################################ - - register OCC_ERROR_REPORT_REG + register TOD_MPCR { - name "OCC ERROR REPORT REG"; - scomaddr 0x0101080a; - capture group default; + name "TOD M Path Ctrl"; + scomaddr 0x00040000; + capture group TODReg; }; - register PB_ERROR_REPORT + register TOD_PCRP0 { - name "PB ERROR REPORT REG"; - scomaddr 0x020110a1; - capture group default; + name "TOD Pri Port 0 Ctrl"; + scomaddr 0x00040001; + capture group TODReg; }; - register PB_PTY_ERROR_REPORT + register TOD_PCRP1 { - name "PB PTY ERROR REPORT REG"; - scomaddr 0x020110a2; - capture group default; + name "TOD Pri Port 1 Ctrl"; + scomaddr 0x00040002; + capture group TODReg; }; - register DMA_CERR_0 + register TOD_SCRP0 { - name "DMA CERR 0"; - scomaddr 0x02011057; - capture group default; + name "TOD Sec Port 0 Ctrl"; + scomaddr 0x00040003; + capture group TODReg; }; - 
register DMA_CERR_1 + register TOD_SCRP1 { - name "DMA CERR 1"; - scomaddr 0x02011058; - capture group default; - }; - - register PB_CENT_CR_ERROR - { - name "PB CENT CR ERROR"; - scomaddr 0x05011c2c; - capture group default; - }; - - register PBA_ERR_REPORT_0 - { - name "PBA ERROR REPORT 0"; - scomaddr 0x0501284c; - capture group default; - }; - - register PBA_ERR_REPORT_1 - { - name "PBA ERROR REPORT 1"; - scomaddr 0x0501284d; - capture group default; - }; - - register PBA_ERR_REPORT_2 - { - name "PBA ERROR REPORT 2"; - scomaddr 0x0501284e; - capture group default; - }; - - register PB_PTY_ERR_REPORT - { - name "PB PTY ERROR REPORT"; - scomaddr 0x05012C22; - capture group default; + name "TOD Sec Port 1 Ctrl"; + scomaddr 0x00040004; + capture group TODReg; }; register TOD_SLAVE_PATH_CTRL { name "TOD SLAVE PATH CTRL"; scomaddr 0x00040005; - capture group default; + capture group TODReg; }; register TOD_INTERNAL_PATH_CTRL { name "TOD INTERNAL PATH CTRL"; scomaddr 0x00040006; - capture group default; + capture group TODReg; }; register TOD_CONFIG_CTRL { name "TOD Prim Sec Config Control"; scomaddr 0x00040007; - capture group default; + capture group TODReg; }; - register TOD_PSS_MSS_STATUS + register TOD_STATUSREGISTER { name "TOD PSS MSS Status Reg"; scomaddr 0x00040008; - capture group default; + capture group TODReg; }; register TOD_MASTER_PATH_STATUS { name "TOD Master Path Status Reg"; scomaddr 0x00040009; - capture group default; + capture group TODReg; + }; + + register TOD_SPSR + { + name "TOD S PATH STATUS REG"; + scomaddr 0x0004000A; + capture group TODReg; + }; + + register TOD_CCR + { + name "TOD CHIP CTRL REG"; + scomaddr 0x00040010; + capture group TODReg; }; register TOD_MASTER_PATH0_STEP_STEERING { name "TOD Master Path0 Step Steering"; scomaddr 0x0004000E; - capture group default; + capture group TODReg; }; register TOD_MASTER_PATH1_STEP_STEERING { name "TOD Master Path1 Step Steering"; scomaddr 0x0004000F; - capture group default; + capture group 
TODReg; }; register TOD_TRACE_DATASET_1 { name "TOD Trace Dataset 1"; scomaddr 0x0004001D; - capture group default; + capture group TODReg; }; register TOD_TRACE_DATASET_2 { name "TOD Trace Dataset 2"; scomaddr 0x0004001E; - capture group default; + capture group TODReg; }; register TOD_TRACE_DATASET_3 { name "TOD Trace Dataset 3"; scomaddr 0x0004001F; - capture group default; + capture group TODReg; }; register OSC_ERROR_HOLD { name "OSC ERROR HOLD"; scomaddr 0x01020019; - capture group default; + capture group TODReg; }; register OSC_ERROR_MASK { name "OSC ERROR MASK"; scomaddr 0x0102001A; - capture group default; + capture group TODReg; }; register OSC_ERROR_MODE { name "OSC ERROR MODE"; scomaddr 0x0102001B; - capture group default; + capture group TODReg; }; register TOD_FSM_REGISTER { name "TOD FSM Register"; scomaddr 0x00040024; - capture group default; + capture group TODReg; }; register TOD_TX_TTYPE_CTRL_REG { name "TOD TX TType Ctrl reg"; scomaddr 0x00040027; - capture group default; + capture group TODReg; }; register TOD_RX_TTYPE_CTRL_REG { name "TOD RX TType Ctrl reg"; scomaddr 0x00040029; - capture group default; + capture group TODReg; }; - register TOD_ERROR_INTERRUPTS + register TOD_ERRORREGISTER { name "TOD Error and Interrupts"; scomaddr 0x00040030; - capture group default; + capture group TODReg; + reset (^, 0x40030); }; - register TOD_CERR_REPORT + register TOD_ERRORMASK { name "TOD CERR Report"; scomaddr 0x00040032; - capture group default; + capture group TODReg; }; - register TOD_ROUTE_ERRORS_TO_CORE + register TOD_ERRORACTION { name "TOD Route Errors to Core"; scomaddr 0x00040033; + capture group TODReg; + }; + + ############################################################################ + # P9 PROC target HDCT additions (open power chkstop analysis) + ############################################################################ + + register OCC_ERROR_REPORT_REG + { + name "OCC ERROR REPORT REG"; + scomaddr 0x0101080a; + capture group 
default; + }; + + register PB_ERROR_REPORT + { + name "PB ERROR REPORT REG"; + scomaddr 0x020110a1; + capture group default; + }; + + register PB_PTY_ERROR_REPORT + { + name "PB PTY ERROR REPORT REG"; + scomaddr 0x020110a2; + capture group default; + }; + + register DMA_CERR_0 + { + name "DMA CERR 0"; + scomaddr 0x02011057; + capture group default; + }; + + register DMA_CERR_1 + { + name "DMA CERR 1"; + scomaddr 0x02011058; + capture group default; + }; + + register PB_CENT_CR_ERROR + { + name "PB CENT CR ERROR"; + scomaddr 0x05011c2c; + capture group default; + }; + + register PBA_ERR_REPORT_0 + { + name "PBA ERROR REPORT 0"; + scomaddr 0x0501284c; + capture group default; + }; + + register PBA_ERR_REPORT_1 + { + name "PBA ERROR REPORT 1"; + scomaddr 0x0501284d; + capture group default; + }; + + register PBA_ERR_REPORT_2 + { + name "PBA ERROR REPORT 2"; + scomaddr 0x0501284e; + capture group default; + }; + + register PB_PTY_ERR_REPORT + { + name "PB PTY ERROR REPORT"; + scomaddr 0x05012C22; capture group default; }; diff --git a/src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C b/src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C new file mode 100644 index 000000000..76468e6a3 --- /dev/null +++ b/src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C @@ -0,0 +1,1106 @@ +/* IBM_PROLOG_BEGIN_TAG */ +/* This is an automatically generated prolog. */ +/* */ +/* $Source: src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C $ */ +/* */ +/* OpenPOWER HostBoot Project */ +/* */ +/* Contributors Listed Below - COPYRIGHT 2018 */ +/* [+] International Business Machines Corp. */ +/* */ +/* */ +/* Licensed under the Apache License, Version 2.0 (the "License"); */ +/* you may not use this file except in compliance with the License. 
*/ +/* You may obtain a copy of the License at */ +/* */ +/* http://www.apache.org/licenses/LICENSE-2.0 */ +/* */ +/* Unless required by applicable law or agreed to in writing, software */ +/* distributed under the License is distributed on an "AS IS" BASIS, */ +/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */ +/* implied. See the License for the specific language governing */ +/* permissions and limitations under the License. */ +/* */ +/* IBM_PROLOG_END_TAG */ +/**@file prdfP9TodPlugins.C + * @brief defines all the TOD error plugins + */ + +#include <prdfPluginDef.H> +#include <prdfPluginMap.H> +#include <prdfExtensibleChip.H> +#include <iipSystem.h> +#include <prdfP9ProcDomain.H> +#include <prdfGlobal_common.H> +#include <iipServiceDataCollector.h> +#include <prdfRegisterCache.H> +#include <UtilHash.H> +#include <algorithm> +#include <prdfPlatProcConst.H> + +using namespace TARGETING; + +namespace PRDF +{ + +using namespace PlatServices; +using namespace TOD; + +/** @struct TodFaultData + * TOD Fault isolation information from a chip. 
+ */ +struct TodFaultData +{ + TargetHandle_t chipReportingError; // target reporting tod error + bool phypDetectedFault; // phyp detected a TOD fault on this chip + // (on either topology) + bool isActiveMdmt; // Chip is MDMT on active topology + bool isBackupMdmt; // MDMT on backup topology + bool faultDetected[2]; // index 0 for fault on active topo, 1 for backup + bool isMdmtAndFaulty[2];// chip is MDMT and has a fault on same topo + bool activeTopologyIsPrimary; //topology selected as active + TargetHandle_t chipSourcingClk[2];//if not MDMT, which chip is tod clk src + uint32_t activeMasterPathPosition[2]; // Clock position providing the TOD + // clock source to an MDMT + + /** + *@brief Constructor + */ + explicit TodFaultData( TargetHandle_t i_procTgt ): + chipReportingError( i_procTgt ), + phypDetectedFault( false ), + isActiveMdmt( false ), + isBackupMdmt( false ) + { + faultDetected[0] = false; + faultDetected[1] = false; + isMdmtAndFaulty[0] = false; + isMdmtAndFaulty[1] = false; + activeTopologyIsPrimary = false; + chipSourcingClk[0] = NULL; + chipSourcingClk[1] = NULL; + activeMasterPathPosition[0] = 0; + activeMasterPathPosition[1] = 0; + } +}; + +/** @struct TopologySwitchDetails + * System TOD failover status + */ +struct TopologySwitchDetails +{ + bool masterPathHwFailOver; // hw failover status of master path + bool phypSwitchedTopology; // topology switch status by Phyp + + /** + * @brief Constructor + */ + TopologySwitchDetails(): + masterPathHwFailOver( false ), + phypSwitchedTopology( false ) + {} +}; + +namespace Proc +{ +/** + * @brief Captures all the tod registers of all functional Proc chips. + * @param i_stepcode The step code data struct + * @return SUCCESS.
+ */ +int32_t todCaptureRegisters( STEP_CODE_DATA_STRUCT & i_stepcode ) +{ + ProcDomain * l_procDomain = + (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN ); + + for( size_t i = 0; i < l_procDomain->GetSize(); i++ ) + { + RuleChip * l_chip = l_procDomain->LookUp( i ); + l_chip->CaptureErrorData( i_stepcode.service_data->GetCaptureData(), + Util::hashString( "TODReg" ) ); + } + return SUCCESS; +} + +/** + * @brief Clears Tod errors register and Tod error bits in TP_LFIR + * @param i_stepcode The step code data struct + * @return Non-SUCCESS if an internal function fails, SUCCESS otherwise. + */ +int32_t todCleanUpErrors( STEP_CODE_DATA_STRUCT & i_stepcode ) +{ + #define PRDF_FUNC "[Proc::todCleanUpErrors] " + + uint32_t o_rc = SUCCESS; + +#ifdef __HOSTBOOT_RUNTIME + ProcDomain * l_procDomain = + (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN ); + + for( size_t i = 0; i < l_procDomain->GetSize(); i++ ) + { + int32_t l_rc = SUCCESS; + RuleChip * l_procChip = l_procDomain->LookUp( i ); + + // Clear bits 14,15,16,17,21,39 in TOD Error Register + // Bits in this register are cleared by writing 1 + SCAN_COMM_REGISTER_CLASS * l_todError = + l_procChip->getRegister( "TOD_ERRORREGISTER" ); + + l_rc = l_todError->Read(); + if ( SUCCESS != l_rc ) + { + PRDF_ERR( PRDF_FUNC"Read() failed on TOD_ERRORREGISTER: " + "proc=0x%08x", l_procChip->GetId() ); + + // Continue to try clearing the other chips + o_rc = FAIL; + continue; + } + + uint64_t l_val = l_todError->GetBitFieldJustified( 0, 64 ); + l_val = l_val & 0x0003C40001000000ull; // bits 14,15,16,17,21,39 + + if ( 0 != l_val ) + { + l_todError->SetBitFieldJustified( 0, 64, l_val ); + l_rc = l_todError->Write(); + + if ( SUCCESS != l_rc ) + { + PRDF_ERR( PRDF_FUNC"Write() failed on TOD_ERRORREGISTER: " + "proc=0x%08x", l_procChip->GetId() ); + o_rc = FAIL; + continue; + } + } + + + // Next read shall cause Force Read + RegDataCache & regCache = RegDataCache::getCachedRegisters(); + regCache.flush( l_procChip, l_todError ); + + // 
Clear bits 18 and 20 in TPLFIR + SCAN_COMM_REGISTER_CLASS * l_andTpFir = + l_procChip->getRegister( "TP_LFIR_AND" ); + + l_andTpFir->setAllBits(); + l_andTpFir->ClearBit(18); + l_andTpFir->ClearBit(20); + + l_rc = l_andTpFir->Write(); + if ( SUCCESS != l_rc ) + { + PRDF_ERR( PRDF_FUNC"Write() failed on TP_LFIR_AND: " + "proc=0x%08x", l_procChip->GetId() ); + o_rc = FAIL; + continue; + } + } +#endif + return o_rc; + + #undef PRDF_FUNC +} + +/** + * @brief Investigates if there is a failover initiated by HW. + * @param i_chip chip reporting TOD errors + * @param io_faultData Tod fault info + * @param o_failoverStatus failover status (phypSwitchedTopology is read, + * masterPathHwFailOver is set) + */ +void checkForHwInitiatedFailover( ExtensibleChip * i_chip, + TodFaultData & io_faultData, + TopologySwitchDetails & o_failoverStatus ) +{ + #define PRDF_FUNC "[Proc::checkForHwInitiatedFailover] " + + // This function detects whether an MDMT chip has switched its master path + // due to a clock fault. In this case, PRD gets an attention due to a step + // check error in Master Path 0. The failover modifies bit 12 of the TOD + // status register. PRD finds that both active and backup topologies use the + // same master path (path 1). When PRD checks for faults on each topology + // we'll be looking at path 1 for both and find no faults there. So this + // function checks for the master path failover case and marks the MDMT + // chip at fault appropriately. + + do + { + if( false == io_faultData.isActiveMdmt || + false == io_faultData.isBackupMdmt ) + { + // don't consider slave procs for this check + break; + } + + // Is MDMT in a failover state. + if(( false == io_faultData.isMdmtAndFaulty[0] && + false == io_faultData.isMdmtAndFaulty[1] )) + + { + // Get TOD Error register.
+ SCAN_COMM_REGISTER_CLASS * l_todError = + i_chip->getRegister("TOD_ERRORREGISTER"); + + uint32_t l_oscPos = 1; + + if ( SUCCESS != l_todError->Read() ) + { + PRDF_ERR( PRDF_FUNC"Read() failed on TOD_ERRORREGISTER: " + "i_chip=0x%08x", i_chip->GetId() ); + break; + } + + if( l_todError->IsBitSet(14) ) + { + l_oscPos = 0; + } + + else if( !l_todError->IsBitSet(15)) + { + break; + } + + // We failed to capture a TOD error in master path. This implies + // a HW path failover has occurred. + o_failoverStatus.masterPathHwFailOver = true; + + uint32_t topPos = + ( true == o_failoverStatus.phypSwitchedTopology )? 1 : 0; + + io_faultData.faultDetected[topPos] = true; + io_faultData.isMdmtAndFaulty[topPos] = true; + io_faultData.activeMasterPathPosition[topPos] = l_oscPos; + + PRDF_TRAC( PRDF_FUNC "HW Initiated failover: MDMT 0x%08x " + "faulty, mpath pos: %d", i_chip->GetId(), + l_oscPos ); + } + + }while(0); + + #undef PRDF_FUNC +} + +/** + * @brief Analyzes the TOD error of a given proc + * @param i_chip chip reporting TOD errors + * @param o_faults list of Tod fault info + * @param i_stepcode The step code data struct + * @param io_failOverStatus topology failover status + * @return SUCCESS if the fault data was collected, non-SUCCESS if a + * register read fails. + */ +int32_t todCollectFaultDataChip( ExtensibleChip * i_chip, + std::vector<TodFaultData> & o_faults, + STEP_CODE_DATA_STRUCT & i_stepcode, + TopologySwitchDetails & io_failOverStatus ) +{ + #define PRDF_FUNC "[Proc::todCollectFaultDataChip] " + + TargetHandle_t l_chipTarget = i_chip->GetChipHandle(); + TodFaultData l_faultData ( l_chipTarget ); + + uint32_t l_rc = FAIL; + + do + { + // Check if PHYP reported TOD error + SCAN_COMM_REGISTER_CLASS * l_pTpLFir = i_chip->getRegister( "TP_LFIR" ); + + l_rc = l_pTpLFir->Read(); + if ( SUCCESS != l_rc ) + { + PRDF_ERR( PRDF_FUNC"Read() failed on TP_LFIR: i_chip=0x%08x", + i_chip->GetId() ); + break; + } + + l_faultData.phypDetectedFault = l_pTpLFir->IsBitSet(20); + + // Determine active topology.
+ SCAN_COMM_REGISTER_CLASS * l_todStatus = + i_chip->getRegister("TOD_STATUSREGISTER"); + + l_rc = l_todStatus->Read(); + if ( SUCCESS != l_rc ) + { + PRDF_ERR( PRDF_FUNC"Read() failed on TOD_STATUSREGISTER: " + "i_chip=0x%08x", i_chip->GetId() ); + break; + } + + //Reading TOD_STATUSREGISTER[0:2] + //0b000 means configuration chosen is Primary + //0b111 means configuration chosen is Secondary + + bool l_activeIsPrimary = + ( 0 == l_todStatus->GetBitFieldJustified( 0, 3 ) ); + l_faultData.activeTopologyIsPrimary = l_activeIsPrimary; + + // Get TOD Error register. + SCAN_COMM_REGISTER_CLASS * l_todError = + i_chip->getRegister("TOD_ERRORREGISTER"); + + l_rc = l_todError->Read(); + if ( SUCCESS != l_rc ) + { + PRDF_ERR( PRDF_FUNC"Read() failed on TOD_ERRORREGISTER: " + "i_chip=0x%08x", i_chip->GetId() ); + break; + } + + // Check both topologies, active first. + for ( int i = 0; i < 2; i++ ) + { + // Each chip has 2 TOD topologies configured (primary and secondary) + // One of these is selected as active topology and one as backup + // In TodFaultData, index 0 is used for the active topology, and 1 + // for the backup. We also need to know whether we are looking at the + // primary or secondary topology, because that will determine + // the bit positions we use in the TOD registers. + // So within this for loop, index 0/1 refers to active/backup + // l_topIsPri identifies whether the current topo was configured + // in the primary or secondary position. + + bool l_topIsPri = + ( ( 0 == i ) ? l_activeIsPrimary : !l_activeIsPrimary ); + + bool l_masterTodSelected = false ; + bool l_masterDrawerSelected = false; + + // Check if MDMT on current topology. + l_masterTodSelected = + l_todStatus->IsBitSet( l_topIsPri ? 13 : 17 ); + l_masterDrawerSelected = + l_todStatus->IsBitSet( l_topIsPri ? 14 : 18 ); + + // Check master OSC status if MDMT + if ( ( l_masterTodSelected ) && ( l_masterDrawerSelected ) ) + { + // Determine which OSC card is used.
+ bool l_osc0; //means master path 0 + bool l_oscFail; + + l_faultData.isActiveMdmt = l_todStatus->IsBitSet(23); + l_faultData.isBackupMdmt = l_todStatus->IsBitSet(24); + + l_osc0 = !l_todStatus->IsBitSet( l_topIsPri ? 12 : 16 ); + l_faultData.activeMasterPathPosition[i] = l_osc0 ? 0 : 1; + + // Read step check error bit in TOD error register + l_oscFail = l_todError->IsBitSet( l_osc0 ? 14 : 15 ); + + if ( l_oscFail ) + { + // Set fault data. + l_faultData.faultDetected[i] = true; + l_faultData.isMdmtAndFaulty[i] = true; + + PRDF_TRAC(PRDF_FUNC " MDMT: 0x%08x at Error, M-Path: %d, " + "topology: %c", + i_chip->GetId(), l_osc0 ? 0 : 1, + i == 0 ?'A':'B' ); + } + + }//if mdmt + + else // Is not MDMT on this topology. + { + // Determine whether slave chip is using Primary configuration + // slave path (slave path 0) or secondary configuration slave + // path (slave path 1) + bool l_slv0 = !l_todStatus->IsBitSet( l_topIsPri ? 15 : 19 ); + + // Check if TOD slave path has any step check error. + // bit 16 and 21 of TOD_ERRORREGISTER indicate if there is any + // TOD Error in slave path. + + bool l_slvErr = l_todError->IsBitSet( l_slv0 ? 16 : 21 ); + + // If there is Step Check Error, we must determine proc sourcing + // clock to the chip reporting step check error. We do this by + // reading PCRP0 for primary configuration and SCRP1 for + // secondary configuration to determine which bus is being used + // to transmit tod clock. We can use that to get the peer proc + // at the other end of the bus. + + if ( l_slvErr ) + { + uint32_t l_connection = 0; + TargetHandle_t l_procClockSrc = NULL; + + uint32_t l_ret = FAIL; +#ifdef __HOSTBOOT_RUNTIME + l_ret = getTodPortControlReg( l_chipTarget, l_slv0, + l_connection ); +#endif + if( SUCCESS != l_ret ) continue; + + // The connection value is in bits 0:2.
The scomdef doesn't + // define this very well: + // X0_PORT_0=>0b000 + // X1_PORT_0=>0b001 + // X2_PORT_0=>0b010 + // X3_PORT_0=>0b011 + // X4_PORT_0=>0b100 + // X5_PORT_0=>0b101 + // X6_PORT_0=>0b110 + // X7_PORT_0=>0b111 + // I've been told the actual definition is 0-2 for XBUS0-2 + // 3-6 for OBUS0-3, port 7 unused. + + l_connection >>= 29; + if ( l_connection > 6 ) + { + PRDF_ERR( PRDF_FUNC"Configuration error for 0x%08x " + "connection 0x%08x", getHuid(l_chipTarget), + l_connection ); + continue; + } + else + { + TYPE l_busType = TYPE_XBUS; + if ( l_connection > 2 ) + { + l_busType = TYPE_OBUS; + l_connection -= 3; + } + + l_procClockSrc = getConnectedPeerProc( l_chipTarget, + l_busType, + l_connection ); + } + + if( NULL == l_procClockSrc ) + { + l_procClockSrc = l_chipTarget; + } + + // Set fault data. + l_faultData.faultDetected[i] = true; + l_faultData.chipSourcingClk[i] = l_procClockSrc; + + PRDF_TRAC( PRDF_FUNC " Slave 0x%08x at Error S-Path %d," + "topology %c, clk source is 0x%08x", + i_chip->GetId(), l_slv0 ? 0:1, + i == 0 ? 'A':'B', + getHuid( l_procClockSrc ) ); + + } // error in slave + }//else not mdmt + }//for topology + + checkForHwInitiatedFailover( i_chip, l_faultData, io_failOverStatus ); + + // Check for an internal path error in active topology + uint32_t topPos = io_failOverStatus.phypSwitchedTopology ? 
1 : 0; + if ( !l_faultData.faultDetected[topPos] && l_todError->IsBitSet(17) ) + { + l_faultData.faultDetected[topPos] = true; + l_faultData.chipSourcingClk[topPos] = l_chipTarget; + } + + o_faults.push_back( l_faultData ); + + l_rc = SUCCESS; + + } while(0); + + return l_rc; + + #undef PRDF_FUNC +} + +/** + * @brief Collects TOD fault error info for all procs in the system + * @param i_chip chip reporting TOD errors + * @param i_stepcode The step code data struct + * @param io_FailoverStatus hw initiated failover status + */ +void todCollectFaultDataSys( std::vector<TodFaultData> & o_faults, + STEP_CODE_DATA_STRUCT & i_stepcode, + TopologySwitchDetails & io_FailoverStatus ) +{ + ProcDomain * l_procDomain = + (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN ); + + for( size_t i = 0; i < l_procDomain->GetSize(); i++ ) + { + RuleChip * l_chip = l_procDomain->LookUp( i ); + uint32_t l_rc = todCollectFaultDataChip( l_chip, o_faults, + i_stepcode, + io_FailoverStatus ); + if( SUCCESS != l_rc ) + { + PRDF_ERR("[todCollectFaultDataSys] Failed to analyze tod errors in" + "chip 0x%08x",l_chip->GetId() ); + } + + } +} + +/** + * @brief Determines if Phyp switched the topology. + * @return o_topologySwitch topology switch status + */ +bool checkPhypSwitchedTopology( ) +{ + #define PRDF_FUNC "[checkPhypSwitchedTopology] " + + bool o_topologySwitch = false; + + ProcDomain * l_procDomain = + (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN ); + + for( size_t i = 0; i < l_procDomain->GetSize(); i++ ) + { + RuleChip * l_chip = l_procDomain->LookUp( i ); + // Get TOD Error register. 
+ SCAN_COMM_REGISTER_CLASS * l_todError = + l_chip->getRegister("TOD_ERRORREGISTER"); + + if( SUCCESS != l_todError->Read() ) + { + PRDF_ERR( PRDF_FUNC"Read failed for tod error " + "register on 0x%08x", l_chip->GetId() ); + break; + } + + o_topologySwitch = l_todError->IsBitSet(39); + + if( true == o_topologySwitch ) + { + break; + } + } + + return o_topologySwitch; + #undef PRDF_FUNC +} + +/** + * @brief Collects FFDC associated with step errors. + * @param io_todErrorData contains fault status and data for all chips. + * @param i_failOverstatus contains master path and topology failover data. + * @param o_errorSummary contains FFDC associated with step errors. + */ +void collectTodErrorFfdc( std::vector<TodFaultData> & io_todErrorData, + TopologySwitchDetails i_failOverstatus, + TodErrorSummary & o_errorSummary ) +{ + std::vector<TodFaultData> faultyChip; + memset( &o_errorSummary, 0x00, sizeof(TodErrorSummary) ); + + for ( auto & i : io_todErrorData ) + { + if ( i.phypDetectedFault ) + { + o_errorSummary.phypDetectedTodError = 1; + } + + if( i.isActiveMdmt ) + { + o_errorSummary.activeMdmt = getHuid( i.chipReportingError ); + o_errorSummary.activeTopology = + i.activeTopologyIsPrimary ? 1 : 0; + // master path position selected for active MDMT + o_errorSummary.activeTopologyMastPath = + i.activeMasterPathPosition[0]; + } + + if( i.isBackupMdmt ) + { + o_errorSummary.backUpMdmt = getHuid( i.chipReportingError ); + // master path position selected for backup MDMT + o_errorSummary.backUpTopologyMastPath = + i.activeMasterPathPosition[1]; + } + + // Add to list if some error is detected. + if ( i.phypDetectedFault || i.faultDetected[0] || + i.faultDetected[1] ) + { + faultyChip.push_back( i ); + } + } + o_errorSummary.topologySwitchByPhyp = + i_failOverstatus.phypSwitchedTopology ? 1 :0 ; + + o_errorSummary.hardwareSwitchFlip = + i_failOverstatus.masterPathHwFailOver ? 
1 : 0; + o_errorSummary.reserved = 0; + + io_todErrorData.empty(); + io_todErrorData = faultyChip; +} + +/** + * @brief Adds FFDC associated with step error as Capture data. + * @param i_stepcode Step Code Data Struct. + * @param i_chip Chip reporting TOD step error. + * @param i_errorSummary contains FFDC associated with step error. + */ +void addFfdcToCaptureData( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & i_stepcode, + TodErrorSummary & i_errorSummary ) +{ + size_t sz_w = sizeof(CPU_WORD); + size_t sz_t = + ((sizeof(TodErrorSummary) + sz_w - 1) / sz_w ) * sz_w; + uint8_t errorDataBuff[sz_t]; + memset( &errorDataBuff, 0x00, sz_t ); + memcpy( &errorDataBuff, &i_errorSummary, sizeof(TodErrorSummary) ); + + #if( __BYTE_ORDER == __LITTLE_ENDIAN ) + + for( uint32_t i = 0; i < sz_t / sz_w; i++ ) + { + ((CPU_WORD *)errorDataBuff)[i] = + htonl(( (CPU_WORD *) errorDataBuff)[i]); + } + + #endif + + BitString bs( sz_t * 8, (CPU_WORD *) & errorDataBuff ); + + CaptureData & cd = i_stepcode.service_data->GetCaptureData(); + cd.Add( i_chip->GetChipHandle(), Util::hashString("TOD_ERROR_DATA"), bs ); +} + +/** + * @brief Analyzes the step check error of all procs in the system + * @param i_chip chip reporting TOD errors + * @param i_stepcode The step code data struct + * @return SUCCESS. + */ +int32_t todStepCheckFault( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & i_stepcode ) +{ + #define PRDF_FUNC "[Proc::todStepCheckFault] " + + // When we analyze a step check fault, we will look at all chips in the + // system--both topologies. After we've collected TOD fault data on each + // chip, we will categorize the failure as: + // - MDMT Clock problem + // - Internal path error + // - Connection error between chips + // In case of connection error,we try to minimize the list of chips to the + // list of most probable chips causing TOD errors. Once all the chips at + // fault are isolated, hwsv is requested to create a new back up topology. 
+ + // Collect TOD registers for FFDC. + todCaptureRegisters( i_stepcode ); + + // Collect TOD fault data. + std::vector<TodFaultData> l_faultData; + + // List of chips for HWSV to avoid when constructing a new backup topo + std::vector< TargetHandle_t > l_chipBlackList; + + // Osc for HWSV to avoid when constructing a new backup topology + // Since HB doesn't model osc targets, we need a proc and Osc position + TargetHandle_t procOscTgtBl = nullptr; // Proc target assoc with bad Osc + uint32_t oscPosBl = 0xFFFFFFFF; // Osc position relative to proc + + TopologySwitchDetails failOverstatus; + failOverstatus.phypSwitchedTopology = checkPhypSwitchedTopology( ); + todCollectFaultDataSys( l_faultData, i_stepcode, failOverstatus ); + TodErrorSummary todErrorFfdc; + collectTodErrorFfdc( l_faultData, failOverstatus, todErrorFfdc ); + + bool l_phypError = false; + TargetHandle_t mdmtList[2] = {NULL, NULL }; + uint8_t mdmtFailedOscPos[2] = {0xFF, 0xFF}; + uint8_t analysisSummary[2] = { NO_TOD_ERROR, NO_TOD_ERROR }; + bool l_allInternal = true; + bool l_foundFault = false; + + // Find MDMT chips at fault + for ( std::vector<TodFaultData>::iterator i = l_faultData.begin(); + i != l_faultData.end(); i++ ) + { + if ( i->phypDetectedFault ) + { + l_phypError = true; + } + + for ( int t = 0; t < 2; t++ ) + { + if( i->isMdmtAndFaulty[t] ) + { + mdmtList[t] = i->chipReportingError; + mdmtFailedOscPos[t] = i->activeMasterPathPosition[t]; + } + } + } + + if ( l_phypError ) + { + i_stepcode.service_data->SetThresholdMaskId(0); + } + + // Look at both topologies. + for ( int i = 0; i < 2; i++ ) + { + // Classifications of topology errors: + // 1) MDMT clock problem - callout clock or MDMT. + // 2) Internals only - callout chips. + // 3) Network error - clear internals, and isolate. + + // MDMT analysis + + if( NULL != mdmtList[i] ) + { + // HW initiated failover. Callout the failed OSC. 
+ if ( failOverstatus.masterPathHwFailOver ) + { + i_stepcode.service_data->SetThresholdMaskId(0); + } + // Add Osc to blacklist + procOscTgtBl = mdmtList[i]; + oscPosBl = mdmtFailedOscPos[i]; + + // Add Proc to blacklist + l_chipBlackList.push_back( mdmtList[i] ); + + // Callout and gard TOD OSC +#ifdef __HOSTBOOT_MODULE + errlHndl_t errl = + ServiceGeneratorClass::ThisServiceGenerator().getErrl(); + if ( NULL == errl ) + { + PRDF_ERR( PRDF_FUNC "Failed to get the global error log" ); + break; + } + errl->addClockCallout( mdmtList[i], HWAS::TODCLK_TYPE, + HWAS::SRCI_PRIORITY_HIGH, + HWAS::DECONFIG, + HWAS::GARD_Predictive ); +#else + TargetHandle_t l_clockTarget = nullptr; + l_clockTarget = getConnectedChild( procOscTgtBl, + TYPE_TODCLK, + oscPosBl ); + if (l_clockTarget) + i_stepcode.service_data->SetCallout( l_clockTarget, MRU_HIGH ); +#endif + // Callout MDMT chip + i_stepcode.service_data->SetCallout(mdmtList[i], MRU_MEDA ); + + //callout a symbolic FRU to replace FRU/interfaces between Proc and + //TOD OSC card + i_stepcode.service_data->SetCallout( TOD_CLOCK_ERR, MRU_MED, + NO_GARD ); + analysisSummary[i] = MASTER_PATH_ERROR; + + // We have analyzed this topology to an MDMT fault, move on to the + // backup topology + continue; + } + + // Collect some information for further classification + for ( std::vector<TodFaultData>::iterator j = l_faultData.begin(); + j != l_faultData.end(); j++ ) + { + // If fault on topology. + if ( j->faultDetected[i] ) + { + l_foundFault = true; + + // Check if non-internal fault. + if( j->chipSourcingClk[i] != j->chipReportingError ) + { + // ignore internal path errors during hw failover. + l_allInternal = false; + } + } + } + + // Skip analysis if this topology has nothing. + if ( !l_foundFault ) + { + continue; + } + + if ( l_allInternal ) // Internal callouts. 
+ { + + for ( std::vector<TodFaultData>::iterator j = l_faultData.begin(); + j != l_faultData.end(); j++ ) + { + if ( j->chipSourcingClk[i] == j->chipReportingError ) + { + + if ( NULL != j->chipReportingError ) + { + // update consolidated callout list and + //black list for internal path errors + i_stepcode.service_data->SetCallout( + j->chipReportingError,MRU_MED ); + l_chipBlackList.push_back( j->chipReportingError ); + } + } + } + + analysisSummary[i] = INTERNAL_PATH_ERROR; + } + else // Network callout. + { + // Clear all internal reports and get chips. + for ( std::vector<TodFaultData>::iterator j = l_faultData.begin(); + j != l_faultData.end(); j++ ) + { + if ( j->chipSourcingClk[i] == j->chipReportingError ) + { + j->faultDetected[i] = false; + } + } + + TargetHandleList l_rootList; + std::vector<TodFaultData>::iterator itSrc; + + for( itSrc = l_faultData.begin(); itSrc != l_faultData.end(); + itSrc++ ) + { + std::vector<TodFaultData>::iterator itReport; + bool l_badSrc = false; + + if( !itSrc->faultDetected[i] ) + continue; + + for( itReport = l_faultData.begin(); + itReport != l_faultData.end(); + itReport++ ) + { + // If proc A is getting its tod clock from proc B and both + // are reporting step check errors, we callout only B. + if( itSrc->chipSourcingClk[i] == + itReport->chipReportingError ) + { + if ( true == itReport->faultDetected[i] ) + { + l_badSrc = true; + l_rootList.push_back(itReport->chipReportingError); + + PRDF_TRAC( PRDF_FUNC "Network callout adding clk" + "source chip 0x%08x topology %c", + getHuid(itReport->chipReportingError ), + i == 0 ? 'A':'B' ); + } + break; + } + } + + if( !l_badSrc ) + { + l_rootList.push_back( itSrc->chipReportingError ); + PRDF_TRAC( PRDF_FUNC "Network callout adding chip 0x%08x " + "i = %c", getHuid( itSrc->chipReportingError ), + i == 0 ? 'A':'B' ); + } + } + + // Sort, remove unique. 
+ std::sort( l_rootList.begin(), l_rootList.end() ); + std::vector<TargetHandle_t>::iterator itChip; + itChip = std::unique(l_rootList.begin(), l_rootList.end()); + l_rootList.erase( itChip,l_rootList.end() ); + + //Calling out the final list of chips reporting connection + //problem in TOD network. + for ( auto &failedChip : l_rootList ) + { + // update the consolidated callout list and + // black list for hwsv + i_stepcode.service_data->SetCallout( failedChip, MRU_MED ); + l_chipBlackList.push_back( failedChip ); + } //for l_rootList + + analysisSummary[i] = SLAVE_PATH_NETWORK_ERROR; + + }// else network error + + }//for topology + + std::sort( l_chipBlackList.begin(), l_chipBlackList.end() ); + std::vector<TargetHandle_t>::iterator itBlackList; + itBlackList = std::unique( l_chipBlackList.begin(), l_chipBlackList.end()); + l_chipBlackList.erase( itBlackList, l_chipBlackList.end() ); + + // Now we call HWSV to create a new backup topology. The chips in the black + // list will not be selected as the new MDMT. +#ifdef __HOSTBOOT_RUNTIME + todErrorFfdc.topologyResetRequested = 0; + if ( i_stepcode.service_data->IsAtThreshold() ) + { + requestNewTODTopology( oscPosBl, procOscTgtBl, + l_chipBlackList, !l_phypError ); + todErrorFfdc.topologyResetRequested = 1; + } +#endif + + // If we never made a callout, call out this chip. + if ( 0 == i_stepcode.service_data->getMruListSize() ) + { + i_stepcode.service_data->SetCallout( i_chip->GetChipHandle() ); + analysisSummary[0] = UNKNOWN_TOD_ERROR; + analysisSummary[1] = UNKNOWN_TOD_ERROR; + } + + // Clean up all TOD error reports. 
+ if ( SUCCESS != todCleanUpErrors( i_stepcode ) ) + { + PRDF_ERR(PRDF_FUNC "Failed to clear TOD Errors of the" + "System" ); + } + + for( auto &blChip : l_chipBlackList ) + { + PRDF_TRAC( PRDF_FUNC"black list chip HUID: 0x%08x ", + getHuid( blChip ) ); + } + + if (procOscTgtBl) + { + PRDF_TRAC( PRDF_FUNC "black list osc chip HUID 0x%08x Pos %d", + getHuid(procOscTgtBl), oscPosBl ); + } + + // At last, add FFDC as capture data to error log + todErrorFfdc.activeTopologySummary = analysisSummary[0]; + todErrorFfdc.backUpTopologySummary = analysisSummary[1]; + addFfdcToCaptureData( i_chip, i_stepcode, todErrorFfdc ); + + return SUCCESS; + + #undef PRDF_FUNC +} +PRDF_PLUGIN_DEFINE_NS( p9_nimbus, Proc, todStepCheckFault ); +PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, todStepCheckFault ); + +/** + * @brief Request for creation of a new back up topology. + * @param i_chip chip reporting TOD errors + * @param i_stepcode The step code data struct + * @return SUCCESS. + */ +int32_t todNewTopologyIfBackupMDMT( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & i_stepcode ) +{ +#ifdef __HOSTBOOT_RUNTIME + do + { + SCAN_COMM_REGISTER_CLASS * l_todStatus = + i_chip->getRegister( "TOD_STATUSREGISTER" ); + + if( SUCCESS != l_todStatus->Read( ) ) + { + PRDF_ERR("[todNewTopologyIfBackupMDMT] Failed to read TOD status" + "register, address 0x%16llx of proc 0x%08x ", + l_todStatus->GetAddress(),i_chip->GetId() ); + break; + } + + bool primaryIsActive = !( 0 == l_todStatus->GetBitFieldJustified( 0,3 ) ); + + /* Check this chips role + * Topology - 1 + * + * TOD_STATUS[13] TOD_STATUS[14] Inference + * 1 1 Mster TOD Master Drawer + * 0 1 Slave TOD Master Drawer + * 0 0 Slave TOD Slave Drawer + * 1 0 Master TOD Slave Drawer + + * Topology - 2 + * TOD_STATUS[17] TOD_STATUS[18] Inference + * + * Truth Table is same as above + */ + + // Check for MDMT status. + bool l_masterTodSelect; + bool l_masterDrawerSelect; + l_masterTodSelect = l_todStatus->IsBitSet( + 13 + ( primaryIsActive ? 
0 : 4 ) ); + l_masterDrawerSelect = l_todStatus->IsBitSet( + 14 + ( primaryIsActive ? 0 : 4 ) ); + + // If this is the MDMT then request a new topology. + if( ( l_masterTodSelect ) && ( l_masterDrawerSelect ) ) + { + TargetHandleList badChipList; + badChipList.push_back( i_chip->GetChipHandle() ); + requestNewTODTopology( 0xFFFFFFFF, nullptr, badChipList, false ); + } + + } while(0); +#endif + return SUCCESS; +} +PRDF_PLUGIN_DEFINE_NS( p9_nimbus, Proc, todNewTopologyIfBackupMDMT ); +PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, todNewTopologyIfBackupMDMT ); + + +/** + * @brief Requests for a toplogy switch in response to logic parity error. + * @param i_chip chip reporting TOD logic parity error. + * @param i_stepcode The step code data struct + * @return SUCCESS. + */ +int32_t requestTopologySwitch( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & i_stepcode ) +{ +#ifdef __HOSTBOOT_RUNTIME + if ( i_stepcode.service_data->IsAtThreshold() ) + { + // Reconfigure the TOD topology and let PHYP know when backup is good. + TargetHandleList badChipList; + badChipList.push_back( i_chip->GetChipHandle( ) ); + requestNewTODTopology( 0xFFFFFFFF, nullptr, badChipList, true ); + } +#endif + return SUCCESS; +} +PRDF_PLUGIN_DEFINE_NS( p9_nimbus, Proc, requestTopologySwitch ); +PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, requestTopologySwitch ); + +/** + * @brief Checks if TOD error analysis is disabled on platform. + * @param i_chip chip reporting TOD error. + * @param i_stepcode The step code data struct. + * @return SUCCESS if TOD analysis is disabled + */ +int32_t isTodDisabled( ExtensibleChip * i_chip, + STEP_CODE_DATA_STRUCT & i_stepcode ) +{ + int32_t o_rc = SUCCESS; + + if ( isHyprConfigOpal() ) + { + // On OPAL machine, mask TOD errors on first instance. There + // should not be any service action. 
+ i_stepcode.service_data->setFlag( ServiceDataCollector::AT_THRESHOLD ); + i_stepcode.service_data->clearServiceCall(); + o_rc = SUCCESS; // TOD fault analysis not supported + } + else if ( isHyprRunning() && isHyprConfigPhyp() && + !isMfgAvpEnabled() && !isMfgHdatAvpEnabled() ) + { + o_rc = FAIL; // TOD Fault analysis is supported + } + else + { + i_stepcode.service_data->SetCallout( LEVEL2_SUPPORT, MRU_MED, NO_GARD ); + i_stepcode.service_data->SetCallout( SP_CODE, MRU_MED, NO_GARD ); + o_rc = SUCCESS; // TOD fault analysis not supported + } + + return o_rc; +} +PRDF_PLUGIN_DEFINE_NS( p9_nimbus, Proc, isTodDisabled ); +PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, isTodDisabled ); + +} //namespace Proc ends + +} //namespace PRDF ends diff --git a/src/usr/diag/prdf/common/plat/p9/prdf_plat_p9.mk b/src/usr/diag/prdf/common/plat/p9/prdf_plat_p9.mk index e4d67b930..6b6cf11e1 100644 --- a/src/usr/diag/prdf/common/plat/p9/prdf_plat_p9.mk +++ b/src/usr/diag/prdf/common/plat/p9/prdf_plat_p9.mk @@ -53,3 +53,4 @@ prd_rule_plugin += prdfLaneRepair.o prd_rule_plugin += prdfP9Ex.o prd_rule_plugin += prdfP9Ec.o prd_rule_plugin += prdfP9Eq.o +prd_rule_plugin += prdfP9TodPlugins.o diff --git a/src/usr/diag/prdf/common/plugins/prdfLogParse_common.C b/src/usr/diag/prdf/common/plugins/prdfLogParse_common.C index 725b126cf..66a3339b0 100644 --- a/src/usr/diag/prdf/common/plugins/prdfLogParse_common.C +++ b/src/usr/diag/prdf/common/plugins/prdfLogParse_common.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2013,2017 */ +/* Contributors Listed Below - COPYRIGHT 2013,2018 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -478,15 +478,11 @@ bool parseCaptureData( void * i_buffer, uint32_t i_buflen, { parseTdCtlrStateData( sigData, sigDataSize, i_parser, sigId ); } -/*TODO: RTC 136050 - else if ( Util::hashString(SLW_FFDC_DATA::title) == sigId ) - { - parseSlwFfdcData( sigData, sigDataSize, i_parser ); - } else if ( Util::hashString("TOD_ERROR_DATA") == sigId) { parseTodFfdcData( sigData, sigDataSize, i_parser ); } +/* else if ( Util::hashString("OCC_CS_FFDC") == sigId) { parsePnorFirData( sigData, sigDataSize, i_parser ); diff --git a/src/usr/diag/prdf/common/plugins/prdfPlatProcConst.H b/src/usr/diag/prdf/common/plugins/prdfPlatProcConst.H index de8c6fa81..5b0b78836 100644 --- a/src/usr/diag/prdf/common/plugins/prdfPlatProcConst.H +++ b/src/usr/diag/prdf/common/plugins/prdfPlatProcConst.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2015 */ +/* Contributors Listed Below - COPYRIGHT 2015,2018 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -35,28 +35,6 @@ namespace PRDF namespace TOD { - -//NOTE: Position at which strings representing names of TOD control registers -//are placed in regStatList cannot be changed independently. Position of string -//in the array must match value of corresponding enum member e.g. value -//of MPCR is zero, therefore, TOD_MPCR can only be placed at regStatList[0] - -// following enum represents all the tod control and status registers which -// need to be read and restored during data parity errors. - -enum Register -{ - MPCR = 0, - PCRP0 = 1, - PCRP1 = 2, - SCRP0 = 3, - SCRP1 = 4, - SPCR = 5, - IPCR = 6, - PSMSCR = 7, - LAST_TOD_REG = 8, -}; - /** * @brief summarizes error analysis for a TOD topology. 
*/ @@ -77,12 +55,11 @@ struct TodErrorSummary { #if __BYTE_ORDER == __LITTLE_ENDIAN - uint32_t reserved :13; + uint32_t reserved :17; uint32_t backUpTopologyMastPath :2; //master path for backup topology uint32_t activeTopologyMastPath :2; // master path for active topology uint32_t backUpTopologySummary :3; // backup topology error status uint32_t activeTopologySummary :3; // active topology error status - uint32_t todOscCnt :4; // functional TOD OSC count in system uint32_t activeTopology :1; // Topology acting as Active uint32_t topologyResetRequested :1; // topology reset request status uint32_t topologySwitchByPhyp :1; // topology switch event detected @@ -96,12 +73,11 @@ struct TodErrorSummary uint32_t topologySwitchByPhyp :1; // topology switch event detected uint32_t topologyResetRequested :1; // topology reset request status uint32_t activeTopology :1; // Topology acting as Active - uint32_t todOscCnt :4; // functional TOD OSC count in system uint32_t activeTopologySummary :3; // active topology error status uint32_t backUpTopologySummary :3; // backup topology error status uint32_t activeTopologyMastPath :2; // master path for active topology uint32_t backUpTopologyMastPath :2; //master path for backup topology - uint32_t reserved :13; + uint32_t reserved :17; #endif uint32_t activeMdmt; // HUID of active mdmt diff --git a/src/usr/diag/prdf/common/plugins/prdfProcLogParse.C b/src/usr/diag/prdf/common/plugins/prdfProcLogParse.C index 461704ab9..d33792b77 100644 --- a/src/usr/diag/prdf/common/plugins/prdfProcLogParse.C +++ b/src/usr/diag/prdf/common/plugins/prdfProcLogParse.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2014,2017 */ +/* Contributors Listed Below - COPYRIGHT 2014,2018 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -51,44 +51,6 @@ using namespace PARSER; using namespace TOD; //------------------------------------------------------------------------------ - -bool parseSlwFfdcData( uint8_t * i_buffer, uint32_t i_buflen, - ErrlUsrParser & i_parser ) -{ - char hdr[HEADER_SIZE] = ""; - char data[DATA_SIZE] = ""; - - snprintf( hdr, HEADER_SIZE, " %s", SLW_FFDC_DATA::title ); - i_parser.PrintString( hdr, "" ); - - const size_t sz_word = sizeof(uint32_t); - - uint32_t idx = 0; - while ( idx + SLW_FFDC_DATA::ENTRY_SIZE < i_buflen ) - { - uint32_t addr, val0, val1; - - memcpy( &addr, &i_buffer[idx ], sz_word ); - memcpy( &val0, &i_buffer[idx+(1*sz_word)], sz_word ); - memcpy( &val1, &i_buffer[idx+(2*sz_word)], sz_word ); - - addr = htonl(addr); - val0 = htonl(val0); - val1 = htonl(val1); - - snprintf(hdr, HEADER_SIZE, " Address: 0x%08x", addr ); - snprintf(data, DATA_SIZE, "Value: 0x%08x 0x%08x", val0, val1 ); - - i_parser.PrintString( hdr, data ); - - idx += SLW_FFDC_DATA::ENTRY_SIZE; - } - - return true; -} - -//------------------------------------------------------------------------------ - bool parseTodFfdcData( uint8_t * i_buffer, uint32_t i_buflen, ErrlUsrParser & i_parser ) { @@ -131,9 +93,6 @@ bool parseTodFfdcData( uint8_t * i_buffer, uint32_t i_buflen, errorData.activeTopology ? "Primary Config" : "Secondary Config" ); - i_parser.PrintNumber( "Functional TOD Osc", "0x%08x", - errorData.todOscCnt ); - snprintf(data, DATA_SIZE, "0x%08x", errorData.activeMdmt ); i_parser.PrintString( "Active MDMT", data ); |