summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBenjamin Weisenbeck <bweisenb@us.ibm.com>2018-03-23 09:59:08 -0500
committerZane C. Shelley <zshelle@us.ibm.com>2018-04-20 10:32:50 -0400
commitf852aab7838c9402a32b09905851381e5b08f1ae (patch)
tree4086e52af5b9f95feb914f353d4f328c763ddbd4
parentc06a321964200ef77090b1b0e4e11ee903088508 (diff)
downloadtalos-hostboot-f852aab7838c9402a32b09905851381e5b08f1ae.tar.gz
talos-hostboot-f852aab7838c9402a32b09905851381e5b08f1ae.zip
PRD: TOD fault analysis
Change-Id: Iebc9e781756bb321f660fcbc1d20bbad4a1f4f61 RTC: 145750 CQ: SW423770 Backport: release-fips910 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/56673 Reviewed-by: Matt Derksen <mderkse1@us.ibm.com> Reviewed-by: Caleb N. Palmer <cnpalmer@us.ibm.com> Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com> Reviewed-by: Zane C. Shelley <zshelle@us.ibm.com> Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57501 CI-Ready: Zane C. Shelley <zshelle@us.ibm.com> Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com> Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com> Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
-rw-r--r--src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule6
-rw-r--r--src/usr/diag/prdf/common/plat/p9/p9_cumulus.rule8
-rw-r--r--src/usr/diag/prdf/common/plat/p9/p9_nimbus.rule8
-rw-r--r--src/usr/diag/prdf/common/plat/p9/p9_proc_common_actions.rule382
-rw-r--r--src/usr/diag/prdf/common/plat/p9/p9_proc_common_regs.rule211
-rw-r--r--src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C1106
-rw-r--r--src/usr/diag/prdf/common/plat/p9/prdf_plat_p9.mk1
-rw-r--r--src/usr/diag/prdf/common/plugins/prdfLogParse_common.C8
-rw-r--r--src/usr/diag/prdf/common/plugins/prdfPlatProcConst.H30
-rw-r--r--src/usr/diag/prdf/common/plugins/prdfProcLogParse.C43
-rw-r--r--src/usr/diag/prdf/plat/prdfPlatServices_rt.C71
-rw-r--r--src/usr/diag/prdf/plat/prdfPlatServices_rt.H24
-rw-r--r--src/usr/diag/prdf/prdf_hb_only.mk2
13 files changed, 1738 insertions, 162 deletions
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule b/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule
index 128fdf716..669d3e5b5 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_common_actions.rule
@@ -151,6 +151,12 @@ actionclass self_th_1
threshold1;
};
+actionclass self_H_th_1
+{
+ calloutSelfHigh;
+ threshold1;
+};
+
actionclass self_th_5perHour
{
calloutSelfMed;
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_cumulus.rule b/src/usr/diag/prdf/common/plat/p9/p9_cumulus.rule
index f8c36f52e..ea5139e97 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_cumulus.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_cumulus.rule
@@ -2132,17 +2132,17 @@ group gTP_LFIR filter singlebit, cs_root_cause
/** TP_LFIR[18]
* TOD Logic: Summerized internal errors
*/
- (rTP_LFIR, bit(18)) ? defaultMaskedError;
+ (rTP_LFIR, bit(18)) ? analyzeTodBackupTopology;
/** TP_LFIR[19]
* TOD Logic: PIB Slave access errors
*/
- (rTP_LFIR, bit(19)) ? defaultMaskedError;
+ (rTP_LFIR, bit(19)) ? analyzePibError;
/** TP_LFIR[20]
- * TOD Logic: UNUSED in P9
+ * TOD Logic: Error report from PHYP
*/
- (rTP_LFIR, bit(20)) ? defaultMaskedError;
+ (rTP_LFIR, bit(20)) ? analyzePhypTodError;
/** TP_LFIR[21]
* PCB slave Unmasked err summary
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_nimbus.rule b/src/usr/diag/prdf/common/plat/p9/p9_nimbus.rule
index b06d9ff40..175dd66d3 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_nimbus.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_nimbus.rule
@@ -2112,17 +2112,17 @@ group gTP_LFIR filter singlebit, cs_root_cause
/** TP_LFIR[18]
* TOD Logic: Summerized internal errors
*/
- (rTP_LFIR, bit(18)) ? defaultMaskedError;
+ (rTP_LFIR, bit(18)) ? analyzeTodBackupTopology;
/** TP_LFIR[19]
* TOD Logic: PIB Slave access errors
*/
- (rTP_LFIR, bit(19)) ? defaultMaskedError;
+ (rTP_LFIR, bit(19)) ? analyzePibError;
/** TP_LFIR[20]
- * TOD Logic: UNUSED in P9
+ * TOD Logic: Error report from PHYP
*/
- (rTP_LFIR, bit(20)) ? defaultMaskedError;
+ (rTP_LFIR, bit(20)) ? analyzePhypTodError;
/** TP_LFIR[21]
* PCB slave Unmasked err summary
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_proc_common_actions.rule b/src/usr/diag/prdf/common/plat/p9/p9_proc_common_actions.rule
index 815a299fb..6c5f6d0e6 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_proc_common_actions.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_proc_common_actions.rule
@@ -23,6 +23,294 @@
#
# IBM_PROLOG_END_TAG
+########################################################################
+#
+# TOD Rules and Groups
+#
+########################################################################
+
+rule TodErrors
+{
+ TOD_ERRORREGISTER & (~TOD_ERRORMASK) & (~TOD_ERRORACTION);
+};
+
+
+group gTodErrors filter singlebit
+{
+ /** TOD_ERRORREGISTER[0]
+ * M_PATH_CONTROL_REG_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(0)) ? defaultMaskedError;
+
+ /** TOD_ERRORREGISTER[1]
+ * M_PATH_0_PARITY_ERROR
+ */
+ (TodErrors,bit(1)) ? selfCapThr32TopReConfig;
+
+ /** TOD_ERRORREGISTER[2]
+ * M_PATH_1_PARITY_ERROR
+ */
+ (TodErrors,bit(2)) ? selfCapThr32TopReConfig;
+
+ /** TOD_ERRORREGISTER[3]
+ * PCRP0_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(3)) ? defaultMaskedError;
+
+ /** TOD_ERRORREGISTER[4]
+ * PCRP1_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(4)) ? defaultMaskedError;
+
+ /** TOD_ERRORREGISTER[5]
+ * SCRP0_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(5)) ? defaultMaskedError;
+
+ /** TOD_ERRORREGISTER[6]
+ * SCRP1_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(6)) ? defaultMaskedError;
+
+ /** TOD_ERRORREGISTER[7]
+ * SPCR_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(7)) ? defaultMaskedError;
+
+ /** TOD_ERRORREGISTER[8]
+ * IPCR_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(8)) ? defaultMaskedError;
+
+ /** TOD_ERRORREGISTER[9]
+ * PSMSCR_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(9)) ? defaultMaskedError;
+
+ /** TOD_ERRORREGISTER[10]
+ * S_PATH_0_PARITY_ERROR
+ */
+ (TodErrors,bit(10)) ? selfCapThr32TopReConfig;
+
+ /** TOD_ERRORREGISTER[11]
+ * REG_0X08_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(11)) ? selfCaptThr32;
+
+
+ /** TOD_ERRORREGISTER[12]
+ * M_PATH_STATUS_REG_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(12)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[13]
+ * S_PATH_STATUS_REG_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(13)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[14]
+ * M_PATH_0_STEP_CHECK_ERROR
+ */
+ (TodErrors,bit(14)) ? analyzeStepCheckErr;
+
+ /** TOD_ERRORREGISTER[15]
+ * M_PATH_1_STEP_CHECK_ERROR
+ */
+ (TodErrors,bit(15)) ? analyzeStepCheckErr;
+
+ /** TOD_ERRORREGISTER[16]
+ * S_PATH_0_STEP_CHECK_ERROR
+ */
+ (TodErrors,bit(16)) ? analyzeStepCheckErr;
+
+ /** TOD_ERRORREGISTER[17]
+ * I_PATH_STEP_CHECK_ERROR
+ */
+ (TodErrors,bit(17)) ? analyzeStepCheckErr;
+
+ /** TOD_ERRORREGISTER[18]
+ * PSS HAMMING DISTANCE
+ */
+ (TodErrors,bit(18)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[19]
+ * MISC_RESET_REG_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(19)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[20]
+ * S_PATH_1_PARITY_ERROR
+ */
+ (TodErrors,bit(20)) ? selfCapThr32TopReConfig;
+
+ /** TOD_ERRORREGISTER[21]
+ * S_PATH_1_STEP_CHECK_ERROR
+ */
+ (TodErrors,bit(21)) ? analyzeStepCheckErr;
+
+ /** TOD_ERRORREGISTER[22]
+ * I_PATH_DELAY_STEP_CHECK_PARITY_ERROR
+ */
+ (TodErrors,bit(22)) ? selfCapThr32TopReConfig;
+
+ /** TOD_ERRORREGISTER[23]
+ * REG_0X0C DATA_PARITY ERROR
+ */
+ (TodErrors,bit(23)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[24]
+ * REG_0X11_0X12_0X13_0X14_0X15_0X16_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(24)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[25]
+ * REG_0X17_0X18_0X21_0X22_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(25)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[26]
+ * REG_0X1D_0X1E_0X1F_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(26)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[27]
+ * TIMER_VALUE_REG_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(27)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[28]
+ * LOW_ORDER_STEP_REG_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(28)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[29]
+ * FSM_REG_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(29)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[30]
+ * RX_TTYPE_CONTROL_REG_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(30)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[31]
+ * REG_0X30_0X31_0X32_0X33_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(31)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[32]
+ * CHIP_CONTROL_REG_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(32)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[33]
+ * I_PATH_SYNC_CHECK_ERROR
+ */
+ (TodErrors,bit(33)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[34]
+ * I_PATH_FSM_STATE_PARITY_ERROR
+ */
+ (TodErrors,bit(34)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[35]
+ * I_PATH_TIME_REG_PARITY_ERROR
+ */
+ (TodErrors,bit(35)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[36]
+ * I_PATH_TIME_REG_OVERFLOW
+ */
+ (TodErrors,bit(36)) ? maskTodError;
+
+ /** TOD_ERRORREGISTER[37]
+ * WOF_LOW_ORDER_STEP_COUNTER_PARITY_ERROR
+ */
+ (TodErrors,bit(37)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[38|39|40|41|42|43]
+ * RX_TTYPE_1
+ */
+ (TodErrors,bit(38|39|40|41|42|43)) ? defaultMaskedError;
+
+ #Note: For firmware all the TOD-PIB errors are informational by nature, so
+ # no special analysis is done for them.
+ /** TOD_ERRORREGISTER[44]
+ * PIB_SLAVE_ADDR_INVALID_ERROR
+ */
+ (TodErrors,bit(44)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[45]
+ * PIB_SLAVE_WRITE_INVALID_ERROR
+ */
+ (TodErrors,bit(45)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[46]
+ * PIB_SLAVE_READ_INVALID_ERROR
+ */
+ (TodErrors,bit(46)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[47]
+ * PIB_SLAVE_ADDR_PARITY_ERROR
+ */
+ (TodErrors,bit(47)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[48]
+ * PIB_SLAVE_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(48)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[49]
+ * TTYPE_CONTROL_REG_DATA_PARITY_ERROR
+ */
+ #Note: Based on discussion with the Hardware Team and PHYP, this error
+ #shall be routed to PHYP instead of FSP.
+ (TodErrors,bit(49)) ? defaultMaskedError;
+
+ /** TOD_ERRORREGISTER[50|51|52]
+ * PIB_MASTER_RSP_INFO_ERROR
+ */
+ #No special analysis is done for TOD-PIB errors, since these errors are
+ #informational by nature.
+ (TodErrors,bit( 50|51|52 )) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[53]
+ * RX_TTYPE_INVALID_ERROR
+ */
+ (TodErrors,bit(53 )) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[54]
+ * RX_TTYPE_4_DATA_PARITY_ERROR
+ */
+ (TodErrors,bit(54)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[55]
+ * PIB_MASTER_REQUEST_ERROR
+ */
+ (TodErrors,bit(55)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[56]
+ * PIB_RESET_DURING_PIB_ACCESS_ERROR
+ */
+ (TodErrors,bit(56)) ? selfCaptThr32;
+
+ /** TOD_ERRORREGISTER[57]
+ * EXTERNAL_XSTOP_ERROR
+ */
+ #This bit tells us that the TOD has received an external checkstop. Its
+ #purpose is merely to provide information, hence no analysis is done
+ #here.
+ (TodErrors,bit(57)) ? defaultMaskedError;
+
+ #bit[58:63] not implemented
+ /** TOD_ERRORREGISTER[58|59|60|61|62|63]
+ * SPARE_ERROR
+ */
+ (TodErrors,bit(58|59|60|61|62|63)) ? defaultMaskedError;
+
+};
+
+
actionclass level2_M_self_L_th_32perDay
{
callout2ndLvlMed;
@@ -192,3 +480,97 @@ actionclass calloutBusInterface_obus3_th_1
threshold1;
};
+# TOD Actions:
+# * Capture at least this chip TOD registers.
+#
+# * Threshold normal TOD errors (TOD error register) at 32/day.
+#
+# * Network Errors : Step Check Fault or "PHYP Failed Topology"
+# - PHYP Failed Topology must be visible and "Request new Topology".
+# - May have PHYP failure on another chip.
+# - Capture TOD registers for whole system.
+# - Isolate both topologies and make callout.
+
+/**
+ * Analyze TOD Register.
+ */
+actionclass TodReportByRegister
+{
+ try(analyze(gTodErrors),TodRegisterAnalyzeFail);
+};
+
+actionclass TodRegisterAnalyzeFail
+{
+ capture(TODReg);
+ self_H_th_1;
+};
+
+/**
+ * PHYP Network fault.
+ */
+actionclass TodReportByPHYP
+{
+ threshold1;
+ funccall("todStepCheckFault");
+};
+
+/**
+ * TOD Step Check Fault - Isolate topology.
+ */
+actionclass analyzeStepCheckErr
+{
+ threshold32pday;
+ funccall("todStepCheckFault");
+};
+
+/** action for TOD errors which do not need any specific analysis */
+
+actionclass selfCaptThr32
+{
+ calloutSelfHigh;
+ capture(TODReg);
+ threshold32pday;
+};
+
+/**
+ * Mask indication from PHYP due to all cores evacuated.
+ * - Mask TOD errors from this chip.
+ * - Not visible unless xstp.
+ * - Request new topology if chip is MDMT.
+ */
+actionclass maskTodError
+{
+ self_H_th_1;
+ capture(TODReg);
+ funccall("ClearServiceCallFlag");
+ funccall("todNewTopologyIfBackupMDMT");
+};
+
+/** callout Proc reporting error. If threshold reaches 32 per day, request
+ * reconfiguration of topology.
+ */
+actionclass selfCapThr32TopReConfig
+{
+ selfCaptThr32;
+ funccall("requestTopologySwitch");
+};
+
+/** analyzes backup topology if TOD error analysis is enabled */
+actionclass analyzeTodBackupTopology
+{
+ try( funccall("isTodDisabled"), TodReportByRegister );
+};
+
+/** callout and gard self if TOD error analysis is enabled */
+actionclass analyzePibError
+{
+ capture(TODReg);
+ try( funccall("isTodDisabled"), self_H_th_1 );
+};
+
+/** analyzes active topology if TOD error analysis is enabled */
+actionclass analyzePhypTodError
+{
+ try( funccall("isTodDisabled"), TodReportByPHYP );
+};
+
diff --git a/src/usr/diag/prdf/common/plat/p9/p9_proc_common_regs.rule b/src/usr/diag/prdf/common/plat/p9/p9_proc_common_regs.rule
index e35f1c778..34be4c09e 100644
--- a/src/usr/diag/prdf/common/plat/p9/p9_proc_common_regs.rule
+++ b/src/usr/diag/prdf/common/plat/p9/p9_proc_common_regs.rule
@@ -118,209 +118,262 @@
};
############################################################################
- # P9 PROC target HDCT additions (open power chkstop analysis)
+ # TOD Registers
############################################################################
-
- register OCC_ERROR_REPORT_REG
+ register TOD_MPCR
{
- name "OCC ERROR REPORT REG";
- scomaddr 0x0101080a;
- capture group default;
+ name "TOD M Path Ctrl";
+ scomaddr 0x00040000;
+ capture group TODReg;
};
- register PB_ERROR_REPORT
+ register TOD_PCRP0
{
- name "PB ERROR REPORT REG";
- scomaddr 0x020110a1;
- capture group default;
+ name "TOD Pri Port 0 Ctrl";
+ scomaddr 0x00040001;
+ capture group TODReg;
};
- register PB_PTY_ERROR_REPORT
+ register TOD_PCRP1
{
- name "PB PTY ERROR REPORT REG";
- scomaddr 0x020110a2;
- capture group default;
+ name "TOD Pri Port 1 Ctrl";
+ scomaddr 0x00040002;
+ capture group TODReg;
};
- register DMA_CERR_0
+ register TOD_SCRP0
{
- name "DMA CERR 0";
- scomaddr 0x02011057;
- capture group default;
+ name "TOD Sec Port 0 Ctrl";
+ scomaddr 0x00040003;
+ capture group TODReg;
};
- register DMA_CERR_1
+ register TOD_SCRP1
{
- name "DMA CERR 1";
- scomaddr 0x02011058;
- capture group default;
- };
-
- register PB_CENT_CR_ERROR
- {
- name "PB CENT CR ERROR";
- scomaddr 0x05011c2c;
- capture group default;
- };
-
- register PBA_ERR_REPORT_0
- {
- name "PBA ERROR REPORT 0";
- scomaddr 0x0501284c;
- capture group default;
- };
-
- register PBA_ERR_REPORT_1
- {
- name "PBA ERROR REPORT 1";
- scomaddr 0x0501284d;
- capture group default;
- };
-
- register PBA_ERR_REPORT_2
- {
- name "PBA ERROR REPORT 2";
- scomaddr 0x0501284e;
- capture group default;
- };
-
- register PB_PTY_ERR_REPORT
- {
- name "PB PTY ERROR REPORT";
- scomaddr 0x05012C22;
- capture group default;
+ name "TOD Sec Port 1 Ctrl";
+ scomaddr 0x00040004;
+ capture group TODReg;
};
register TOD_SLAVE_PATH_CTRL
{
name "TOD SLAVE PATH CTRL";
scomaddr 0x00040005;
- capture group default;
+ capture group TODReg;
};
register TOD_INTERNAL_PATH_CTRL
{
name "TOD INTERNAL PATH CTRL";
scomaddr 0x00040006;
- capture group default;
+ capture group TODReg;
};
register TOD_CONFIG_CTRL
{
name "TOD Prim Sec Config Control";
scomaddr 0x00040007;
- capture group default;
+ capture group TODReg;
};
- register TOD_PSS_MSS_STATUS
+ register TOD_STATUSREGISTER
{
name "TOD PSS MSS Status Reg";
scomaddr 0x00040008;
- capture group default;
+ capture group TODReg;
};
register TOD_MASTER_PATH_STATUS
{
name "TOD Master Path Status Reg";
scomaddr 0x00040009;
- capture group default;
+ capture group TODReg;
+ };
+
+ register TOD_SPSR
+ {
+ name "TOD S PATH STATUS REG";
+ scomaddr 0x0004000A;
+ capture group TODReg;
+ };
+
+ register TOD_CCR
+ {
+ name "TOD CHIP CTRL REG";
+ scomaddr 0x00040010;
+ capture group TODReg;
};
register TOD_MASTER_PATH0_STEP_STEERING
{
name "TOD Master Path0 Step Steering";
scomaddr 0x0004000E;
- capture group default;
+ capture group TODReg;
};
register TOD_MASTER_PATH1_STEP_STEERING
{
name "TOD Master Path1 Step Steering";
scomaddr 0x0004000F;
- capture group default;
+ capture group TODReg;
};
register TOD_TRACE_DATASET_1
{
name "TOD Trace Dataset 1";
scomaddr 0x0004001D;
- capture group default;
+ capture group TODReg;
};
register TOD_TRACE_DATASET_2
{
name "TOD Trace Dataset 2";
scomaddr 0x0004001E;
- capture group default;
+ capture group TODReg;
};
register TOD_TRACE_DATASET_3
{
name "TOD Trace Dataset 3";
scomaddr 0x0004001F;
- capture group default;
+ capture group TODReg;
};
register OSC_ERROR_HOLD
{
name "OSC ERROR HOLD";
scomaddr 0x01020019;
- capture group default;
+ capture group TODReg;
};
register OSC_ERROR_MASK
{
name "OSC ERROR MASK";
scomaddr 0x0102001A;
- capture group default;
+ capture group TODReg;
};
register OSC_ERROR_MODE
{
name "OSC ERROR MODE";
scomaddr 0x0102001B;
- capture group default;
+ capture group TODReg;
};
register TOD_FSM_REGISTER
{
name "TOD FSM Register";
scomaddr 0x00040024;
- capture group default;
+ capture group TODReg;
};
register TOD_TX_TTYPE_CTRL_REG
{
name "TOD TX TType Ctrl reg";
scomaddr 0x00040027;
- capture group default;
+ capture group TODReg;
};
register TOD_RX_TTYPE_CTRL_REG
{
name "TOD RX TType Ctrl reg";
scomaddr 0x00040029;
- capture group default;
+ capture group TODReg;
};
- register TOD_ERROR_INTERRUPTS
+ register TOD_ERRORREGISTER
{
name "TOD Error and Interrupts";
scomaddr 0x00040030;
- capture group default;
+ capture group TODReg;
+ reset (^, 0x40030);
};
- register TOD_CERR_REPORT
+ register TOD_ERRORMASK
{
name "TOD CERR Report";
scomaddr 0x00040032;
- capture group default;
+ capture group TODReg;
};
- register TOD_ROUTE_ERRORS_TO_CORE
+ register TOD_ERRORACTION
{
name "TOD Route Errors to Core";
scomaddr 0x00040033;
+ capture group TODReg;
+ };
+
+ ############################################################################
+ # P9 PROC target HDCT additions (open power chkstop analysis)
+ ############################################################################
+
+ register OCC_ERROR_REPORT_REG
+ {
+ name "OCC ERROR REPORT REG";
+ scomaddr 0x0101080a;
+ capture group default;
+ };
+
+ register PB_ERROR_REPORT
+ {
+ name "PB ERROR REPORT REG";
+ scomaddr 0x020110a1;
+ capture group default;
+ };
+
+ register PB_PTY_ERROR_REPORT
+ {
+ name "PB PTY ERROR REPORT REG";
+ scomaddr 0x020110a2;
+ capture group default;
+ };
+
+ register DMA_CERR_0
+ {
+ name "DMA CERR 0";
+ scomaddr 0x02011057;
+ capture group default;
+ };
+
+ register DMA_CERR_1
+ {
+ name "DMA CERR 1";
+ scomaddr 0x02011058;
+ capture group default;
+ };
+
+ register PB_CENT_CR_ERROR
+ {
+ name "PB CENT CR ERROR";
+ scomaddr 0x05011c2c;
+ capture group default;
+ };
+
+ register PBA_ERR_REPORT_0
+ {
+ name "PBA ERROR REPORT 0";
+ scomaddr 0x0501284c;
+ capture group default;
+ };
+
+ register PBA_ERR_REPORT_1
+ {
+ name "PBA ERROR REPORT 1";
+ scomaddr 0x0501284d;
+ capture group default;
+ };
+
+ register PBA_ERR_REPORT_2
+ {
+ name "PBA ERROR REPORT 2";
+ scomaddr 0x0501284e;
+ capture group default;
+ };
+
+ register PB_PTY_ERR_REPORT
+ {
+ name "PB PTY ERROR REPORT";
+ scomaddr 0x05012C22;
capture group default;
};
diff --git a/src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C b/src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C
new file mode 100644
index 000000000..76468e6a3
--- /dev/null
+++ b/src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C
@@ -0,0 +1,1106 @@
+/* IBM_PROLOG_BEGIN_TAG */
+/* This is an automatically generated prolog. */
+/* */
+/* $Source: src/usr/diag/prdf/common/plat/p9/prdfP9TodPlugins.C $ */
+/* */
+/* OpenPOWER HostBoot Project */
+/* */
+/* Contributors Listed Below - COPYRIGHT 2018 */
+/* [+] International Business Machines Corp. */
+/* */
+/* */
+/* Licensed under the Apache License, Version 2.0 (the "License"); */
+/* you may not use this file except in compliance with the License. */
+/* You may obtain a copy of the License at */
+/* */
+/* http://www.apache.org/licenses/LICENSE-2.0 */
+/* */
+/* Unless required by applicable law or agreed to in writing, software */
+/* distributed under the License is distributed on an "AS IS" BASIS, */
+/* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or */
+/* implied. See the License for the specific language governing */
+/* permissions and limitations under the License. */
+/* */
+/* IBM_PROLOG_END_TAG */
+/**@file prdfP9TodPlugins.C
+ * @brief defines all the TOD error plugins
+ */
+
+#include <prdfPluginDef.H>
+#include <prdfPluginMap.H>
+#include <prdfExtensibleChip.H>
+#include <iipSystem.h>
+#include <prdfP9ProcDomain.H>
+#include <prdfGlobal_common.H>
+#include <iipServiceDataCollector.h>
+#include <prdfRegisterCache.H>
+#include <UtilHash.H>
+#include <algorithm>
+#include <prdfPlatProcConst.H>
+
+using namespace TARGETING;
+
+namespace PRDF
+{
+
+using namespace PlatServices;
+using namespace TOD;
+
+/** @struct TodFaultData
+ * TOD fault isolation information collected for a single proc chip.
+ */
+struct TodFaultData
+{
+ TargetHandle_t chipReportingError; // target reporting tod error
+ bool phypDetectedFault; // phyp detected a TOD fault on this chip
+ // (on either topology), i.e. TP_LFIR[20] is set
+ bool isActiveMdmt; // Chip is MDMT on active topology
+ bool isBackupMdmt; // Chip is MDMT on backup topology
+ bool faultDetected[2]; // index 0 for fault on active topo, 1 for backup
+ bool isMdmtAndFaulty[2];// chip is MDMT and has a fault on same topo
+ bool activeTopologyIsPrimary; // true if primary topology is the active one
+ TargetHandle_t chipSourcingClk[2];//if not MDMT, which chip is tod clk src
+ uint32_t activeMasterPathPosition[2]; // Master path (0 or 1) providing
+ // the TOD clock source to an MDMT
+
+ /**
+ *@brief Constructor: records the reporting chip, clears all fault state
+ */
+ explicit TodFaultData( TargetHandle_t i_procTgt ):
+ chipReportingError( i_procTgt ),
+ phypDetectedFault( false ),
+ isActiveMdmt( false ),
+ isBackupMdmt( false )
+ {
+ faultDetected[0] = false;
+ faultDetected[1] = false;
+ isMdmtAndFaulty[0] = false;
+ isMdmtAndFaulty[1] = false;
+ activeTopologyIsPrimary = false;
+ chipSourcingClk[0] = NULL;
+ chipSourcingClk[1] = NULL;
+ activeMasterPathPosition[0] = 0;
+ activeMasterPathPosition[1] = 0;
+ }
+};
+
+/** @struct TopologySwitchDetails
+ * System TOD failover status
+ */
+struct TopologySwitchDetails
+{
+ bool masterPathHwFailOver; // hw failover status of master path
+ bool phypSwitchedTopology; // topology switch status by Phyp
+
+ /**
+ * @brief Constructor: both status flags start out false
+ */
+ TopologySwitchDetails():
+ masterPathHwFailOver( false ),
+ phypSwitchedTopology( false )
+ {}
+};
+
+namespace Proc
+{
+/**
+ * @brief Captures the "TODReg" capture group of every chip in PROC_DOMAIN.
+ * @param i_stepcode The step code data struct (receives the capture data)
+ * @return Always SUCCESS.
+ */
+int32_t todCaptureRegisters( STEP_CODE_DATA_STRUCT & i_stepcode )
+{
+ ProcDomain * l_procDomain =
+ (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN );
+ // Add the TODReg group of each chip in the domain to the capture data.
+ for( size_t i = 0; i < l_procDomain->GetSize(); i++ )
+ {
+ RuleChip * l_chip = l_procDomain->LookUp( i );
+ l_chip->CaptureErrorData( i_stepcode.service_data->GetCaptureData(),
+ Util::hashString( "TODReg" ) );
+ }
+ return SUCCESS;
+}
+
+/**
+ * @brief Clears step check bits in the TOD error register and TP_LFIR[18,20]
+ * @param i_stepcode The step code data struct (not used by this function)
+ * @return FAIL if any register access failed on any chip, SUCCESS otherwise.
+ */
+int32_t todCleanUpErrors( STEP_CODE_DATA_STRUCT & i_stepcode )
+{
+ #define PRDF_FUNC "[Proc::todCleanUpErrors] "
+
+ int32_t o_rc = SUCCESS;
+ // The clean up below is only compiled in for __HOSTBOOT_RUNTIME builds.
+#ifdef __HOSTBOOT_RUNTIME
+ ProcDomain * l_procDomain =
+ (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN );
+
+ for( size_t i = 0; i < l_procDomain->GetSize(); i++ )
+ {
+ int32_t l_rc = SUCCESS;
+ RuleChip * l_procChip = l_procDomain->LookUp( i );
+
+ // Clear bits 14,15,16,17,21,39 in TOD Error Register
+ // Bits in this register are cleared by writing 1
+ SCAN_COMM_REGISTER_CLASS * l_todError =
+ l_procChip->getRegister( "TOD_ERRORREGISTER" );
+
+ l_rc = l_todError->Read();
+ if ( SUCCESS != l_rc )
+ {
+ PRDF_ERR( PRDF_FUNC"Read() failed on TOD_ERRORREGISTER: "
+ "proc=0x%08x", l_procChip->GetId() );
+
+ // Continue to try clearing the other chips
+ o_rc = FAIL;
+ continue;
+ }
+
+ uint64_t l_val = l_todError->GetBitFieldJustified( 0, 64 );
+ l_val = l_val & 0x0003C40001000000ull; // bits 14,15,16,17,21,39
+ // Only write back if at least one of the bits of interest is set.
+ if ( 0 != l_val )
+ {
+ l_todError->SetBitFieldJustified( 0, 64, l_val );
+ l_rc = l_todError->Write();
+
+ if ( SUCCESS != l_rc )
+ {
+ PRDF_ERR( PRDF_FUNC"Write() failed on TOD_ERRORREGISTER: "
+ "proc=0x%08x", l_procChip->GetId() );
+ o_rc = FAIL;
+ continue;
+ }
+ }
+
+
+ // Next read shall cause Force Read
+ RegDataCache & regCache = RegDataCache::getCachedRegisters();
+ regCache.flush( l_procChip, l_todError );
+
+ // Clear bits 18 and 20 in TP_LFIR: write all ones except 18,20 to AND reg
+ SCAN_COMM_REGISTER_CLASS * l_andTpFir =
+ l_procChip->getRegister( "TP_LFIR_AND" );
+
+ l_andTpFir->setAllBits();
+ l_andTpFir->ClearBit(18);
+ l_andTpFir->ClearBit(20);
+
+ l_rc = l_andTpFir->Write();
+ if ( SUCCESS != l_rc )
+ {
+ PRDF_ERR( PRDF_FUNC"Write() failed on TP_LFIR_AND: "
+ "proc=0x%08x", l_procChip->GetId() );
+ o_rc = FAIL;
+ continue;
+ }
+ }
+#endif
+ return o_rc;
+
+ #undef PRDF_FUNC
+}
+
+/**
+ * @brief Investigates if there is a master path failover initiated by HW.
+ * @param i_chip chip reporting TOD errors
+ * @param io_faultData Tod fault info, updated when a failover is detected
+ * @param o_failoverStatus failover status (read and updated by this function)
+ */
+void checkForHwInitiatedFailover( ExtensibleChip * i_chip,
+ TodFaultData & io_faultData,
+ TopologySwitchDetails & o_failoverStatus )
+{
+ #define PRDF_FUNC "[Proc::checkForHwInitiatedFailover] "
+
+ // This function detects whether an MDMT chip has switched its master path
+ // due to a clock fault. In this case, PRD gets an attention due to a step
+ // check error in Master Path 0. The failover modifies bit 12 of the TOD
+ // status register. PRD finds that both active and backup topology use the
+ // same master path (path 1). When PRD checks for faults on each topology
+ // we'll be looking at path 1 for both and find no faults there. So this
+ // function checks for the master path failover case and marks the MDMT
+ // chip at fault appropriately.
+
+ do
+ {
+ if( false == io_faultData.isActiveMdmt ||
+ false == io_faultData.isBackupMdmt )
+ {
+ // only a chip that is MDMT on both topologies is checked
+ break;
+ }
+
+ // Is MDMT in a failover state (no MDMT fault flagged on either topology)?
+ if(( false == io_faultData.isMdmtAndFaulty[0] &&
+ false == io_faultData.isMdmtAndFaulty[1] ))
+
+ {
+ // Get TOD Error register.
+ SCAN_COMM_REGISTER_CLASS * l_todError =
+ i_chip->getRegister("TOD_ERRORREGISTER");
+
+ uint32_t l_oscPos = 1; // master path 1 unless bit 14 says path 0
+
+ if ( SUCCESS != l_todError->Read() )
+ {
+ PRDF_ERR( PRDF_FUNC"Read() failed on TOD_ERRORREGISTER: "
+ "i_chip=0x%08x", i_chip->GetId() );
+ break;
+ }
+ // Bit 14: M_PATH_0 step check error, bit 15: M_PATH_1 step check error
+ if( l_todError->IsBitSet(14) )
+ {
+ l_oscPos = 0;
+ }
+
+ else if( !l_todError->IsBitSet(15))
+ {
+ break;
+ }
+
+ // A master path step check error is on, but it was not attributed
+ // to either topology. This implies a HW path failover has occurred.
+ o_failoverStatus.masterPathHwFailOver = true;
+ // Use index 1 if PHYP already switched topology, otherwise index 0.
+ uint32_t topPos =
+ ( true == o_failoverStatus.phypSwitchedTopology )? 1 : 0;
+
+ io_faultData.faultDetected[topPos] = true;
+ io_faultData.isMdmtAndFaulty[topPos] = true;
+ io_faultData.activeMasterPathPosition[topPos] = l_oscPos;
+
+ PRDF_TRAC( PRDF_FUNC "HW Initiated failover: MDMT 0x%08x "
+ "faulty, mpath pos: %d", i_chip->GetId(),
+ l_oscPos );
+ }
+
+ }while(0);
+
+ #undef PRDF_FUNC
+}
+
+/**
+ * @brief Analyzes the TOD error of a given proc
+ * @param i_chip chip reporting TOD errors
+ * @param o_faults list of Tod fault info
+ * @param i_stepcode The step code data struct
+ * @param io_failOverStatus topology failover status
+ * @return SUCCESS.
+ */
+int32_t todCollectFaultDataChip( ExtensibleChip * i_chip,
+ std::vector<TodFaultData> & o_faults,
+ STEP_CODE_DATA_STRUCT & i_stepcode,
+ TopologySwitchDetails & io_failOverStatus )
+{
+ #define PRDF_FUNC "[Proc::todCollectFaultDataChip] "
+
+ TargetHandle_t l_chipTarget = i_chip->GetChipHandle();
+ TodFaultData l_faultData ( l_chipTarget );
+
+ uint32_t l_rc = FAIL;
+
+ do
+ {
+ // Check if PHYP reported TOD error
+ SCAN_COMM_REGISTER_CLASS * l_pTpLFir = i_chip->getRegister( "TP_LFIR" );
+
+ l_rc = l_pTpLFir->Read();
+ if ( SUCCESS != l_rc )
+ {
+ PRDF_ERR( PRDF_FUNC"Read() failed on TP_LFIR: i_chip=0x%08x",
+ i_chip->GetId() );
+ break;
+ }
+
+ l_faultData.phypDetectedFault = l_pTpLFir->IsBitSet(20);
+
+ // Deterimine active topology.
+ SCAN_COMM_REGISTER_CLASS * l_todStatus =
+ i_chip->getRegister("TOD_STATUSREGISTER");
+
+ l_rc = l_todStatus->Read();
+ if ( SUCCESS != l_rc )
+ {
+ PRDF_ERR( PRDF_FUNC"Read() failed on TOD_STATUSREGISTER: "
+ "i_chip=0x%08x", i_chip->GetId() );
+ break;
+ }
+
+ //Reading TOD_STATUSREGISTER[0:2]
+ //0b000 means configuration chosen is Primary
+ //0b111 means configuration chosen is Secondary
+
+ bool l_activeIsPrimary =
+ ( 0 == l_todStatus->GetBitFieldJustified( 0, 3 ) );
+ l_faultData.activeTopologyIsPrimary = l_activeIsPrimary;
+
+ // Get TOD Error register.
+ SCAN_COMM_REGISTER_CLASS * l_todError =
+ i_chip->getRegister("TOD_ERRORREGISTER");
+
+ l_rc = l_todError->Read();
+ if ( SUCCESS != l_rc )
+ {
+ PRDF_ERR( PRDF_FUNC"Read() failed on TOD_ERRORREGISTER: "
+ "i_chip=0x%08x", i_chip->GetId() );
+ break;
+ }
+
+ // Check both topologies, active first.
+ for ( int i = 0; i < 2; i++ )
+ {
+ // Each chip has 2 TOD topologies configured (primary and secondary)
+ // One of these is selected as active topology and one as backup
+ // In TodFaultData, index 0 is used for the active topology, and 1
+ // for the backup. We also need to know whether we looking at the
+ // primary or secondary topology, because that will determine
+ // the bit positions we used in the TOD registers.
+ // So within this for loop, index 0/1 refers to active/backup
+ // l_topIsPri identifies whether the current topo was configured
+ // in the primary or secondary position.
+
+ bool l_topIsPri =
+ ( ( 0 == i ) ? l_activeIsPrimary : !l_activeIsPrimary );
+
+ bool l_masterTodSelected = false ;
+ bool l_masterDrawerSelected = false;
+
+ // Check if MDMT on current topology.
+ l_masterTodSelected =
+ l_todStatus->IsBitSet( l_topIsPri ? 13 : 17 );
+ l_masterDrawerSelected =
+ l_todStatus->IsBitSet( l_topIsPri ? 14 : 18 );
+
+ // Check master OSC status if MDMT
+ if ( ( l_masterTodSelected ) && ( l_masterDrawerSelected ) )
+ {
+ // Deterimine which OSC card is used.
+ bool l_osc0; //means master path 0
+ bool l_oscFail;
+
+ l_faultData.isActiveMdmt = l_todStatus->IsBitSet(23);
+ l_faultData.isBackupMdmt = l_todStatus->IsBitSet(24);
+
+ l_osc0 = !l_todStatus->IsBitSet( l_topIsPri ? 12 : 16 );
+ l_faultData.activeMasterPathPosition[i] = l_osc0 ? 0 : 1;
+
+ // Read step check error bit in TOD error register
+ l_oscFail = l_todError->IsBitSet( l_osc0 ? 14 : 15 );
+
+ if ( l_oscFail )
+ {
+ // Set fault data.
+ l_faultData.faultDetected[i] = true;
+ l_faultData.isMdmtAndFaulty[i] = true;
+
+ PRDF_TRAC(PRDF_FUNC " MDMT: 0x%08x at Error, M-Path: %d, "
+ "topology: %c",
+ i_chip->GetId(), l_osc0 ? 0 : 1,
+ i == 0 ?'A':'B' );
+ }
+
+ }//if mdmt
+
+ else // Is not MDMT on this topology.
+ {
+ // Deterimine whether slave chip is using Primary configuration
+ // slave path (slave path 0 )or secondary configuration slave
+ //path (slave path 1 )
+ bool l_slv0 = !l_todStatus->IsBitSet( l_topIsPri ? 15 : 19 );
+
+ // Check if TOD slave path has any step check error.
+ // bit 16 and 21 of TOD_ERRORREGISTER indicate if there is any
+ // TOD Error in slave path.
+
+ bool l_slvErr = l_todError->IsBitSet( l_slv0 ? 16 : 21 );
+
+ // If there is Step Check Error, we must determine proc sourcing
+ // clock to the chip reporting step check error. We do this by
+ // reading PCRP0 for primary configuration and SCRP1 for
+ // secondary configuration to determine which bus is being used
+ // to transmit tod clock. We can use that to get the peer proc
+ // at the other end of the bus.
+
+ if ( l_slvErr )
+ {
+ uint32_t l_connection = 0;
+ TargetHandle_t l_procClockSrc = NULL;
+
+ uint32_t l_ret = FAIL;
+#ifdef __HOSTBOOT_RUNTIME
+ l_ret = getTodPortControlReg( l_chipTarget, l_slv0,
+ l_connection );
+#endif
+ if( SUCCESS != l_ret ) continue;
+
+ // The connection value is in bits 0:2. The scomdef doesn't
+ // define this very well:
+ // X0_PORT_0=>0b000
+ // X1_PORT_0=>0b001
+ // X2_PORT_0=>0b010
+ // X3_PORT_0=>0b011
+ // X4_PORT_0=>0b100
+ // X5_PORT_0=>0b101
+ // X6_PORT_0=>0b110
+ // X7_PORT_0=>0b111
+ // I've been told the actual definition is 0-2 for XBUS0-2
+ // 3-6 for OBUS0-3, port 7 unused.
+
+ l_connection >>= 29;
+ if ( l_connection > 6 )
+ {
+ PRDF_ERR( PRDF_FUNC"Configuration error for 0x%08x "
+ "connection 0x%08x", getHuid(l_chipTarget),
+ l_connection );
+ continue;
+ }
+ else
+ {
+ TYPE l_busType = TYPE_XBUS;
+ if ( l_connection > 2 )
+ {
+ l_busType = TYPE_OBUS;
+ l_connection -= 3;
+ }
+
+ l_procClockSrc = getConnectedPeerProc( l_chipTarget,
+ l_busType,
+ l_connection );
+ }
+
+ if( NULL == l_procClockSrc )
+ {
+ l_procClockSrc = l_chipTarget;
+ }
+
+ // Set fault data.
+ l_faultData.faultDetected[i] = true;
+ l_faultData.chipSourcingClk[i] = l_procClockSrc;
+
+ PRDF_TRAC( PRDF_FUNC " Slave 0x%08x at Error S-Path %d,"
+ "topology %c, clk source is 0x%08x",
+ i_chip->GetId(), l_slv0 ? 0:1,
+ i == 0 ? 'A':'B',
+ getHuid( l_procClockSrc ) );
+
+ } // error in slave
+ }//else not mdmt
+ }//for topology
+
+ checkForHwInitiatedFailover( i_chip, l_faultData, io_failOverStatus );
+
+ // Check for an internal path error in active topology
+ uint32_t topPos = io_failOverStatus.phypSwitchedTopology ? 1 : 0;
+ if ( !l_faultData.faultDetected[topPos] && l_todError->IsBitSet(17) )
+ {
+ l_faultData.faultDetected[topPos] = true;
+ l_faultData.chipSourcingClk[topPos] = l_chipTarget;
+ }
+
+ o_faults.push_back( l_faultData );
+
+ l_rc = SUCCESS;
+
+ } while(0);
+
+ return l_rc;
+
+ #undef PRDF_FUNC
+}
+
+/**
+ * @brief Collects TOD fault error info for all procs in the system
+ * @param o_faults          fault data list; passed on to
+ *                          todCollectFaultDataChip for every proc chip
+ * @param i_stepcode        The step code data struct
+ * @param io_FailoverStatus hw initiated failover status
+ */
+void todCollectFaultDataSys( std::vector<TodFaultData> & o_faults,
+                             STEP_CODE_DATA_STRUCT & i_stepcode,
+                             TopologySwitchDetails & io_FailoverStatus )
+{
+    ProcDomain * l_procDomain =
+        (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN );
+
+    // Visit every chip in the proc domain. A failure on one chip is traced
+    // and does not stop the collection on the remaining chips.
+    for( size_t i = 0; i < l_procDomain->GetSize(); i++ )
+    {
+        RuleChip * l_chip = l_procDomain->LookUp( i );
+        uint32_t l_rc = todCollectFaultDataChip( l_chip, o_faults,
+                                                 i_stepcode,
+                                                 io_FailoverStatus );
+        if( SUCCESS != l_rc )
+        {
+            // The two literals are concatenated, so the separating space
+            // has to be part of one of them.
+            PRDF_ERR("[todCollectFaultDataSys] Failed to analyze tod errors in "
+                     "chip 0x%08x",l_chip->GetId() );
+        }
+    }
+}
+
+/**
+ * @brief  Determines if Phyp switched the topology by checking bit 39 of
+ *         TOD_ERRORREGISTER on the chips of the proc domain.
+ * @return true as soon as one chip has the bit set; false otherwise,
+ *         including when a register read fails before the bit was seen.
+ */
+bool checkPhypSwitchedTopology( )
+{
+    #define PRDF_FUNC "[checkPhypSwitchedTopology] "
+
+    ProcDomain * l_procDomain =
+        (ProcDomain*)systemPtr->GetDomain( PROC_DOMAIN );
+
+    bool o_topologySwitch = false;
+
+    // The scan ends at the first chip that reports the switch.
+    for( size_t l_idx = 0;
+         !o_topologySwitch && ( l_idx < l_procDomain->GetSize() );
+         ++l_idx )
+    {
+        RuleChip * l_procChip = l_procDomain->LookUp( l_idx );
+
+        // Get TOD Error register.
+        SCAN_COMM_REGISTER_CLASS * l_errReg =
+            l_procChip->getRegister("TOD_ERRORREGISTER");
+
+        if( SUCCESS != l_errReg->Read() )
+        {
+            // A failed read also ends the scan.
+            PRDF_ERR( PRDF_FUNC"Read failed for tod error "
+                      "register on 0x%08x", l_procChip->GetId() );
+            break;
+        }
+
+        o_topologySwitch = l_errReg->IsBitSet(39);
+    }
+
+    return o_topologySwitch;
+    #undef PRDF_FUNC
+}
+
+/**
+ * @brief Collects FFDC associated with step errors.
+ * @param io_todErrorData  contains fault status and data for all chips. On
+ *                         return it holds only the entries for which some
+ *                         fault was detected.
+ * @param i_failOverstatus contains master path and topology failover data.
+ * @param o_errorSummary   contains FFDC associated with step errors.
+ */
+void collectTodErrorFfdc( std::vector<TodFaultData> & io_todErrorData,
+                          TopologySwitchDetails i_failOverstatus,
+                          TodErrorSummary & o_errorSummary )
+{
+    std::vector<TodFaultData> faultyChip;
+    memset( &o_errorSummary, 0x00, sizeof(TodErrorSummary) );
+
+    for ( auto & i : io_todErrorData )
+    {
+        if ( i.phypDetectedFault )
+        {
+            o_errorSummary.phypDetectedTodError = 1;
+        }
+
+        if( i.isActiveMdmt )
+        {
+            o_errorSummary.activeMdmt = getHuid( i.chipReportingError );
+            o_errorSummary.activeTopology =
+                i.activeTopologyIsPrimary ? 1 : 0;
+            // master path position selected for active MDMT
+            o_errorSummary.activeTopologyMastPath =
+                i.activeMasterPathPosition[0];
+        }
+
+        if( i.isBackupMdmt )
+        {
+            o_errorSummary.backUpMdmt = getHuid( i.chipReportingError );
+            // master path position selected for backup MDMT
+            o_errorSummary.backUpTopologyMastPath =
+                i.activeMasterPathPosition[1];
+        }
+
+        // Add to list if some error is detected.
+        if ( i.phypDetectedFault || i.faultDetected[0] ||
+             i.faultDetected[1] )
+        {
+            faultyChip.push_back( i );
+        }
+    }
+
+    o_errorSummary.topologySwitchByPhyp =
+        i_failOverstatus.phypSwitchedTopology ? 1 : 0;
+
+    o_errorSummary.hardwareSwitchFlip =
+        i_failOverstatus.masterPathHwFailOver ? 1 : 0;
+    o_errorSummary.reserved = 0;
+
+    // Hand back only the chips with a detected fault. The assignment replaces
+    // the previous contents, so no separate clearing step is required.
+    io_todErrorData = faultyChip;
+}
+
+/**
+ * @brief Adds FFDC associated with step error as Capture data.
+ * @param i_chip         Chip reporting TOD step error.
+ * @param i_stepcode     Step Code Data Struct.
+ * @param i_errorSummary contains FFDC associated with step error.
+ */
+void addFfdcToCaptureData( ExtensibleChip * i_chip,
+                           STEP_CODE_DATA_STRUCT & i_stepcode,
+                           TodErrorSummary & i_errorSummary )
+{
+    // Size of the summary rounded up to a whole number of CPU_WORDs. Both
+    // values are compile-time constants, so the buffer below is an ordinary
+    // fixed-size array rather than a variable-length one.
+    const size_t sz_w = sizeof(CPU_WORD);
+    const size_t sz_t =
+        ((sizeof(TodErrorSummary) + sz_w - 1) / sz_w ) * sz_w;
+
+    // The buffer is declared as CPU_WORDs because it is accessed word by
+    // word below; the padding beyond the summary stays zero.
+    CPU_WORD errorDataBuff[sz_t / sz_w];
+    memset( errorDataBuff, 0x00, sz_t );
+    memcpy( errorDataBuff, &i_errorSummary, sizeof(TodErrorSummary) );
+
+    #if( __BYTE_ORDER == __LITTLE_ENDIAN )
+
+    // Convert each word with htonl on little endian builds.
+    for( uint32_t i = 0; i < sz_t / sz_w; i++ )
+    {
+        errorDataBuff[i] = htonl( errorDataBuff[i] );
+    }
+
+    #endif
+
+    BitString bs( sz_t * 8, errorDataBuff );
+
+    CaptureData & cd = i_stepcode.service_data->GetCaptureData();
+    cd.Add( i_chip->GetChipHandle(), Util::hashString("TOD_ERROR_DATA"), bs );
+}
+
+/**
+ * @brief  Analyzes the step check error of all procs in the system
+ * @param  i_chip     chip reporting TOD errors
+ * @param  i_stepcode The step code data struct
+ * @return SUCCESS.
+ */
+int32_t todStepCheckFault( ExtensibleChip * i_chip,
+                           STEP_CODE_DATA_STRUCT & i_stepcode )
+{
+    #define PRDF_FUNC "[Proc::todStepCheckFault] "
+
+    // When we analyze a step check fault, we will look at all chips in the
+    // system--both topologies. After we've collected TOD fault data on each
+    // chip, we will categorize the failure as:
+    //  - MDMT Clock problem
+    //  - Internal path error
+    //  - Connection error between chips
+    // In case of connection error, we try to minimize the list of chips to the
+    // list of most probable chips causing TOD errors. Once all the chips at
+    // fault are isolated, hwsv is requested to create a new back up topology.
+
+    // Collect TOD registers for FFDC.
+    todCaptureRegisters( i_stepcode );
+
+    // Collect TOD fault data.
+    std::vector<TodFaultData> l_faultData;
+
+    // List of chips for HWSV to avoid when constructing a new backup topo
+    std::vector< TargetHandle_t > l_chipBlackList;
+
+    // Osc for HWSV to avoid when constructing a new backup topology
+    // Since HB doesn't model osc targets, we need a proc and Osc position
+    TargetHandle_t procOscTgtBl = nullptr; // Proc target assoc with bad Osc
+    uint32_t oscPosBl = 0xFFFFFFFF;        // Osc position relative to proc
+
+    TopologySwitchDetails failOverstatus;
+    failOverstatus.phypSwitchedTopology = checkPhypSwitchedTopology( );
+    todCollectFaultDataSys( l_faultData, i_stepcode, failOverstatus );
+    TodErrorSummary todErrorFfdc;
+    collectTodErrorFfdc( l_faultData, failOverstatus, todErrorFfdc );
+
+    bool l_phypError = false;
+    TargetHandle_t mdmtList[2] = {NULL, NULL };
+    uint8_t mdmtFailedOscPos[2] = {0xFF, 0xFF};
+    uint8_t analysisSummary[2] = { NO_TOD_ERROR, NO_TOD_ERROR };
+
+    // Find MDMT chips at fault
+    for ( auto & l_fault : l_faultData )
+    {
+        if ( l_fault.phypDetectedFault )
+        {
+            l_phypError = true;
+        }
+
+        for ( int t = 0; t < 2; t++ )
+        {
+            if( l_fault.isMdmtAndFaulty[t] )
+            {
+                mdmtList[t] = l_fault.chipReportingError;
+                mdmtFailedOscPos[t] = l_fault.activeMasterPathPosition[t];
+            }
+        }
+    }
+
+    if ( l_phypError )
+    {
+        i_stepcode.service_data->SetThresholdMaskId(0);
+    }
+
+    // Look at both topologies.
+    for ( int i = 0; i < 2; i++ )
+    {
+        // Classifications of topology errors:
+        // 1) MDMT clock problem - callout clock or MDMT.
+        // 2) Internals only - callout chips.
+        // 3) Network error - clear internals, and isolate.
+
+        // MDMT analysis
+
+        if( NULL != mdmtList[i] )
+        {
+            // HW initiated failover. Callout the failed OSC.
+            if ( failOverstatus.masterPathHwFailOver )
+            {
+                i_stepcode.service_data->SetThresholdMaskId(0);
+            }
+            // Add Osc to blacklist
+            procOscTgtBl = mdmtList[i];
+            oscPosBl = mdmtFailedOscPos[i];
+
+            // Add Proc to blacklist
+            l_chipBlackList.push_back( mdmtList[i] );
+
+            // Callout and gard TOD OSC
+#ifdef __HOSTBOOT_MODULE
+            errlHndl_t errl =
+                ServiceGeneratorClass::ThisServiceGenerator().getErrl();
+            if ( NULL == errl )
+            {
+                PRDF_ERR( PRDF_FUNC "Failed to get the global error log" );
+                break;
+            }
+            errl->addClockCallout( mdmtList[i], HWAS::TODCLK_TYPE,
+                                   HWAS::SRCI_PRIORITY_HIGH,
+                                   HWAS::DECONFIG,
+                                   HWAS::GARD_Predictive );
+#else
+            TargetHandle_t l_clockTarget = nullptr;
+            l_clockTarget = getConnectedChild( procOscTgtBl,
+                                               TYPE_TODCLK,
+                                               oscPosBl );
+            if (l_clockTarget)
+                i_stepcode.service_data->SetCallout( l_clockTarget, MRU_HIGH );
+#endif
+            // Callout MDMT chip
+            i_stepcode.service_data->SetCallout(mdmtList[i], MRU_MEDA );
+
+            //callout a symbolic FRU to replace FRU/interfaces between Proc and
+            //TOD OSC card
+            i_stepcode.service_data->SetCallout( TOD_CLOCK_ERR, MRU_MED,
+                                                 NO_GARD );
+            analysisSummary[i] = MASTER_PATH_ERROR;
+
+            // We have analyzed this topology to an MDMT fault, move on to the
+            // backup topology
+            continue;
+        }
+
+        // These flags describe only the topology currently being analyzed, so
+        // they must start fresh on every pass. Otherwise a fault found on the
+        // first topology would make the second one look faulty as well and
+        // would skew its classification and FFDC summary.
+        bool l_allInternal = true;
+        bool l_foundFault = false;
+
+        // Collect some information for further classification
+        for ( auto & l_fault : l_faultData )
+        {
+            // If fault on topology.
+            if ( l_fault.faultDetected[i] )
+            {
+                l_foundFault = true;
+
+                // Check if non-internal fault.
+                if( l_fault.chipSourcingClk[i] != l_fault.chipReportingError )
+                {
+                    // ignore internal path errors during hw failover.
+                    l_allInternal = false;
+                }
+            }
+        }
+
+        // Skip analysis if this topology has nothing.
+        if ( !l_foundFault )
+        {
+            continue;
+        }
+
+        if ( l_allInternal ) // Internal callouts.
+        {
+            for ( auto & l_fault : l_faultData )
+            {
+                if ( ( l_fault.chipSourcingClk[i] ==
+                       l_fault.chipReportingError ) &&
+                     ( NULL != l_fault.chipReportingError ) )
+                {
+                    // update consolidated callout list and
+                    // black list for internal path errors
+                    i_stepcode.service_data->SetCallout(
+                                    l_fault.chipReportingError, MRU_MED );
+                    l_chipBlackList.push_back( l_fault.chipReportingError );
+                }
+            }
+
+            analysisSummary[i] = INTERNAL_PATH_ERROR;
+        }
+        else // Network callout.
+        {
+            // Clear all internal reports and get chips.
+            for ( auto & l_fault : l_faultData )
+            {
+                if ( l_fault.chipSourcingClk[i] == l_fault.chipReportingError )
+                {
+                    l_fault.faultDetected[i] = false;
+                }
+            }
+
+            TargetHandleList l_rootList;
+            std::vector<TodFaultData>::iterator itSrc;
+
+            for( itSrc = l_faultData.begin(); itSrc != l_faultData.end();
+                 ++itSrc )
+            {
+                std::vector<TodFaultData>::iterator itReport;
+                bool l_badSrc = false;
+
+                if( !itSrc->faultDetected[i] )
+                    continue;
+
+                for( itReport = l_faultData.begin();
+                     itReport != l_faultData.end();
+                     ++itReport )
+                {
+                    // If proc A is getting its tod clock from proc B and both
+                    // are reporting step check errors, we callout only B.
+                    if( itSrc->chipSourcingClk[i] ==
+                        itReport->chipReportingError )
+                    {
+                        if ( true == itReport->faultDetected[i] )
+                        {
+                            l_badSrc = true;
+                            l_rootList.push_back(itReport->chipReportingError);
+
+                            PRDF_TRAC( PRDF_FUNC "Network callout adding clk "
+                                       "source chip 0x%08x topology %c",
+                                       getHuid(itReport->chipReportingError ),
+                                       i == 0 ? 'A':'B' );
+                        }
+                        break;
+                    }
+                }
+
+                if( !l_badSrc )
+                {
+                    l_rootList.push_back( itSrc->chipReportingError );
+                    PRDF_TRAC( PRDF_FUNC "Network callout adding chip 0x%08x "
+                               "i = %c", getHuid( itSrc->chipReportingError ),
+                               i == 0 ? 'A':'B' );
+                }
+            }
+
+            // Sort and remove duplicates.
+            std::sort( l_rootList.begin(), l_rootList.end() );
+            std::vector<TargetHandle_t>::iterator itChip;
+            itChip = std::unique(l_rootList.begin(), l_rootList.end());
+            l_rootList.erase( itChip,l_rootList.end() );
+
+            //Calling out the final list of chips reporting connection
+            //problem in TOD network.
+            for ( auto &failedChip : l_rootList )
+            {
+                // update the consolidated callout list and
+                // black list for hwsv
+                i_stepcode.service_data->SetCallout( failedChip, MRU_MED );
+                l_chipBlackList.push_back( failedChip );
+            } //for l_rootList
+
+            analysisSummary[i] = SLAVE_PATH_NETWORK_ERROR;
+
+        }// else network error
+
+    }//for topology
+
+    // A chip may have been added once per topology; keep a single entry.
+    std::sort( l_chipBlackList.begin(), l_chipBlackList.end() );
+    std::vector<TargetHandle_t>::iterator itBlackList;
+    itBlackList = std::unique( l_chipBlackList.begin(), l_chipBlackList.end());
+    l_chipBlackList.erase( itBlackList, l_chipBlackList.end() );
+
+    // Now we call HWSV to create a new backup topology. The chips in the black
+    // list will not be selected as the new MDMT.
+#ifdef __HOSTBOOT_RUNTIME
+    todErrorFfdc.topologyResetRequested = 0;
+    if ( i_stepcode.service_data->IsAtThreshold() )
+    {
+        requestNewTODTopology( oscPosBl, procOscTgtBl,
+                               l_chipBlackList, !l_phypError );
+        todErrorFfdc.topologyResetRequested = 1;
+    }
+#endif
+
+    // If we never made a callout, call out this chip.
+    if ( 0 == i_stepcode.service_data->getMruListSize() )
+    {
+        i_stepcode.service_data->SetCallout( i_chip->GetChipHandle() );
+        analysisSummary[0] = UNKNOWN_TOD_ERROR;
+        analysisSummary[1] = UNKNOWN_TOD_ERROR;
+    }
+
+    // Clean up all TOD error reports.
+    if ( SUCCESS != todCleanUpErrors( i_stepcode ) )
+    {
+        PRDF_ERR(PRDF_FUNC "Failed to clear TOD Errors of the "
+                 "System" );
+    }
+
+    for( auto &blChip : l_chipBlackList )
+    {
+        PRDF_TRAC( PRDF_FUNC"black list chip HUID: 0x%08x ",
+                   getHuid( blChip ) );
+    }
+
+    if (procOscTgtBl)
+    {
+        PRDF_TRAC( PRDF_FUNC "black list osc chip HUID 0x%08x Pos %d",
+                   getHuid(procOscTgtBl), oscPosBl );
+    }
+
+    // At last, add FFDC as capture data to error log
+    todErrorFfdc.activeTopologySummary = analysisSummary[0];
+    todErrorFfdc.backUpTopologySummary = analysisSummary[1];
+    addFfdcToCaptureData( i_chip, i_stepcode, todErrorFfdc );
+
+    return SUCCESS;
+
+    #undef PRDF_FUNC
+}
+PRDF_PLUGIN_DEFINE_NS( p9_nimbus, Proc, todStepCheckFault );
+PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, todStepCheckFault );
+
+/**
+ * @brief  Request for creation of a new back up topology.
+ *         The request is only made in __HOSTBOOT_RUNTIME builds and only when
+ *         both MDMT select bits checked below are set for this chip.
+ * @param  i_chip     chip reporting TOD errors
+ * @param  i_stepcode The step code data struct
+ * @return SUCCESS.
+ */
+int32_t todNewTopologyIfBackupMDMT( ExtensibleChip * i_chip,
+                                    STEP_CODE_DATA_STRUCT & i_stepcode )
+{
+#ifdef __HOSTBOOT_RUNTIME
+    do
+    {
+        SCAN_COMM_REGISTER_CLASS * l_todStatus =
+            i_chip->getRegister( "TOD_STATUSREGISTER" );
+
+        if( SUCCESS != l_todStatus->Read( ) )
+        {
+            // The literals are concatenated, so the space between "status"
+            // and "register" has to be part of one of them.
+            PRDF_ERR("[todNewTopologyIfBackupMDMT] Failed to read TOD status "
+                     "register, address 0x%16llx of proc 0x%08x ",
+                     l_todStatus->GetAddress(),i_chip->GetId() );
+            break;
+        }
+
+        // NOTE(review): this flag is true when the field read from bit 0
+        // (length 3) is non-zero. Confirm that this encoding really means
+        // "primary is active"; the function name suggests the bits selected
+        // below are meant to be those of the backup topology.
+        bool primaryIsActive = !( 0 == l_todStatus->GetBitFieldJustified( 0,3 ) );
+
+        /* Check this chips role
+         * Topology - 1
+         *
+         * TOD_STATUS[13] TOD_STATUS[14] Inference
+         *       1              1        Master TOD Master Drawer
+         *       0              1        Slave TOD Master Drawer
+         *       0              0        Slave TOD Slave Drawer
+         *       1              0        Master TOD Slave Drawer
+
+         * Topology - 2
+         * TOD_STATUS[17] TOD_STATUS[18] Inference
+         *
+         * Truth Table is same as above
+         */
+
+        // Check for MDMT status.
+        bool l_masterTodSelect;
+        bool l_masterDrawerSelect;
+        l_masterTodSelect = l_todStatus->IsBitSet(
+                                    13 + ( primaryIsActive ? 0 : 4 ) );
+        l_masterDrawerSelect = l_todStatus->IsBitSet(
+                                    14 + ( primaryIsActive ? 0 : 4 ) );
+
+        // If this is the MDMT then request a new topology.
+        if( ( l_masterTodSelect ) && ( l_masterDrawerSelect ) )
+        {
+            TargetHandleList badChipList;
+            badChipList.push_back( i_chip->GetChipHandle() );
+            requestNewTODTopology( 0xFFFFFFFF, nullptr, badChipList, false );
+        }
+
+    } while(0);
+#endif
+    return SUCCESS;
+}
+PRDF_PLUGIN_DEFINE_NS( p9_nimbus, Proc, todNewTopologyIfBackupMDMT );
+PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, todNewTopologyIfBackupMDMT );
+
+
+/**
+ * @brief  Requests for a topology switch in response to logic parity error.
+ *         The request is only made in __HOSTBOOT_RUNTIME builds once the
+ *         error is at threshold.
+ * @param  i_chip     chip reporting TOD logic parity error.
+ * @param  i_stepcode The step code data struct
+ * @return SUCCESS.
+ */
+int32_t requestTopologySwitch( ExtensibleChip * i_chip,
+                               STEP_CODE_DATA_STRUCT & i_stepcode )
+{
+#ifdef __HOSTBOOT_RUNTIME
+    // Nothing to request until the threshold has been reached.
+    if ( !i_stepcode.service_data->IsAtThreshold() )
+    {
+        return SUCCESS;
+    }
+
+    // Reconfigure the TOD topology and let PHYP know when backup is good.
+    // The reporting chip is the only entry in the list of chips to avoid.
+    TargetHandleList l_avoidList;
+    l_avoidList.push_back( i_chip->GetChipHandle( ) );
+    requestNewTODTopology( 0xFFFFFFFF, nullptr, l_avoidList, true );
+#endif
+    return SUCCESS;
+}
+PRDF_PLUGIN_DEFINE_NS( p9_nimbus, Proc, requestTopologySwitch );
+PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, requestTopologySwitch );
+
+/**
+ * @brief  Checks if TOD error analysis is disabled on platform.
+ * @param  i_chip     chip reporting TOD error.
+ * @param  i_stepcode The step code data struct.
+ * @return SUCCESS if TOD analysis is disabled, FAIL if it is supported.
+ */
+int32_t isTodDisabled( ExtensibleChip * i_chip,
+                       STEP_CODE_DATA_STRUCT & i_stepcode )
+{
+    if ( isHyprConfigOpal() )
+    {
+        // On OPAL machine, mask TOD errors on first instance. There
+        // should not be any service action.
+        i_stepcode.service_data->setFlag( ServiceDataCollector::AT_THRESHOLD );
+        i_stepcode.service_data->clearServiceCall();
+        return SUCCESS; // TOD fault analysis not supported
+    }
+
+    const bool l_analysisSupported =
+        isHyprRunning() && isHyprConfigPhyp() &&
+        !isMfgAvpEnabled() && !isMfgHdatAvpEnabled();
+
+    if ( l_analysisSupported )
+    {
+        return FAIL; // TOD Fault analysis is supported
+    }
+
+    // Every remaining configuration gets these two callouts.
+    i_stepcode.service_data->SetCallout( LEVEL2_SUPPORT, MRU_MED, NO_GARD );
+    i_stepcode.service_data->SetCallout( SP_CODE, MRU_MED, NO_GARD );
+    return SUCCESS; // TOD fault analysis not supported
+}
+PRDF_PLUGIN_DEFINE_NS( p9_nimbus, Proc, isTodDisabled );
+PRDF_PLUGIN_DEFINE_NS( p9_cumulus, Proc, isTodDisabled );
+
+} //namespace Proc ends
+
+} //namespace PRDF ends
diff --git a/src/usr/diag/prdf/common/plat/p9/prdf_plat_p9.mk b/src/usr/diag/prdf/common/plat/p9/prdf_plat_p9.mk
index e4d67b930..6b6cf11e1 100644
--- a/src/usr/diag/prdf/common/plat/p9/prdf_plat_p9.mk
+++ b/src/usr/diag/prdf/common/plat/p9/prdf_plat_p9.mk
@@ -53,3 +53,4 @@ prd_rule_plugin += prdfLaneRepair.o
prd_rule_plugin += prdfP9Ex.o
prd_rule_plugin += prdfP9Ec.o
prd_rule_plugin += prdfP9Eq.o
+prd_rule_plugin += prdfP9TodPlugins.o
diff --git a/src/usr/diag/prdf/common/plugins/prdfLogParse_common.C b/src/usr/diag/prdf/common/plugins/prdfLogParse_common.C
index 725b126cf..66a3339b0 100644
--- a/src/usr/diag/prdf/common/plugins/prdfLogParse_common.C
+++ b/src/usr/diag/prdf/common/plugins/prdfLogParse_common.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2013,2017 */
+/* Contributors Listed Below - COPYRIGHT 2013,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -478,15 +478,11 @@ bool parseCaptureData( void * i_buffer, uint32_t i_buflen,
{
parseTdCtlrStateData( sigData, sigDataSize, i_parser, sigId );
}
-/*TODO: RTC 136050
- else if ( Util::hashString(SLW_FFDC_DATA::title) == sigId )
- {
- parseSlwFfdcData( sigData, sigDataSize, i_parser );
- }
else if ( Util::hashString("TOD_ERROR_DATA") == sigId)
{
parseTodFfdcData( sigData, sigDataSize, i_parser );
}
+/*
else if ( Util::hashString("OCC_CS_FFDC") == sigId)
{
parsePnorFirData( sigData, sigDataSize, i_parser );
diff --git a/src/usr/diag/prdf/common/plugins/prdfPlatProcConst.H b/src/usr/diag/prdf/common/plugins/prdfPlatProcConst.H
index de8c6fa81..5b0b78836 100644
--- a/src/usr/diag/prdf/common/plugins/prdfPlatProcConst.H
+++ b/src/usr/diag/prdf/common/plugins/prdfPlatProcConst.H
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2015 */
+/* Contributors Listed Below - COPYRIGHT 2015,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -35,28 +35,6 @@ namespace PRDF
namespace TOD
{
-
-//NOTE: Position at which strings representing names of TOD control registers
-//are placed in regStatList cannot be changed independently. Position of string
-//in the array must match value of corresponding enum member e.g. value
-//of MPCR is zero, therefore, TOD_MPCR can only be placed at regStatList[0]
-
-// following enum represents all the tod control and status registers which
-// need to be read and restored during data parity errors.
-
-enum Register
-{
- MPCR = 0,
- PCRP0 = 1,
- PCRP1 = 2,
- SCRP0 = 3,
- SCRP1 = 4,
- SPCR = 5,
- IPCR = 6,
- PSMSCR = 7,
- LAST_TOD_REG = 8,
-};
-
/**
* @brief summarizes error analysis for a TOD topology.
*/
@@ -77,12 +55,11 @@ struct TodErrorSummary
{
#if __BYTE_ORDER == __LITTLE_ENDIAN
- uint32_t reserved :13;
+ uint32_t reserved :17;
uint32_t backUpTopologyMastPath :2; //master path for backup topology
uint32_t activeTopologyMastPath :2; // master path for active topology
uint32_t backUpTopologySummary :3; // backup topology error status
uint32_t activeTopologySummary :3; // active topology error status
- uint32_t todOscCnt :4; // functional TOD OSC count in system
uint32_t activeTopology :1; // Topology acting as Active
uint32_t topologyResetRequested :1; // topology reset request status
uint32_t topologySwitchByPhyp :1; // topology switch event detected
@@ -96,12 +73,11 @@ struct TodErrorSummary
uint32_t topologySwitchByPhyp :1; // topology switch event detected
uint32_t topologyResetRequested :1; // topology reset request status
uint32_t activeTopology :1; // Topology acting as Active
- uint32_t todOscCnt :4; // functional TOD OSC count in system
uint32_t activeTopologySummary :3; // active topology error status
uint32_t backUpTopologySummary :3; // backup topology error status
uint32_t activeTopologyMastPath :2; // master path for active topology
uint32_t backUpTopologyMastPath :2; //master path for backup topology
- uint32_t reserved :13;
+ uint32_t reserved :17;
#endif
uint32_t activeMdmt; // HUID of active mdmt
diff --git a/src/usr/diag/prdf/common/plugins/prdfProcLogParse.C b/src/usr/diag/prdf/common/plugins/prdfProcLogParse.C
index 461704ab9..d33792b77 100644
--- a/src/usr/diag/prdf/common/plugins/prdfProcLogParse.C
+++ b/src/usr/diag/prdf/common/plugins/prdfProcLogParse.C
@@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
-/* Contributors Listed Below - COPYRIGHT 2014,2017 */
+/* Contributors Listed Below - COPYRIGHT 2014,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
@@ -51,44 +51,6 @@ using namespace PARSER;
using namespace TOD;
//------------------------------------------------------------------------------
-
-bool parseSlwFfdcData( uint8_t * i_buffer, uint32_t i_buflen,
- ErrlUsrParser & i_parser )
-{
- char hdr[HEADER_SIZE] = "";
- char data[DATA_SIZE] = "";
-
- snprintf( hdr, HEADER_SIZE, " %s", SLW_FFDC_DATA::title );
- i_parser.PrintString( hdr, "" );
-
- const size_t sz_word = sizeof(uint32_t);
-
- uint32_t idx = 0;
- while ( idx + SLW_FFDC_DATA::ENTRY_SIZE < i_buflen )
- {
- uint32_t addr, val0, val1;
-
- memcpy( &addr, &i_buffer[idx ], sz_word );
- memcpy( &val0, &i_buffer[idx+(1*sz_word)], sz_word );
- memcpy( &val1, &i_buffer[idx+(2*sz_word)], sz_word );
-
- addr = htonl(addr);
- val0 = htonl(val0);
- val1 = htonl(val1);
-
- snprintf(hdr, HEADER_SIZE, " Address: 0x%08x", addr );
- snprintf(data, DATA_SIZE, "Value: 0x%08x 0x%08x", val0, val1 );
-
- i_parser.PrintString( hdr, data );
-
- idx += SLW_FFDC_DATA::ENTRY_SIZE;
- }
-
- return true;
-}
-
-//------------------------------------------------------------------------------
-
bool parseTodFfdcData( uint8_t * i_buffer, uint32_t i_buflen,
ErrlUsrParser & i_parser )
{
@@ -131,9 +93,6 @@ bool parseTodFfdcData( uint8_t * i_buffer, uint32_t i_buflen,
errorData.activeTopology ?
"Primary Config" : "Secondary Config" );
- i_parser.PrintNumber( "Functional TOD Osc", "0x%08x",
- errorData.todOscCnt );
-
snprintf(data, DATA_SIZE, "0x%08x", errorData.activeMdmt );
i_parser.PrintString( "Active MDMT", data );
diff --git a/src/usr/diag/prdf/plat/prdfPlatServices_rt.C b/src/usr/diag/prdf/plat/prdfPlatServices_rt.C
index 05dcfd265..0a504c5ea 100644
--- a/src/usr/diag/prdf/plat/prdfPlatServices_rt.C
+++ b/src/usr/diag/prdf/plat/prdfPlatServices_rt.C
@@ -48,6 +48,7 @@
#include <p9_proc_gettracearray.H>
#include <pm_common_ext.H>
#include <p9_stop_api.H>
+#include <rt_todintf.H>
//------------------------------------------------------------------------------
@@ -482,6 +483,76 @@ int32_t pmCallout( TargetHandle_t i_tgt,
o_deadCores = (uint32_t) deadCores;
return SUCCESS;
}
+
+// Calls TOD::resetBackupTopology with the given arguments when there is at
+// least one bad chip or a proc/osc target to avoid; a returned error log is
+// traced and committed. With nothing to avoid, only an error is traced.
+void requestNewTODTopology( uint32_t i_oscPos,
+                            const TargetHandle_t& i_procOscTgt,
+                            const TargetHandleList& i_badChipList,
+                            bool i_informPhyp)
+{
+    #define PRDF_FUNC "[PlatServices::requestNewTODTopology] "
+
+    const bool l_nothingToAvoid =
+        ( 0 == i_badChipList.size() ) && ( NULL == i_procOscTgt );
+
+    if ( l_nothingToAvoid )
+    {
+        PRDF_ERR( PRDF_FUNC "No chips in black list");
+    }
+    else
+    {
+        errlHndl_t l_errl = TOD::resetBackupTopology( i_oscPos, i_procOscTgt,
+                                        i_badChipList, i_informPhyp );
+
+        if ( nullptr != l_errl )
+        {
+            PRDF_ERR( PRDF_FUNC " failed. oscPos: %d "
+                      "oscTgt: 0x%08x, chip blacklist size: %d",
+                      i_oscPos, getHuid(i_procOscTgt), i_badChipList.size() );
+            PRDF_COMMIT_ERRL( l_errl, ERRL_ACTION_REPORT );
+        }
+    }
+
+    #undef PRDF_FUNC
+}
+
+// Looks up the TOD register data returned by TOD::readTodProcDataFromFile for
+// the chip whose chipID equals the ATTR_ORDINAL_ID of i_procTgt and returns
+// one of its registers in o_regValue. Returns SUCCESS when the chip was
+// found; FAIL when the data could not be read or holds no matching chip
+// (o_regValue is left untouched in both failure cases).
+int32_t getTodPortControlReg ( const TARGETING::TargetHandle_t& i_procTgt,
+                               bool i_slvPath0, uint32_t &o_regValue )
+{
+    #define PRDF_FUNC "[PlatServices::getTodPortControlReg] "
+
+    int32_t l_rc = FAIL;
+    TOD::TodChipDataContainer l_todData;
+    const uint32_t l_ordinalId = i_procTgt->getAttr<ATTR_ORDINAL_ID>();
+
+    errlHndl_t l_errl = TOD::readTodProcDataFromFile( l_todData );
+    if ( l_errl )
+    {
+        PRDF_ERR( PRDF_FUNC"failed to get TOD reg data from hwsv. "
+                  "i_procTgt 0x%08x", getHuid(i_procTgt) );
+        PRDF_COMMIT_ERRL( l_errl, ERRL_ACTION_REPORT );
+    }
+    else
+    {
+        for ( auto &l_entry : l_todData )
+        {
+            if ( l_entry.header.chipID != l_ordinalId )
+            {
+                continue;
+            }
+
+            // NOTE(review): slave path 0 reads pcrp0 while slave path 1
+            // reads scrp1 - confirm this register pairing is intended.
+            o_regValue = i_slvPath0 ? l_entry.regs.pcrp0
+                                    : l_entry.regs.scrp1;
+            l_rc = SUCCESS;
+            break;
+        }
+
+        if ( SUCCESS != l_rc )
+        {
+            PRDF_ERR( PRDF_FUNC"Could not find TOD chip Data for "
+                      "i_procTgt 0x%08x with ordId %d",
+                      getHuid(i_procTgt), l_ordinalId );
+        }
+    }
+
+    return l_rc;
+    #undef PRDF_FUNC
+}
//------------------------------------------------------------------------------
} // end namespace PlatServices
diff --git a/src/usr/diag/prdf/plat/prdfPlatServices_rt.H b/src/usr/diag/prdf/plat/prdfPlatServices_rt.H
index 17f8be9e1..90b24b2c2 100644
--- a/src/usr/diag/prdf/plat/prdfPlatServices_rt.H
+++ b/src/usr/diag/prdf/plat/prdfPlatServices_rt.H
@@ -154,6 +154,30 @@ int32_t pmCallout( TARGETING::TargetHandle_t i_tgt,
std::vector < StopErrLogSectn >& o_ffdcList );
+/**
+ * @brief Requests HWSV to create a new back up topology due to TOD errors
+ * @param i_oscPos Position of failing OSC. 0xFFFFFFFF if not used.
+ * @param i_procOscTgt Proc chip target associated with failing OSC
+ * @param i_badChipList list of bad chips to be avoided for MDMT
+ * @param i_informPhyp Inform PHYP to disable back up topology.
+ */
+void requestNewTODTopology( uint32_t i_oscPos,
+ const TARGETING::TargetHandle_t& i_procOscTgt,
+ const TARGETING::TargetHandleList& i_badChipList,
+ bool i_informPhyp);
+
+/**
+ * @brief Retrieves a saved version of a TOD config register from HWSV.
+ * We use the saved value rather than the live version because this
+ * could have changed in response to the TOD fault we're currently
+ * analyzing.
+ * @param i_procTgt Proc chip target
+ * @param i_slvPath0 Whether we need the register for slave path 0 or 1
+ * @param o_regValue Returns contents of the register
+ * @return non-SUCCESS for failure, SUCCESS otherwise
+ */
+int32_t getTodPortControlReg ( const TARGETING::TargetHandle_t& i_procTgt,
+ bool i_slvPath0, uint32_t &o_regValue );
} // end namespace PlatServices
} // end namespace PRDF
diff --git a/src/usr/diag/prdf/prdf_hb_only.mk b/src/usr/diag/prdf/prdf_hb_only.mk
index 7926b8ece..73e9eaec5 100644
--- a/src/usr/diag/prdf/prdf_hb_only.mk
+++ b/src/usr/diag/prdf/prdf_hb_only.mk
@@ -56,6 +56,8 @@ prd_incpath += ${ROOTPATH}/src/include/usr/fapi2
prd_incpath += ${ROOTPATH}/src/include/usr/ibscom
prd_incpath += ${ROOTPATH}/src/include/usr/util
prd_incpath += ${ROOTPATH}/src/include/usr/isteps/pm/
+prd_incpath += ${ROOTPATH}/src/include/usr/isteps/tod/
+prd_incpath += ${ROOTPATH}/src/include/usr/isteps/tod/runtime/
prd_incpath += ${ROOTPATH}/src/import/chips/centaur/common/include
prd_incpath += ${ROOTPATH}/src/import/chips/centaur/procedures/hwp/memory
prd_incpath += ${ROOTPATH}/src/import/chips/centaur/procedures/hwp/memory/lib/shared
OpenPOWER on IntegriCloud