diff options
author | Yue Du <daviddu@us.ibm.com> | 2017-09-06 10:58:50 -0500 |
---|---|---|
committer | Joshua Hunsberger <jahunsbe@us.ibm.com> | 2017-10-23 19:20:11 -0500 |
commit | 0be08fe5de5f5924f6665282153774d7605492cf (patch) | |
tree | ae75b37db00735ef794e86c0d4abfd4548677d3a /import | |
parent | e467315a84939ffaa32f3cf452364b017caad480 (diff) | |
download | talos-hcode-0be08fe5de5f5924f6665282153774d7605492cf.tar.gz talos-hcode-0be08fe5de5f5924f6665282153774d7605492cf.zip |
STOP: Core livelock buster
Using the FIT timer, periodically quiesce both cores to avoid
a livelock between the two active cores.
1) Mask SCOM RC=4 on the write to direct controls to stop the core
2) If the cores don't quiesce, abort the quiesce attempt and restart the cores
3) Use the 32ns timebase to abort the quiesce attempt after 200us
4) Add quiesce abort count to CME_Record
5) Make the abort threshold (in timer ticks) a #define
Change-Id: Idd50c7535bf343d7a8c1b7fb2ba0374349df5082
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45708
Reviewed-by: Michael S. Floyd <mfloyd@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Juan R. Medina <jrmedina@us.ibm.com>
Reviewed-by: Gregory S. Still <stillgs@us.ibm.com>
Diffstat (limited to 'import')
5 files changed, 62 insertions, 28 deletions
diff --git a/import/chips/p9/procedures/ppe_closed/cme/p9_cme.h b/import/chips/p9/procedures/ppe_closed/cme/p9_cme.h index 33172533..99ed7611 100644 --- a/import/chips/p9/procedures/ppe_closed/cme/p9_cme.h +++ b/import/chips/p9/procedures/ppe_closed/cme/p9_cme.h @@ -37,6 +37,7 @@ typedef struct uint32_t core_quiesce_time_latest; uint32_t core_quiesce_time_max; uint32_t core_quiesce_time_min; + uint32_t core_quiesce_failed_count; } CmeFitRecord; #endif diff --git a/import/chips/p9/procedures/ppe_closed/cme/p9_cme_iota_main.c b/import/chips/p9/procedures/ppe_closed/cme/p9_cme_iota_main.c index aca79d80..41db59f7 100644 --- a/import/chips/p9/procedures/ppe_closed/cme/p9_cme_iota_main.c +++ b/import/chips/p9/procedures/ppe_closed/cme/p9_cme_iota_main.c @@ -33,7 +33,7 @@ #ifdef PCQW_ENABLE -CmeRecord G_cme_record = {0, {0, 0, 0, 0, 0xFFFFFFFF}}; +CmeRecord G_cme_record = {0, {0, 0, 0, 0, 0xFFFFFFFF, 0}}; #else @@ -73,8 +73,9 @@ void periodic_core_quiesce_workaround() uint32_t time_stamp[2]; data64_t scom_data; uint32_t sample_error = 0; + uint32_t saved_msr = 0; - PK_TRACE_INF("FIT: Periodic Core Quiesce Workaround"); + PK_TRACE("FIT: Periodic Core Quiesce Workaround"); CME_GETSCOM_AND(CPPM_CPMMR, CME_MASK_BC, scom_data.value); fused_core_mode = scom_data.words.upper & BIT32(9); @@ -141,7 +142,19 @@ void periodic_core_quiesce_workaround() #endif - CME_PUTSCOM_NOP(DIRECT_CONTROLS, core, scom_data.value); + // The SCOM can be delayed by traffic on PC on the SPR bus, so it is possible + // to get a RC=4 (Address Error), which really indicates a timeout. 
Need to mask + // this return code and retry until we get a clean return code + saved_msr = mfmsr(); + mtmsr( saved_msr | MSR_SEM4); // Mask off timeout + + do + { + CME_PUTSCOM_NOP(DIRECT_CONTROLS, core, scom_data.value); + } + while ((mfmsr() & MSR_SIBRC) != 0); + + mtmsr(saved_msr); #if NIMBUS_DD_LEVEL == 20 || DISABLE_CME_DUAL_CAST == 1 @@ -168,15 +181,36 @@ void periodic_core_quiesce_workaround() #define THREAD_VECTOR_CHECK (THREAD_VECTOR>>1 | THREAD_VECTOR>>3) +// In a future release of this patch, it should be based on the Nest Frequency, but +// plumbing for that sill needs to be created. +// 200us in 32ns timer ticks +#define QUIESCE_ABORT_TICKS 0x186A + + // Poll on THREAD_QUIESCE, LSU_QUIESCE, and NEST_ACTIVE. + // If they do not quiesce in 200us abort the patch and restart the cores. do { CME_GETSCOM_AND(RAS_STATUS, core, scom_data.value); - } - while((((scom_data.words.upper& THREAD_VECTOR_CHECK) != THREAD_VECTOR_CHECK) || //THREAD_ and LSU_QUIESCE must be ones + time_stamp[1] = in32(CME_LCL_TBR); + + if (time_stamp[1] > time_stamp[0]) + { + G_cme_record.fit_record.core_quiesce_time_latest = + time_stamp[1] - time_stamp[0]; + } + else + { + G_cme_record.fit_record.core_quiesce_time_latest = + 0xFFFFFFFF - time_stamp[0] + time_stamp[1] + 1; + } + } + while((((scom_data.words.upper& THREAD_VECTOR_CHECK) != THREAD_VECTOR_CHECK) + || //THREAD_ and LSU_QUIESCE must be ones ((scom_data.words.lower& BIT64SH(32)))) // NEST_ACTIVE must be zero && !(sample_error = bad_error_present) + && (G_cme_record.fit_record.core_quiesce_time_latest < QUIESCE_ABORT_TICKS) // 200us in 32ns timer ticks ); #if NIMBUS_DD_LEVEL == 20 || DISABLE_CME_DUAL_CAST == 1 @@ -185,15 +219,15 @@ void periodic_core_quiesce_workaround() #endif - time_stamp[1] = in32(CME_LCL_TBR); - - if (!sample_error) + if (!sample_error && (G_cme_record.fit_record.core_quiesce_time_latest < QUIESCE_ABORT_TICKS) ) { PK_TRACE("FIT: Both Cores Quiesced"); } else { - PK_TRACE_INF("FIT: Error while trying to 
Quiesce Cores"); + PK_TRACE_INF("FIT: Error while trying to Quiesce Cores. Bad Error %d, QuiesceTime (ns) %d", sample_error, + (G_cme_record.fit_record.core_quiesce_time_latest << 5)); + G_cme_record.fit_record.core_quiesce_failed_count++; } @@ -245,6 +279,7 @@ void periodic_core_quiesce_workaround() scom_data.words.upper = (THREAD_VECTOR & (~maint_mode[core & 1]) & (~spattn[core & 1])) >> 3; CME_PUTSCOM_NOP(DIRECT_CONTROLS, core, scom_data.value); + } PK_TRACE("FIT: Both Cores Started"); @@ -271,16 +306,7 @@ void periodic_core_quiesce_workaround() //Profile time - if (time_stamp[1] > time_stamp[0]) - { - G_cme_record.fit_record.core_quiesce_time_latest = - time_stamp[1] - time_stamp[0]; - } - else - { - G_cme_record.fit_record.core_quiesce_time_latest = - 0xFFFFFFFF - time_stamp[0] + time_stamp[1] + 1; - } + // timestamp delta was computed above to handle the abort case if (G_cme_record.fit_record.core_quiesce_time_latest < G_cme_record.fit_record.core_quiesce_time_min) @@ -305,7 +331,7 @@ void fit_handler() #ifdef PCQW_ENABLE - uint32_t core_quiesce_cpmmr_disable = 0; + uint32_t core_quiesce_cpmmr_disable; uint32_t core; uint32_t scom_op; data64_t scom_data; @@ -327,11 +353,7 @@ void fit_handler() #endif CME_GETSCOM_OP(CPPM_CPMMR, core, scom_op, scom_data.value); - - if (scom_data.words.upper & BIT32(2)) - { - core_quiesce_cpmmr_disable = 1; - } + core_quiesce_cpmmr_disable = scom_data.words.upper & BIT32(2); #if NIMBUS_DD_LEVEL == 20 || DISABLE_CME_DUAL_CAST == 1 @@ -357,7 +379,7 @@ void fit_handler() // 4) both core doesnt have cpmmr[2] asserted // 5) no bad error occurs if((G_cme_record.core_enabled == CME_MASK_BC) && - (G_cme_stop_record.core_running == CME_MASK_BC) && + ((in32_sh(CME_LCL_SISR) & BITS64SH(46, 2)) == BITS64SH(46, 2)) && (!(in32(CME_LCL_SISR) & BITS32(16, 2))) && (!core_quiesce_cpmmr_disable) && (!bad_error_present)) diff --git a/import/chips/p9/procedures/ppe_closed/cme/pk_app_cfg.h b/import/chips/p9/procedures/ppe_closed/cme/pk_app_cfg.h 
index 1aaf752b..feb15e3c 100644 --- a/import/chips/p9/procedures/ppe_closed/cme/pk_app_cfg.h +++ b/import/chips/p9/procedures/ppe_closed/cme/pk_app_cfg.h @@ -65,7 +65,7 @@ // -------------------- // override swtich for NDD20/21/CDD10 workaround -#define DISABLE_PERIODIC_CORE_QUIESCE 1 +#define DISABLE_PERIODIC_CORE_QUIESCE 0 #if !DISABLE_PERIODIC_CORE_QUIESCE && (NIMBUS_DD_LEVEL == 20 || NIMBUS_DD_LEVEL == 21 || CUMULUS_DD_LEVEL == 10) #define PCQW_ENABLE @@ -100,7 +100,7 @@ #define HW405292_NDD1_PCBMUX_SAVIOR 1 #define RUN_NDD1_ABIST_IN_PARALLEL_MODE 1 #define MASK_MSR_SEM6 - #define USE_CME_VDM_FEATURE + #undef USE_CME_VDM_FEATURE #undef USE_CME_RESCLK_FEATURE #endif diff --git a/import/chips/p9/procedures/ppe_closed/cme/stop_cme/p9_cme_stop.h b/import/chips/p9/procedures/ppe_closed/cme/stop_cme/p9_cme_stop.h index e35ee298..2a8cac46 100644 --- a/import/chips/p9/procedures/ppe_closed/cme/stop_cme/p9_cme_stop.h +++ b/import/chips/p9/procedures/ppe_closed/cme/stop_cme/p9_cme_stop.h @@ -84,6 +84,9 @@ #endif /// handcoded addresses TO BE REMOVED + +#define PSCOM_MODE_REG 0x20010000 + #define CORE_FIRMASK 0x20010A43 #define CORE_ACTION0 0x20010A46 #define CORE_ACTION1 0x20010A47 diff --git a/import/chips/p9/procedures/ppe_closed/cme/stop_cme/p9_cme_stop_entry.c b/import/chips/p9/procedures/ppe_closed/cme/stop_cme/p9_cme_stop_entry.c index 2b7dced3..d9023556 100755 --- a/import/chips/p9/procedures/ppe_closed/cme/stop_cme/p9_cme_stop_entry.c +++ b/import/chips/p9/procedures/ppe_closed/cme/stop_cme/p9_cme_stop_entry.c @@ -555,6 +555,8 @@ p9_cme_stop_entry() // Permanent workaround for HW407385 + wrteei(0); + PK_TRACE("HW407385: Assert block interrupt to PC via SICR[2/3]"); out32(CME_LCL_SICR_OR, core << SHIFT32(3)); @@ -567,6 +569,8 @@ p9_cme_stop_entry() while((in32(CME_LCL_EINR)) & (core << SHIFT32(21))); + wrteei(1); + // end of HW407385 #if HW402407_NDD1_TLBIE_STOP_WORKAROUND @@ -710,6 +714,8 @@ p9_cme_stop_entry() // Permanent workaround for HW407385 + wrteei(0); + 
PK_TRACE("HW407385: Drop pm_exit via SICR[4/5]"); out32(CME_LCL_SICR_CLR, core << SHIFT32(5)); @@ -723,6 +729,8 @@ p9_cme_stop_entry() PK_TRACE("HW407385: Drop block interrupt to PC via SICR[2/3]"); out32(CME_LCL_SICR_CLR, core << SHIFT32(3)); + wrteei(1); + // end of HW407385 //========================== |