From 1096b337782a74a81b1ed1660a8731da01ebfa68 Mon Sep 17 00:00:00 2001 From: Rahul Batra Date: Mon, 29 Jul 2019 13:13:39 -0400 Subject: PM: Fix DB0 Hang Key_Cronus_Test=PM_REGRESS Change-Id: I706ec7b87e777b736153d5765ced0a3f6cea5d96 CQ: SW470688 Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/81266 Tested-by: Jenkins Server Tested-by: PPE CI Tested-by: Cronus HW CI Tested-by: Hostboot CI Tested-by: FSP CI Jenkins Tested-by: HWSV CI Reviewed-by: YUE DU Reviewed-by: RANGANATHPRASAD G. BRAHMASAMUDRA Reviewed-by: Jennifer A Stofer --- .../chips/p9/procedures/hwp/lib/p9_pm_hcd_flags.h | 1 + import/chips/p9/procedures/ppe/iota/iota_uih.c | 81 ++++++++++++++++++++-- .../procedures/ppe_closed/cme/p9_cme_iota_main.c | 6 +- .../p9/procedures/ppe_closed/cme/p9_cme_irq.h | 15 +++- .../ppe_closed/cme/pstate_cme/p9_cme_intercme.c | 8 ++- .../ppe_closed/cme/pstate_cme/p9_cme_thread_db.c | 75 +++++++++++++++++++- .../ppe_closed/pgpe/pstate_gpe/p9_pgpe_pstate.c | 2 +- 7 files changed, 175 insertions(+), 13 deletions(-) (limited to 'import') diff --git a/import/chips/p9/procedures/hwp/lib/p9_pm_hcd_flags.h b/import/chips/p9/procedures/hwp/lib/p9_pm_hcd_flags.h index fcc74dbd..1b940919 100644 --- a/import/chips/p9/procedures/hwp/lib/p9_pm_hcd_flags.h +++ b/import/chips/p9/procedures/hwp/lib/p9_pm_hcd_flags.h @@ -129,6 +129,7 @@ enum PM_CME_FLAGS_DEFS CME_FLAGS_DROOP_SUSPEND_ENTRY = 14, CME_FLAGS_SAFE_MODE = 16, CME_FLAGS_PSTATES_SUSPENDED = 17, + CME_FLAGS_DB0_COMM_RECV_STARVATION_CNT_ENABLED = 18, CME_FLAGS_SPWU_CHECK_ENABLE = 22, CME_FLAGS_BLOCK_ENTRY_STOP11 = 23, CME_FLAGS_PSTATES_ENABLED = 24, diff --git a/import/chips/p9/procedures/ppe/iota/iota_uih.c b/import/chips/p9/procedures/ppe/iota/iota_uih.c index 546def67..28272661 100644 --- a/import/chips/p9/procedures/ppe/iota/iota_uih.c +++ b/import/chips/p9/procedures/ppe/iota/iota_uih.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HCODE Project */ /* */ -/* COPYRIGHT 2017 */ +/* COPYRIGHT 2017,2019 */ /* [+] International Business 
Machines Corp. */ /* */ /* */ @@ -32,6 +32,9 @@ int g_eimr_stack_ctr = -1; uint64_t g_eimr_override_stack[IOTA_NUM_EXT_IRQ_PRIORITIES]; uint64_t g_eimr_override = 0x0000000000000000; uint64_t g_ext_irq_vector = 0; +uint32_t g_db0_pending_fit_tick_count = 0; +uint32_t g_comm_recv_pending_fit_tick_count = 0; +uint32_t g_intercme_in0_pending_tick_count = 0; // Unified IRQ priority and masking handler. // - Locates the highest priority IRQ task vector that has at least one of its @@ -51,13 +54,50 @@ uint32_t iota_uih(void) do { - if(ext_irq_vectors_cme[iPrtyLvl][IDX_PRTY_VEC] & g_ext_irq_vector) + //Note: Special handling of DB0/COMM_RECV to handle the db0/comm_recv + //starvation case. + // + //Reason: DB0(Quad Manager CME) and COMM_RECV(Sibling CME) are lower priority + //than the STOP related interrupts, + //and can stay pending for very long time(~ms scale) on systems with + //high frequency of STOP requests. This can then prevent PGPE from + //completing OCC directed IPC operations within the expected + //time bounds(< 8ms) + // + //Mechanism: + //1)In FIT: Every FIT tick, we check if DB0(on Quad manager)/COMM_RECV(on Sibling CME) + //is pending. If DB0(on Quad manager)/COMM_RECV(on Sibling CME) is seen pending for + //more than DB0_FIT_TICK_THRESHOLD/COMM_RECV_FIT_TICK_THRESHOLD FIT ticks, + //then we take action in UIH + // + //2)In UIH: We set priority level to IDX_PRTY_LVL_DB0/IDX_PRTY_LVL_COMM_RECVD, and mask + //everything except Priority 0(xstop, exceptions, etc). 
This then allows a + //pending DB0 to complete + if(g_db0_pending_fit_tick_count > DB0_FIT_TICK_THRESHOLD) + { + bFound = 1; + iPrtyLvl = IDX_PRTY_LVL_DB0; + break; + } + else if(g_comm_recv_pending_fit_tick_count > COMM_RECV_FIT_TICK_THRESHOLD) + { + bFound = 1; + iPrtyLvl = IDX_PRTY_LVL_COMM_RECVD; + break; + } + else if(g_intercme_in0_pending_tick_count > INTERCME_IN0_FIT_TICK_THRESHOLD) + { + bFound = 1; + iPrtyLvl = IDX_PRTY_LVL_INTERCME_IN0; + break; + } + else if(ext_irq_vectors_cme[iPrtyLvl][IDX_PRTY_VEC] & g_ext_irq_vector) { bFound = 1; break; } } - while(++iPrtyLvl < (IOTA_NUM_EXT_IRQ_PRIORITIES - 1)); //No need to check DISABLED. + while(++iPrtyLvl < (IOTA_NUM_EXT_IRQ_PRIORITIES - 1)); //No need to check DISABLED. // Only manipulate EIMR masks for task level prty levels. // Let shared non-task IRQs (iPrtyLvl=0) be processed by @@ -82,9 +122,38 @@ uint32_t iota_uih(void) } // 3. Write the new mask for this priority level. - out64(CME_LCL_EIMR, ext_irq_vectors_cme[iPrtyLvl][IDX_MASK_VEC] | - g_eimr_override); - + //Note: Special handling of DB0/COMM_RECV to handle the db0/comm_recv + //starvation case. + // + //Reason: DB0(Quad Manager CME) and COMM_RECV(Sibling CME) are lower priority + //than the STOP related interrupts, + //and can stay pending for very long time(~ms scale) on systems with + //high frequency of STOP requests. This can then prevent PGPE from + //completing OCC directed IPC operations within the expected + //time bounds(< 8ms) + // + //Mechanism: + //1)In FIT: Every FIT tick, we check if DB0(on Quad manager)/COMM_RECV(on Sibling CME) + //is pending. If DB0(on Quad manager)/COMM_RECV(on Sibling CME) is seen pending for + //more than DB0_FIT_TICK_THRESHOLD/COMM_RECV_FIT_TICK_THRESHOLD FIT ticks, + //then we take action in UIH + // + //2)In UIH: We set priority level to IDX_PRTY_LVL_DB0/IDX_PRTY_LVL_COMM_RECVD, and mask + //everything except Priority 0(xstop, exceptions, etc). 
This then allows a + //pending DB0 to complete + if ((g_db0_pending_fit_tick_count > DB0_FIT_TICK_THRESHOLD) || + (g_comm_recv_pending_fit_tick_count > COMM_RECV_FIT_TICK_THRESHOLD) || + (g_intercme_in0_pending_tick_count > INTERCME_IN0_FIT_TICK_THRESHOLD)) + { + PK_TRACE_INF("UIH: Starvation Detected. Overriding Mask!"); + out64(CME_LCL_EIMR, (ext_irq_vectors_cme[0][IDX_MASK_VEC] | + g_eimr_override)); + } + else + { + out64(CME_LCL_EIMR, ext_irq_vectors_cme[iPrtyLvl][IDX_MASK_VEC] | + g_eimr_override); + } } else { diff --git a/import/chips/p9/procedures/ppe_closed/cme/p9_cme_iota_main.c b/import/chips/p9/procedures/ppe_closed/cme/p9_cme_iota_main.c index f9632f45..0818517e 100644 --- a/import/chips/p9/procedures/ppe_closed/cme/p9_cme_iota_main.c +++ b/import/chips/p9/procedures/ppe_closed/cme/p9_cme_iota_main.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HCODE Project */ /* */ -/* COPYRIGHT 2017,2018 */ +/* COPYRIGHT 2017,2019 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -46,6 +46,8 @@ CmeFitRecord G_cme_fit_record = {0, 0, 0, 0, 0xFFFFFFFF, 0}; #endif +void p9_cme_pstate_db0_comm_recv_intercme_in0_pending_counter(); + uint32_t G_CME_LCL_EINR = CME_LCL_EINR; uint32_t G_CME_LCL_EISR = CME_LCL_EISR; uint32_t G_CME_LCL_EISR_CLR = CME_LCL_EISR_CLR; @@ -102,6 +104,8 @@ void fit_handler() p9_cme_core_livelock_buster(); #endif + //Handle DB0/Comm_Recv starvation case + p9_cme_pstate_db0_comm_recv_intercme_in0_pending_counter(); } #endif //fit handler diff --git a/import/chips/p9/procedures/ppe_closed/cme/p9_cme_irq.h b/import/chips/p9/procedures/ppe_closed/cme/p9_cme_irq.h index c5a29801..170eb88f 100644 --- a/import/chips/p9/procedures/ppe_closed/cme/p9_cme_irq.h +++ b/import/chips/p9/procedures/ppe_closed/cme/p9_cme_irq.h @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HCODE Project */ /* */ -/* COPYRIGHT 2015,2018 */ +/* COPYRIGHT 2015,2019 */ /* [+] International Business Machines Corp. 
*/ /* */ /* */ @@ -48,6 +48,19 @@ #include +//CME_TSEL is set to 8 which means FIT has period of 1.04ms when +//Nest Freq is 2000MHz. Ideally, should calculate period of FIT based +//on nest frequency, but nest frequency is NOT plumbed to CME and we +//don't need to be highly accurate here. +//Note, from PGPE perspective, the latency of the DB0 operation depends +//on the amount of time DB0 is pending on Quad Manager plus COMM_RECV is pending +//on sibling. This is because COMM_RECV interrupt is triggered by the DB0 +//handler on the quad manager. Therefore, we must set the COMM_RECV_FIT_TICK_THRESHOLD +//to be smaller. +#define DB0_FIT_TICK_THRESHOLD 1 //Threshold for DB0 pending count(2ms) +#define COMM_RECV_FIT_TICK_THRESHOLD 1 //Threshold for COMM_RECV pending count(2ms) +#define INTERCME_IN0_FIT_TICK_THRESHOLD 1 //Threshold for INTERCME_IN0 pending count(2ms) + // Priority Levels #define IDX_PRTY_LVL_HIPRTY 0 #define IDX_PRTY_LVL_DB3 1 diff --git a/import/chips/p9/procedures/ppe_closed/cme/pstate_cme/p9_cme_intercme.c b/import/chips/p9/procedures/ppe_closed/cme/pstate_cme/p9_cme_intercme.c index e802448e..9497442c 100644 --- a/import/chips/p9/procedures/ppe_closed/cme/pstate_cme/p9_cme_intercme.c +++ b/import/chips/p9/procedures/ppe_closed/cme/pstate_cme/p9_cme_intercme.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HCODE Project */ /* */ -/* COPYRIGHT 2016,2018 */ +/* COPYRIGHT 2016,2019 */ /* [+] International Business Machines Corp.
*/ /* */ /* */ @@ -43,17 +43,21 @@ // extern CmePstateRecord G_cme_pstate_record; extern CmeRecord G_cme_record; +extern uint32_t g_comm_recv_pending_fit_tick_count; +extern uint32_t g_intercme_in0_pending_tick_count; // //InterCME_IN0 handler // void p9_cme_pstate_intercme_in0_irq_handler(void) { + g_intercme_in0_pending_tick_count = 0; p9_cme_pstate_process_db0_sibling(); } void p9_cme_pstate_intercme_msg_handler(void) { + g_comm_recv_pending_fit_tick_count = 0; p9_cme_pstate_sibling_lock_and_intercme_protocol(INTERCME_MSG_LOCK_WAIT_ON_RECV); } @@ -119,6 +123,8 @@ void p9_cme_pstate_process_db0_sibling() //Unmask EIMR[OCC_HEARTBEAT_LOST/4] g_eimr_override &= ~BIT64(4); + out32(G_CME_LCL_FLAGS_OR, BIT32(CME_FLAGS_DB0_COMM_RECV_STARVATION_CNT_ENABLED));//Set Starvation Count enabled + //Clear Core GPMMR RESET_STATE_INDICATOR bit to show pstates have started CME_PUTSCOM(PPM_GPMMR_CLR, G_cme_record.core_enabled, BIT64(15)); } diff --git a/import/chips/p9/procedures/ppe_closed/cme/pstate_cme/p9_cme_thread_db.c b/import/chips/p9/procedures/ppe_closed/cme/pstate_cme/p9_cme_thread_db.c index 0c3f8cde..cba9f9e8 100644 --- a/import/chips/p9/procedures/ppe_closed/cme/pstate_cme/p9_cme_thread_db.c +++ b/import/chips/p9/procedures/ppe_closed/cme/pstate_cme/p9_cme_thread_db.c @@ -56,6 +56,9 @@ extern CmePstateRecord G_cme_pstate_record; extern cmeHeader_t* G_cmeHeader; extern LocalPstateParmBlock* G_lppb; extern uint8_t G_vdm_threshold_table[]; +extern uint32_t g_db0_pending_fit_tick_count; +extern uint32_t g_comm_recv_pending_fit_tick_count; +extern uint32_t g_intercme_in0_pending_tick_count; cppm_cmedb0_t G_dbData; @@ -75,6 +78,7 @@ void p9_cme_pstate_db0_start(); void p9_cme_pstate_db0_glb_bcast(); void p9_cme_pstate_db0_clip_bcast(); void p9_cme_pstate_update(); +void p9_cme_pstate_db0_comm_recv_intercme_in0_pending_counter(); // //Doorbell0 interrupt handler @@ -166,6 +170,67 @@ void p9_cme_pstate_db0_handler(void) g_eimr_override &= ~BIT64(4); } +// +//Doorbell0/Comm 
Recv pending counter(called every FIT tick) +// +void p9_cme_pstate_db0_comm_recv_intercme_in0_pending_counter() +{ + //Note: Special handling of DB0/COMM_RECV to handle the db0/comm_recv + //starvation case. + // + //Reason: DB0(Quad Manager CME) and COMM_RECV(Sibling CME) are lower priority + //than the STOP related interrupts, + //and can stay pending for very long time(~ms scale) on systems with + //high frequency of STOP requests. This can then prevent PGPE from + //completing OCC directed IPC operations within the expected + //time bounds(< 8ms) + // + //Mechanism: + //1)In FIT: Every FIT tick, we check if DB0(on Quad manager)/COMM_RECV(on Sibling CME) + //is pending. If DB0(on Quad manager)/COMM_RECV(on Sibling CME) is seen pending for + //more than DB0_FIT_TICK_THRESHOLD/COMM_RECV_FIT_TICK_THRESHOLD FIT ticks, + //then we take action in UIH + // + //2)In UIH: We set priority level to IDX_PRTY_LVL_DB0/IDX_PRTY_LVL_COMM_RECVD, and mask + //everything except Priority 0(xstop, exceptions, etc). 
This then allows a + //pending DB0 to complete + uint32_t cme_flags = in32(G_CME_LCL_FLAGS); + + if (cme_flags & BIT32(CME_FLAGS_DB0_COMM_RECV_STARVATION_CNT_ENABLED)) + { + if(G_cme_pstate_record.qmFlag) + { + + if (cme_flags & BIT32(CME_FLAGS_CORE0_GOOD)) + { + if (in32_sh(CME_LCL_EISR) & BIT64SH(36)) + { + g_db0_pending_fit_tick_count++; + } + } + else + { + if (in32_sh(CME_LCL_EISR) & BIT64SH(37)) + { + g_db0_pending_fit_tick_count++; + } + } + } + else + { + if (in32(CME_LCL_EISR) & BIT32(29)) + { + g_comm_recv_pending_fit_tick_count++; + } + + if(in32(CME_LCL_EISR) & BIT32(7)) + { + g_intercme_in0_pending_tick_count++; + } + } + } +} + // //Doorbell3 interrupt handler // @@ -660,6 +725,9 @@ void p9_cme_pstate_process_db0() G_cme_pstate_record.updateAnalogError = 0; uint64_t scom_data; + //Clear out db0_pending_tick_count + g_db0_pending_fit_tick_count = 0; + PK_TRACE_INF("PSTATE: Process DB0 Enter"); //Clear EISR and read DB0 register @@ -856,7 +924,7 @@ inline void p9_cme_pstate_register() } } - PK_TRACE_INF("PSTATE: Sib Register MsgCnt=%d", msgCnt); + PK_TRACE_DBG("PSTATE: Sib Register MsgCnt=%d", msgCnt); } } } @@ -894,6 +962,7 @@ void p9_cme_pstate_db0_start() ack = MSGID_PCB_TYPE4_ACK_PSTATE_PROTO_ACK; out32(G_CME_LCL_FLAGS_OR, BIT32(24));//Set Pstates Enabled + out32(G_CME_LCL_FLAGS_OR, BIT32(CME_FLAGS_DB0_COMM_RECV_STARVATION_CNT_ENABLED));//Set Starvation Count enabled //Enable PMCR Interrupts (for good cores) when this task is done g_eimr_override &= ~(uint64_t)(G_cme_record.core_enabled << SHIFT64(35)); @@ -1035,7 +1104,7 @@ inline void p9_cme_pstate_db0_pmsr_updt() //Set Core GPMMR RESET_STATE_INDICATOR bit to show pstates have stopped CME_PUTSCOM(PPM_GPMMR_OR, G_cme_record.core_enabled, BIT64(15)); - PK_TRACE_INF("PSTATE: DB0 Safe Mode Exit"); + PK_TRACE_INF("PSTATE: DB0 PMSR Updt Exit"); } void p9_cme_pstate_notify_sib(INTERCME_DIRECT_INTF intf) @@ -1058,7 +1127,7 @@ inline void p9_cme_pstate_freq_update(uint32_t cme_flags) else { 
PK_TRACE_INF("PSTATE: Freq Updt Enter"); - PK_TRACE_INF("PSTATE: Dpll0=0x%x", G_lppb->dpll_pstate0_value); + PK_TRACE_DBG("PSTATE: Dpll0=0x%x", G_lppb->dpll_pstate0_value); //Adjust DPLL qppm_dpll_freq_t dpllFreq; diff --git a/import/chips/p9/procedures/ppe_closed/pgpe/pstate_gpe/p9_pgpe_pstate.c b/import/chips/p9/procedures/ppe_closed/pgpe/pstate_gpe/p9_pgpe_pstate.c index 2ed5aa58..4d6348d9 100644 --- a/import/chips/p9/procedures/ppe_closed/pgpe/pstate_gpe/p9_pgpe_pstate.c +++ b/import/chips/p9/procedures/ppe_closed/pgpe/pstate_gpe/p9_pgpe_pstate.c @@ -2542,7 +2542,7 @@ void p9_pgpe_pstate_wov_init() G_pgpe_pstate_record.wov.avg_freq_gt_target_freq = 0; G_pgpe_pstate_record.wov.freq_loss_tenths_gt_max_droop_tenths = 0; G_pgpe_pstate_record.wov.status = WOV_DISABLED; - G_pgpe_pstate_record.wov.info = 0xdeadbeef; + G_pgpe_pstate_record.wov.info = 0xdeadde04; } // -- cgit v1.2.1