From 61cd385caa634b5d8d63d3a21138c25230227d89 Mon Sep 17 00:00:00 2001 From: Douglas Gilbert Date: Mon, 9 Apr 2018 11:49:10 -0500 Subject: OCC Centaur: Check for channel checkstop Change-Id: I2df9675d655b0391b249e49f7fc036788268e36c RTC: 191164 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57280 Tested-by: FSP CI Jenkins Reviewed-by: William A. Bryan Reviewed-by: Andres A. Lugo-Reyes Reviewed-by: Martha Broyles --- src/occ_405/cent/centaur_control.c | 9 ++++++-- src/occ_405/cent/centaur_data.c | 45 +++++++++++++++++++++++++++++--------- src/occ_405/cent/centaur_data.h | 6 ++++- 3 files changed, 47 insertions(+), 13 deletions(-) (limited to 'src/occ_405') diff --git a/src/occ_405/cent/centaur_control.c b/src/occ_405/cent/centaur_control.c index e5f37ef..283a22b 100755 --- a/src/occ_405/cent/centaur_control.c +++ b/src/occ_405/cent/centaur_control.c @@ -491,9 +491,14 @@ bool check_centaur_checkstop(memory_control_task_t * i_memControlTask ) commitErrl(&l_err); - return TRUE; // a centaur channel checkstop error occured + return FALSE; // error was not a channel checkstop } - return FALSE; // No centaur channel checkstop errors + else + { + // Remove the centaur sensor and all dimm sensors behind it. + cent_chan_checkstop(cent); + } + return TRUE; // Centaur channel checkstop } diff --git a/src/occ_405/cent/centaur_data.c b/src/occ_405/cent/centaur_data.c index 902f2c7..158937f 100755 --- a/src/occ_405/cent/centaur_data.c +++ b/src/occ_405/cent/centaur_data.c @@ -192,9 +192,26 @@ uint8_t G_centaur_nest_lfir6 = 0; //number of SC polls to wait between i2c recovery attempts #define CENT_SC_MAX_INTERVAL 256 -//determine scom address of MCIFIR register for given Centaur n -#define MCS0_MCIFIR_N(n) \ - ( (n<4)? (MCS0_MCIFIR + ((MCS1_MCIFIR - MCS0_MCIFIR) * (n))) : (MCS4_MCIFIR + ((MCS5_MCIFIR - MCS4_MCIFIR) * (n-4))) ) +// There was a centaur channel checkstop, remove the centaur from the enabled bitmask. +void cent_chan_checkstop(uint32_t i_cent) +{ + if(CENTAUR_PRESENT(i_cent)) + { + //remove checkstopped centaur from presence bitmap + G_present_centaurs &= ~(CENTAUR_BY_MASK(i_cent)); + + // remove the dimm temperature sensors behind this centaur + G_dimm_enabled_sensors.bytes[i_cent] = 0; + + TRAC_IMP("Channel checkstop detected on Centaur[%d] G_present_centaurs[0x%08X]", + i_cent, + G_present_centaurs); + + TRAC_IMP("Updated bitmap of enabled dimm temperature sensors: 0x%08X %08X", + G_dimm_enabled_sensors.words[0], + G_dimm_enabled_sensors.words[1]); + } +} void cent_recovery(uint32_t i_cent) { @@ -255,9 +272,13 @@ void cent_recovery(uint32_t i_cent) G_cent_scom_gpe_parms.error.rc) && (!(L_cent_callouts & l_cent_mask))) { - // Check if the centaur has a channel checkstop. If it does, then do not - // log any errors - if(G_cent_scom_gpe_parms.error.rc != CENTAUR_CHANNEL_CHECKSTOP) + // Check if the centaur has a channel checkstop. If it does then remove the centaur + // from the enabled sensor bit map and do not log any errors + if(G_cent_scom_gpe_parms.error.rc == CENTAUR_CHANNEL_CHECKSTOP) + { + cent_chan_checkstop(l_prev_cent); + } + else // Make error log for inband scom errors { //Mark the centaur as being called out L_cent_callouts |= l_cent_mask; @@ -641,9 +662,13 @@ void centaur_data( void ) //(as long as the request was scheduled). if(!async_request_completed(&l_centaur_data_ptr->gpe_req.request) || l_parms->error.rc ) { - // Check if the centaur has a channel checkstop. If it does, then do not - // log any errors - if(G_cent_scom_gpe_parms.error.rc != CENTAUR_CHANNEL_CHECKSTOP) + // Check if the centaur has a channel checkstop. If it does then do not + // log any errors, but remove the centaur from the config + if(l_parms->error.rc == CENTAUR_CHANNEL_CHECKSTOP) + { + cent_chan_checkstop(l_centaur_data_ptr->prev_centaur); + } + else // log the error if it was not a CENTAUR_CHANNEL_CHECKSTOP { //log an error the first time this happens but keep on running. //eventually, we will timeout on the dimm & centaur temps not being updated @@ -868,7 +893,7 @@ void centaur_data( void ) } while(0); - //handle centaur i2c recovery requests and centaur workaround - Needed for P9?? + //handle centaur i2c recovery requests and centaur workaround. if(CENTAUR_PRESENT(l_centaur_data_ptr->current_centaur)) { cent_recovery(l_centaur_data_ptr->current_centaur); diff --git a/src/occ_405/cent/centaur_data.h b/src/occ_405/cent/centaur_data.h index 0208265..30c0e0d 100755 --- a/src/occ_405/cent/centaur_data.h +++ b/src/occ_405/cent/centaur_data.h @@ -176,7 +176,11 @@ void cent_recovery(uint32_t i_cent); //associated with the specified OCC centaur id. CentaurMemData * cent_get_centaur_data_ptr( const uint8_t i_centaur_id ); - +// Create the centaur configuration object uint32_t centaur_configuration_create( CentaurConfiguration_t * i_centaurConfiguration ); + +// Remove centaur from enabled sensor list due to channel checkstop +void cent_chan_checkstop(uint32_t i_cent); + #endif //_CENTAUR_DATA_H -- cgit v1.2.1