summary | refs | log | tree | commit | diff | stats
path: root/src/occ_405/proc
diff options
context:
space:
mode:
author mbroyles <mbroyles@us.ibm.com> 2017-11-01 08:44:03 -0500
committer Christopher J. Cain <cjcain@us.ibm.com> 2017-11-03 15:16:58 -0400
commit bb703e413c363255a578ffdb35e954a5b71de4c9 (patch)
tree 7c804d5d9b1b6853b832526cefdfa391e1bf3e48 /src/occ_405/proc
parent d90b1dcc95523b0ed98ca4fdf4cc6bc84ba40102 (diff)
download talos-occ-bb703e413c363255a578ffdb35e954a5b71de4c9.tar.gz
talos-occ-bb703e413c363255a578ffdb35e954a5b71de4c9.zip
Prevent logging 2A11 when quad is offline
Change-Id: Ic8cda9a6b8b311057ba2c4f0d9dc7e228e700c27
CQ: SW404469
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/49093
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Diffstat (limited to 'src/occ_405/proc')
-rwxr-xr-x src/occ_405/proc/proc_data.c 37
-rwxr-xr-x src/occ_405/proc/proc_data.h 21
2 files changed, 45 insertions, 13 deletions
diff --git a/src/occ_405/proc/proc_data.c b/src/occ_405/proc/proc_data.c
index 6f25118..e8e9eba 100755
--- a/src/occ_405/proc/proc_data.c
+++ b/src/occ_405/proc/proc_data.c
@@ -86,6 +86,9 @@ uint32_t G_updated_core_mask = 0;
// without error.
uint32_t G_empath_error_core_mask = 0;
+//AMEC needs to know cores that are offline
+uint32_t G_core_offline_mask = 0;
+
//Global G_present_cores is bitmask of all cores
//(1 = present, 0 = not present. Core 0 has the most significant bit)
uint32_t G_present_cores = 0;
@@ -128,6 +131,7 @@ void task_core_data( task_t * i_task )
CoreData * l_temp = NULL; // Used for pointer swapping
bulk_core_data_task_t * l_bulk_core_data_ptr = (bulk_core_data_task_t *)i_task->data_ptr;
ipc_core_data_parms_t * l_parms = (ipc_core_data_parms_t*)(l_bulk_core_data_ptr->gpe_req.cmd_data);
+ static uint32_t L_trace_core_failure = 0;
do
{
@@ -160,8 +164,8 @@ void task_core_data( task_t * i_task )
//A request is not considered complete until both the engine job
//has finished without error and any callback has run to completion.
- if( async_request_completed(&l_bulk_core_data_ptr->gpe_req.request)
- &&
+ if( async_request_completed(&l_bulk_core_data_ptr->gpe_req.request) &&
+ (l_parms->error.rc == 0) &&
CORE_PRESENT(l_bulk_core_data_ptr->current_core) )
{
//If the previous GPE request succeeded then swap core_data_ptr
@@ -184,7 +188,7 @@ void task_core_data( task_t * i_task )
//Core data has been collected so set the bit in global mask.
//AMEC code will know which cores to update sensors for. AMEC is
//responsible for clearing the bit later on.
- G_updated_core_mask |= CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core);
+ G_updated_core_mask |= (CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core));
// Presumptively clear the empath error mask
G_empath_error_core_mask &=
@@ -197,6 +201,24 @@ void task_core_data( task_t * i_task )
{
G_core_data_ptrs[l_bulk_core_data_ptr->current_core] = &G_core_data[MAX_NUM_FW_CORES+NUM_CORE_DATA_DOUBLE_BUF+NUM_CORE_DATA_EMPTY_BUF-1];
}
+ else if(l_parms->error.rc != 0)
+ {
+ // Check if failure is due to being offline (in stop 2 or greater)
+ if(l_parms->error.ffdc == PCB_ERROR_CHIPLET_OFFLINE)
+ {
+ // Mark core offline so it is ignored in control loops and to avoid health monitor logging error
+ G_core_offline_mask |= (CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core));
+ }
+ else if( !(L_trace_core_failure & (1 << l_bulk_core_data_ptr->current_core)) )
+ {
+ // trace error, if it continues health monitor will see and log error
+ INTR_TRAC_ERR("task_core_data: core %d data collection failed RC[0x%08X] FFDC[0x%08X%08X]",
+ l_bulk_core_data_ptr->current_core, l_parms->error.rc,
+ (uint32_t)(l_parms->error.ffdc >> 32),
+ (uint32_t)l_parms->error.ffdc);
+ L_trace_core_failure |= (1 << l_bulk_core_data_ptr->current_core);
+ }
+ }
//Update current core
if ( l_bulk_core_data_ptr->current_core >= l_bulk_core_data_ptr->end_core )
@@ -217,6 +239,8 @@ void task_core_data( task_t * i_task )
//1. Setup the get core data parms
l_parms->core_num = l_bulk_core_data_ptr->current_core;
l_parms->data = (CoreData*) l_bulk_core_data_ptr->core_data_ptr;
+ l_parms->error.error = 0; // default no error
+ l_parms->error.ffdc = 0;
// Static array to record the last timestamp a get_per_core_data task was
// scheduled for a core.
@@ -243,13 +267,6 @@ void task_core_data( task_t * i_task )
{
G_get_per_core_data_max_schedule_intervals[l_current_core] = l_elapsed_us;
}
- // Also sniff if the request has actually completed, it is checked above but
- // the schedule proceeds regardless which could be dangerous...
- if (!async_request_completed(&l_bulk_core_data_ptr->gpe_req.request))
- {
- INTR_TRAC_ERR("Async get_per_core_data task for core=%d not complete!",
- l_current_core);
- }
}
//2. Schedule the GPE Request to get the core data
diff --git a/src/occ_405/proc/proc_data.h b/src/occ_405/proc/proc_data.h
index ebef6d0..f8b7870 100755
--- a/src/occ_405/proc/proc_data.h
+++ b/src/occ_405/proc/proc_data.h
@@ -50,9 +50,10 @@
#define NUM_CORE_DATA_DOUBLE_BUF 2
#define NUM_CORE_DATA_EMPTY_BUF 1
-#define LO_CORES_MASK 0xfff00000
-#define HI_CORES_MASK 0x000fff00
-#define HW_CORES_MASK 0xffffff00
+#define LO_CORES_MASK 0xfff00000
+#define HI_CORES_MASK 0x000fff00
+#define HW_CORES_MASK 0xffffff00
+#define QUAD0_CORES_PRESENT_MASK 0xf0000000
enum eOccProcCores
{
@@ -103,6 +104,9 @@ extern uint32_t G_present_cores;
//AMEC needs to know when data for a core has been collected.
extern uint32_t G_updated_core_mask;
+//AMEC needs to know when a core is offline
+extern uint32_t G_core_offline_mask;
+
// External reference to empath error mask
extern uint32_t G_empath_error_core_mask;
@@ -120,6 +124,17 @@ extern bool G_nest_dts_data_valid;
#define CORE_EMPATH_ERROR(occ_core_id) \
((CORE0_PRESENT_MASK >> occ_core_id) & G_empath_error_core_mask)
+// Evaluates to non-zero (true) if the specified core's bit is set in
+// G_core_offline_mask. The argument is parenthesized so that an expression
+// argument (e.g. "a & b") is shifted as a whole.
+#define CORE_OFFLINE(occ_core_id) \
+ ((CORE0_PRESENT_MASK >> (occ_core_id)) & G_core_offline_mask)
+
+// Clears the specified core's bit in G_core_offline_mask. The replacement
+// list is fully parenthesized so it remains a single expression at any use.
+#define CLEAR_CORE_OFFLINE(occ_core_id) \
+ (G_core_offline_mask &= ~(CORE0_PRESENT_MASK >> (occ_core_id)))
+
+// Evaluates to non-zero (true) if the specified quad has at least 1 present
+// core that is not marked offline. The quad's 4-bit group is selected by
+// shifting the quad 0 mask right by 4 bits per quad id.
+#define QUAD_ONLINE(occ_quad_id) \
+ ( (QUAD0_CORES_PRESENT_MASK >> ((occ_quad_id)*4)) & ((~G_core_offline_mask) & G_present_cores) )
+
//Collect bulk core data for all cores in specified range
void task_core_data( task_t * i_task );
OpenPOWER on IntegriCloud