Prevent logging 2A11 when quad is offline

Change-Id: Ic8cda9a6b8b311057ba2c4f0d9dc7e228e700c27
CQ: SW404469
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/49093
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
author: mbroyles <mbroyles@us.ibm.com> 2017-11-01 08:44:03 -0500
committer: Christopher J. Cain <cjcain@us.ibm.com> 2017-11-03 15:16:58 -0400
commit: bb703e413c363255a578ffdb35e954a5b71de4c9 (patch)
tree: 7c804d5d9b1b6853b832526cefdfa391e1bf3e48
parent: d90b1dcc95523b0ed98ca4fdf4cc6bc84ba40102 (diff)
download: talos-occ-bb703e413c363255a578ffdb35e954a5b71de4c9.tar.gz
talos-occ-bb703e413c363255a578ffdb35e954a5b71de4c9.zip
6 files changed, 181 insertions, 58 deletions
diff --git a/src/occ_405/amec/amec_dps.c b/src/occ_405/amec/amec_dps.c
index 6f5ff15..e41783a 100755
--- a/src/occ_405/amec/amec_dps.c
+++ b/src/occ_405/amec/amec_dps.c
@@ -84,7 +84,7 @@ void amec_dps_update_core_util(void)
         // Update moving average of util_slack and util_active for all cores
         for(l_idx=0; l_idx<MAX_NUM_CORES; l_idx++)
         {
-            if (!CORE_PRESENT(l_idx))
+            if (!CORE_PRESENT(l_idx) || CORE_OFFLINE(l_idx))
             {
                 continue; //nothing to do if the core's disabled
             }
@@ -351,7 +351,7 @@ void amec_dps_main(void)
     for (l_idx=0; l_idx<MAX_NUM_CORES; l_idx++)
     {
         // Find the first valid core and send its frequency
-        if (CORE_PRESENT(l_idx))
+        if (CORE_PRESENT(l_idx) && !CORE_OFFLINE(l_idx))
         {
             G_dcom_slv_outbox_tx.factual =
                 AMECSENSOR_ARRAY_PTR(FREQREQC0,l_idx)->sample;
diff --git a/src/occ_405/amec/amec_freq.c b/src/occ_405/amec/amec_freq.c
index 9150328..7b02ba7 100755
--- a/src/occ_405/amec/amec_freq.c
+++ b/src/occ_405/amec/amec_freq.c
@@ -387,7 +387,7 @@ void amec_slv_proc_voting_box(void)
 
     for (k=0; k<MAX_NUM_CORES; k++)
     {
-        if(CORE_PRESENT(k))
+        if( CORE_PRESENT(k) && !CORE_OFFLINE(k) )
         {
             l_core_freq = l_chip_fmax;
             l_core_reason = l_chip_reason;
@@ -556,11 +556,12 @@ void amec_slv_proc_voting_box(void)
             {
                 g_amec->proc[0].core_max_freq = l_core_freq;
             }
-        }
+        } // if core present and not offline
         else
         {
-            l_core_freq = 0;
-            l_core_reason = 0;
+            //Set f_request to 0 so this core is ignored in amec_slv_freq_smh()
+            g_amec->proc[0].core[k].f_request = 0;
+            g_amec->proc[0].core[k].f_reason = 0;
         }
     }//End of for loop
 
@@ -600,7 +601,8 @@ void amec_slv_freq_smh(void)
     uint8_t     core_idx = 0;   // loop through cores within each quad
     Pstate      pmax[MAXIMUM_QUADS] = {0}; // max pstate (min frequency) within each quad
     Pstate      pmax_chip = 0;  // highest Pstate (lowest frequency) across all quads
-    bool        l_atLeast1Core[MAXIMUM_QUADS] = {FALSE};  // at least 1 core present in quad
+    bool        l_atLeast1Core[MAXIMUM_QUADS] = {FALSE};  // at least 1 core present and online in quad
+    bool        l_atLeast1Quad = FALSE;    // at least 1 quad online
     static bool L_mfg_set_trace[MAXIMUM_QUADS] = {FALSE};
     static bool L_mfg_clear_trace[MAXIMUM_QUADS] = {FALSE};
 
@@ -619,6 +621,7 @@ void amec_slv_freq_smh(void)
             if(g_amec->proc[0].core[core_num].f_request != 0)
             {
                l_atLeast1Core[quad] = TRUE;
+               l_atLeast1Quad = TRUE;
                // The higher the pstate number, the lower the frequency
                if(pmax[quad] <  proc_freq2pstate(g_amec->proc[0].core[core_num].f_request))
                {
@@ -630,45 +633,49 @@ void amec_slv_freq_smh(void)
         }
     }
 
-    // check for mfg quad Pstate request and set Pstate for each quad
-    for (quad = 0; quad < MAXIMUM_QUADS; quad++)
+    // Skip determining new frequency if all cores in all quads are offline
+    if(l_atLeast1Quad)
     {
-        // set quad with no cores present to lowest frequency for the chip
-        if(l_atLeast1Core[quad] == FALSE)
-           pmax[quad] = pmax_chip;
-
-        // check if there is a mnfg Pstate request for this quad
-        if(g_amec->mnfg_parms.quad_pstate[quad] != 0xFF)
+        // check for mfg quad Pstate request and set Pstate for each quad
+        for (quad = 0; quad < MAXIMUM_QUADS; quad++)
         {
-           // use mnfg request if it is a lower frequency (higher pState)
-           if(g_amec->mnfg_parms.quad_pstate[quad] > pmax[quad])
-              pmax[quad] = g_amec->mnfg_parms.quad_pstate[quad];
+            // set quad with no cores present to lowest frequency for the chip
+            if(l_atLeast1Core[quad] == FALSE)
+               pmax[quad] = pmax_chip;
 
-           if(L_mfg_clear_trace[quad] == FALSE)
-              L_mfg_set_trace[quad] = TRUE;
-        }
-        else if(L_mfg_clear_trace[quad] == TRUE)
-        {
-            TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request cleared. New Pstate = 0x%02x", quad, pmax[quad]);
-            L_mfg_clear_trace[quad] = FALSE;
-        }
+            // check if there is a mnfg Pstate request for this quad
+            if(g_amec->mnfg_parms.quad_pstate[quad] != 0xFF)
+            {
+               // use mnfg request if it is a lower frequency (higher pState)
+               if(g_amec->mnfg_parms.quad_pstate[quad] > pmax[quad])
+                  pmax[quad] = g_amec->mnfg_parms.quad_pstate[quad];
+
+               if(L_mfg_clear_trace[quad] == FALSE)
+                  L_mfg_set_trace[quad] = TRUE;
+            }
+            else if(L_mfg_clear_trace[quad] == TRUE)
+            {
+                TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request cleared. New Pstate = 0x%02x", quad, pmax[quad]);
+                L_mfg_clear_trace[quad] = FALSE;
+            }
 
 #ifdef PROC_DEBUG
-        if (G_desired_pstate[quad] != pmax[quad])
-        {
-            TRAC_IMP("Updating Quad %d's Pstate to %d", quad, pmax[quad]);
-        }
+            if (G_desired_pstate[quad] != pmax[quad])
+            {
+                TRAC_IMP("Updating Quad %d's Pstate to %d", quad, pmax[quad]);
+            }
 #endif
-        // update quad pstate request
-        G_desired_pstate[quad] = pmax[quad];
+            // update quad pstate request
+            G_desired_pstate[quad] = pmax[quad];
 
-        if(L_mfg_set_trace[quad] == TRUE)
-        {
-            TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request set = 0x%02x", quad, pmax[quad]);
-            L_mfg_set_trace[quad] = FALSE;
-            L_mfg_clear_trace[quad] = TRUE;
+            if(L_mfg_set_trace[quad] == TRUE)
+            {
+                TRAC_INFO("amec_slv_freq_smh: mfg Quad %d Pstate request set = 0x%02x", quad, pmax[quad]);
+                L_mfg_set_trace[quad] = FALSE;
+                L_mfg_clear_trace[quad] = TRUE;
+            }
         }
-    }
+    }  // if at least 1 core online
 }
 
 
diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c
index f0b2609..60d5a81 100755
--- a/src/occ_405/amec/amec_health.c
+++ b/src/occ_405/amec/amec_health.c
@@ -920,7 +920,7 @@ void amec_health_check_proc_timeout()
     {
         for(i=0; i<MAX_NUM_CORES; i++)
         {
-            if(!CORE_PRESENT(i))
+            if(!CORE_PRESENT(i) || CORE_OFFLINE(i))
             {
                 // If this core is not present, move on
                 continue;
diff --git a/src/occ_405/amec/amec_sensors_core.c b/src/occ_405/amec/amec_sensors_core.c
index d2eddec..b723366 100755
--- a/src/occ_405/amec/amec_sensors_core.c
+++ b/src/occ_405/amec/amec_sensors_core.c
@@ -74,9 +74,14 @@ void amec_calc_droop_sensors(CoreData * i_core_data_ptr, uint8_t i_core);
 void amec_update_proc_core_sensors(uint8_t i_core)
 {
   CoreData  *l_core_data_ptr;
-  uint16_t  l_temp16 = 0;
   uint32_t  l_temp32 = 0;
+  uint16_t  l_core_temp = 0;
+  uint16_t  l_temp16 = 0;
+  uint16_t  l_core_util = 0;
+  uint16_t  l_core_freq = 0;
+  uint16_t  l_time_interval = 0;
   uint8_t   i = 0;
+  uint8_t   l_quad = i_core / 4;     // Quad this core resides in
 
   // Make sure the core is present, and that it has updated data.
   if(CORE_PRESENT(i_core) && CORE_UPDATED(i_core))
@@ -95,8 +100,8 @@ void amec_update_proc_core_sensors(uint8_t i_core)
     //-------------------------------------------------------
     // Util / Freq
     //-------------------------------------------------------
-    // Skip this update if there was an empath collection error
-    if (!CORE_EMPATH_ERROR(i_core))
+    // Skip this update if there was an empath collection error or if previously offline
+    if (!CORE_EMPATH_ERROR(i_core) && !CORE_OFFLINE(i_core))
     {
         amec_calc_freq_and_util_sensors(l_core_data_ptr,i_core);
     }
@@ -105,13 +110,16 @@ void amec_update_proc_core_sensors(uint8_t i_core)
     // Performance counter - This function should be called
     // after amec_calc_freq_and_util_sensors().
     //-------------------------------------------------------
-    amec_calc_dps_util_counters(i_core);
+    if(!CORE_OFFLINE(i_core))
+    {
+        amec_calc_dps_util_counters(i_core);
+    }
 
     //-------------------------------------------------------
     // IPS
     //-------------------------------------------------------
     // Skip this update if there was an empath collection error
-    if (!CORE_EMPATH_ERROR(i_core))
+    if (!CORE_EMPATH_ERROR(i_core) && !CORE_OFFLINE(i_core))
     {
         amec_calc_ips_sensors(l_core_data_ptr,i_core);
     }
@@ -162,7 +170,83 @@ void amec_update_proc_core_sensors(uint8_t i_core)
     l_temp16 = (uint16_t)(G_dcom_slv_inbox_doorbell_rx.tod>>45);
     // hi 3 bits in 0.796 day resolution with 512MHz TOD clock
     sensor_update( AMECSENSOR_PTR(TODclock2), l_temp16);
-  }
+
+    // The core must be online since it was updated. Now that the sensors have been updated, make sure
+    // the core offline bit is off for this core.  Clearing this prior to updating the temperature
+    // sensors may result in a false processor timeout error in health monitor
+    CLEAR_CORE_OFFLINE(i_core);
+  } // if core present and updated
+
+  else if(CORE_OFFLINE(i_core))
+  {
+    // core wasn't updated due to being offline, update sensors accordingly
+
+    // Determine "core" temperature that will be returned in the poll for fan control
+    // If there is at least 1 core online within the same quad use the quad temp else use the nest
+    if(QUAD_ONLINE(l_quad))
+    {
+       l_core_temp = AMECSENSOR_ARRAY_PTR(TEMPQ0, l_quad)->sample;
+    }
+    else
+    {
+       l_core_temp = getSensorByGsid(TEMPNEST)->sample;
+    }
+    if(l_core_temp)
+    {
+       sensor_update(AMECSENSOR_ARRAY_PTR(TEMPPROCTHRMC0,i_core), l_core_temp);
+    }
+
+    // Update utilization and frequency sensors to 0
+    sensor_update(AMECSENSOR_ARRAY_PTR(NUTILC0, i_core), 0);
+    sensor_update(AMECSENSOR_ARRAY_PTR(UTILC0, i_core), 0);
+    sensor_update(AMECSENSOR_ARRAY_PTR(IPSC0, i_core), 0);
+    sensor_update(AMECSENSOR_ARRAY_PTR(NOTBZEC0, i_core), 0);
+    sensor_update(AMECSENSOR_ARRAY_PTR(NOTFINC0, i_core), 0);
+    sensor_update(AMECSENSOR_ARRAY_PTR(FREQAC0, i_core), 0);
+    for(i=0; i<MAX_THREADS_PER_CORE; i++)
+    {
+      g_amec->proc[0].core[i_core].thread[i].util4ms_thread = 0;
+    }
+
+    // Make updates for rolling average
+    // Determine the time interval for the rolling average calculation
+    l_time_interval = AMEC_DPS_SAMPLING_RATE * AMEC_IPS_AVRG_INTERVAL;
+
+    // Increment sample count
+    if(g_amec->proc[0].core[i_core].sample_count < UINT16_MAX)
+    {
+       g_amec->proc[0].core[i_core].sample_count++;
+    }
+
+    if(g_amec->proc[0].core[i_core].sample_count == l_time_interval)
+    {
+        // Increase resolution of the UTIL accumulator by two decimal places
+        l_temp32 = (uint32_t)AMECSENSOR_ARRAY_PTR(UTILC0,i_core)->accumulator * 100;
+        // Calculate average utilization of this core
+        l_temp32 = l_temp32 / g_amec->proc[0].core[i_core].sample_count;
+        g_amec->proc[0].core[i_core].avg_util = l_temp32;
+
+        // Increase resolution of the FREQA accumulator by two decimal places
+        l_temp32 = (uint32_t)AMECSENSOR_ARRAY_PTR(FREQAC0,i_core)->accumulator * 100;
+        // Calculate average frequency of this core
+        l_temp32 = l_temp32 / g_amec->proc[0].core[i_core].sample_count;
+        g_amec->proc[0].core[i_core].avg_freq = l_temp32;
+    }
+    else if(g_amec->proc[0].core[i_core].sample_count > l_time_interval)
+    {
+        // Calculate average utilization for this core
+        l_temp32 = (uint32_t) g_amec->proc[0].core[i_core].avg_util;
+        l_temp32 = l_temp32 * (l_time_interval-1);
+        l_temp32 = l_temp32 + l_core_util*100;
+        g_amec->proc[0].core[i_core].avg_util = l_temp32 / l_time_interval;
+
+        // Calculate average frequency for this core
+        l_temp32 = (uint32_t) g_amec->proc[0].core[i_core].avg_freq;
+        l_temp32 = l_temp32 * (l_time_interval-1);
+        l_temp32 = l_temp32 + l_core_freq*100;
+        g_amec->proc[0].core[i_core].avg_freq = l_temp32 / l_time_interval;
+    }
+  } // else if core offline
 }
 
 // Function Specification
diff --git a/src/occ_405/proc/proc_data.c b/src/occ_405/proc/proc_data.c
index 6f25118..e8e9eba 100755
--- a/src/occ_405/proc/proc_data.c
+++ b/src/occ_405/proc/proc_data.c
@@ -86,6 +86,9 @@ uint32_t G_updated_core_mask = 0;
 // without error.
 uint32_t G_empath_error_core_mask = 0;
 
+//AMEC needs to know which cores are offline
+uint32_t G_core_offline_mask = 0;
+
 //Global G_present_cores is bitmask of all cores
 //(1 = present, 0 = not present. Core 0 has the most significant bit)
 uint32_t G_present_cores = 0;
@@ -128,6 +131,7 @@ void task_core_data( task_t * i_task )
     CoreData  * l_temp = NULL; // Used for pointer swapping
     bulk_core_data_task_t * l_bulk_core_data_ptr = (bulk_core_data_task_t *)i_task->data_ptr;
     ipc_core_data_parms_t * l_parms = (ipc_core_data_parms_t*)(l_bulk_core_data_ptr->gpe_req.cmd_data);
+    static uint32_t L_trace_core_failure = 0;
 
     do
     {
@@ -160,8 +164,8 @@ void task_core_data( task_t * i_task )
         //A request is not considered complete until both the engine job
         //has finished without error and any callback has run to completion.
 
-        if( async_request_completed(&l_bulk_core_data_ptr->gpe_req.request)
-            &&
+        if( async_request_completed(&l_bulk_core_data_ptr->gpe_req.request) &&
+            (l_parms->error.rc == 0) &&
             CORE_PRESENT(l_bulk_core_data_ptr->current_core) )
         {
             //If the previous GPE request succeeded then swap core_data_ptr
@@ -184,7 +188,7 @@ void task_core_data( task_t * i_task )
             //Core data has been collected so set the bit in global mask.
             //AMEC code will know which cores to update sensors for. AMEC is
             //responsible for clearing the bit later on.
-            G_updated_core_mask |= CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core);
+            G_updated_core_mask |= (CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core));
 
             // Presumptively clear the empath error mask
             G_empath_error_core_mask &=
@@ -197,6 +201,24 @@ void task_core_data( task_t * i_task )
         {
             G_core_data_ptrs[l_bulk_core_data_ptr->current_core] = &G_core_data[MAX_NUM_FW_CORES+NUM_CORE_DATA_DOUBLE_BUF+NUM_CORE_DATA_EMPTY_BUF-1];
         }
+        else if(l_parms->error.rc != 0)
+        {
+            // Check if failure is due to being offline (in stop 2 or greater)
+            if(l_parms->error.ffdc == PCB_ERROR_CHIPLET_OFFLINE)
+            {
+                // Mark core offline so it is ignored in control loops and to avoid health monitor logging error
+                G_core_offline_mask |= (CORE0_PRESENT_MASK >> (l_bulk_core_data_ptr->current_core));
+            }
+            else if( !(L_trace_core_failure & (1 << l_bulk_core_data_ptr->current_core)) )
+            {
+               // trace error, if it continues health monitor will see and log error
+               INTR_TRAC_ERR("task_core_data: core %d data collection failed RC[0x%08X] FFDC[0x%08X%08X]",
+                              l_bulk_core_data_ptr->current_core, l_parms->error.rc,
+                              (uint32_t)(l_parms->error.ffdc >> 32),
+                              (uint32_t)l_parms->error.ffdc);
+               L_trace_core_failure |= (1 << l_bulk_core_data_ptr->current_core);
+            }
+        }
 
         //Update current core
         if ( l_bulk_core_data_ptr->current_core >= l_bulk_core_data_ptr->end_core )
@@ -217,6 +239,8 @@ void task_core_data( task_t * i_task )
             //1. Setup the get core data parms
             l_parms->core_num = l_bulk_core_data_ptr->current_core;
             l_parms->data = (CoreData*) l_bulk_core_data_ptr->core_data_ptr;
+            l_parms->error.error = 0;  // default no error
+            l_parms->error.ffdc = 0;
 
             // Static array to record the last timestamp a get_per_core_data task was
             // scheduled for a core.
@@ -243,13 +267,6 @@ void task_core_data( task_t * i_task )
                 {
                     G_get_per_core_data_max_schedule_intervals[l_current_core] = l_elapsed_us;
                 }
-                // Also sniff if the request has actually completed, it is checked above but
-                // the schedule proceeds regardless which could be dangerous...
-                if (!async_request_completed(&l_bulk_core_data_ptr->gpe_req.request))
-                {
-                    INTR_TRAC_ERR("Async get_per_core_data task for core=%d not complete!",
-                             l_current_core);
-                }
             }
 
             //2. Schedule the GPE Request to get the core data
diff --git a/src/occ_405/proc/proc_data.h b/src/occ_405/proc/proc_data.h
index ebef6d0..f8b7870 100755
--- a/src/occ_405/proc/proc_data.h
+++ b/src/occ_405/proc/proc_data.h
@@ -50,9 +50,10 @@
 #define NUM_CORE_DATA_DOUBLE_BUF 2
 #define NUM_CORE_DATA_EMPTY_BUF  1
 
-#define LO_CORES_MASK       0xfff00000
-#define HI_CORES_MASK       0x000fff00
-#define HW_CORES_MASK       0xffffff00
+#define LO_CORES_MASK            0xfff00000
+#define HI_CORES_MASK            0x000fff00
+#define HW_CORES_MASK            0xffffff00
+#define QUAD0_CORES_PRESENT_MASK 0xf0000000
 
 enum eOccProcCores
 {
@@ -103,6 +104,9 @@ extern uint32_t G_present_cores;
 //AMEC needs to know when data for a core has been collected.
 extern uint32_t G_updated_core_mask;
 
+//AMEC needs to know when a core is offline
+extern uint32_t G_core_offline_mask;
+
 // External reference to empath error mask
 extern uint32_t G_empath_error_core_mask;
 
@@ -120,6 +124,17 @@ extern bool G_nest_dts_data_valid;
 #define CORE_EMPATH_ERROR(occ_core_id) \
         ((CORE0_PRESENT_MASK >> occ_core_id) & G_empath_error_core_mask)
 
+// Evaluates to true (non-zero) if the specified core's bit is set in G_core_offline_mask
+#define CORE_OFFLINE(occ_core_id) \
+         ((CORE0_PRESENT_MASK >> (occ_core_id)) & G_core_offline_mask)
+
+// Clears the specified core's bit in G_core_offline_mask; parenthesized so it is safe in any expression
+#define CLEAR_CORE_OFFLINE(occ_core_id) (G_core_offline_mask &= ~(CORE0_PRESENT_MASK >> (occ_core_id)))
+
+// Evaluates to true (non-zero) if the specified quad has at least 1 present core that is not offline
+#define QUAD_ONLINE(occ_quad_id) \
+         ( (QUAD0_CORES_PRESENT_MASK >> ((occ_quad_id)*4)) & ((~G_core_offline_mask) & G_present_cores) )
+
 //Collect bulk core data for all cores in specified range
 void task_core_data( task_t * i_task );
author	mbroyles <mbroyles@us.ibm.com>	2017-11-01 08:44:03 -0500
committer	Christopher J. Cain <cjcain@us.ibm.com>	2017-11-03 15:16:58 -0400
commit	bb703e413c363255a578ffdb35e954a5b71de4c9 (patch)
tree	7c804d5d9b1b6853b832526cefdfa391e1bf3e48
parent	d90b1dcc95523b0ed98ca4fdf4cc6bc84ba40102 (diff)
download	talos-occ-bb703e413c363255a578ffdb35e954a5b71de4c9.tar.gz talos-occ-bb703e413c363255a578ffdb35e954a5b71de4c9.zip