summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/edac/edac_core.h8
-rw-r--r--drivers/edac/edac_mc.c72
-rw-r--r--include/ras/ras_event.h102
3 files changed, 162 insertions, 20 deletions
diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
index f06ce9ab692c..740c7e22c023 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_core.h
@@ -463,12 +463,12 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
const unsigned long page_frame_number,
const unsigned long offset_in_page,
const unsigned long syndrome,
- const int layer0,
- const int layer1,
- const int layer2,
+ const int top_layer,
+ const int mid_layer,
+ const int low_layer,
const char *msg,
const char *other_detail,
- const void *mcelog);
+ const void *arch_log);
/*
* edac_device APIs
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 10f375032e96..ce25750a83f9 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -27,12 +27,17 @@
#include <linux/list.h>
#include <linux/ctype.h>
#include <linux/edac.h>
+#include <linux/bitops.h>
#include <asm/uaccess.h>
#include <asm/page.h>
#include <asm/edac.h>
#include "edac_core.h"
#include "edac_module.h"
+#define CREATE_TRACE_POINTS
+#define TRACE_INCLUDE_PATH ../../include/ras
+#include <ras/ras_event.h>
+
/* lock to memory controller's control array */
static DEFINE_MUTEX(mem_ctls_mutex);
static LIST_HEAD(mc_devices);
@@ -384,6 +389,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned mc_num,
* which will perform kobj unregistration and the actual free
* will occur during the kobject callback operation
*/
+
return mci;
}
EXPORT_SYMBOL_GPL(edac_mc_alloc);
@@ -902,19 +908,19 @@ static void edac_ce_error(struct mem_ctl_info *mci,
const bool enable_per_layer_report,
const unsigned long page_frame_number,
const unsigned long offset_in_page,
- u32 grain)
+ long grain)
{
unsigned long remapped_page;
if (edac_mc_get_log_ce()) {
if (other_detail && *other_detail)
edac_mc_printk(mci, KERN_WARNING,
- "CE %s on %s (%s%s - %s)\n",
+ "CE %s on %s (%s %s - %s)\n",
msg, label, location,
detail, other_detail);
else
edac_mc_printk(mci, KERN_WARNING,
- "CE %s on %s (%s%s)\n",
+ "CE %s on %s (%s %s)\n",
msg, label, location,
detail);
}
@@ -953,12 +959,12 @@ static void edac_ue_error(struct mem_ctl_info *mci,
if (edac_mc_get_log_ue()) {
if (other_detail && *other_detail)
edac_mc_printk(mci, KERN_WARNING,
- "UE %s on %s (%s%s - %s)\n",
+ "UE %s on %s (%s %s - %s)\n",
msg, label, location, detail,
other_detail);
else
edac_mc_printk(mci, KERN_WARNING,
- "UE %s on %s (%s%s)\n",
+ "UE %s on %s (%s %s)\n",
msg, label, location, detail);
}
@@ -975,27 +981,50 @@ static void edac_ue_error(struct mem_ctl_info *mci,
}
#define OTHER_LABEL " or "
+
+/**
+ * edac_mc_handle_error - reports a memory event to userspace
+ *
+ * @type: severity of the error (CE/UE/Fatal)
+ * @mci: a struct mem_ctl_info pointer
+ * @page_frame_number: mem page where the error occurred
+ * @offset_in_page: offset of the error inside the page
+ * @syndrome: ECC syndrome
+ * @top_layer: Memory layer[0] position
+ * @mid_layer: Memory layer[1] position
+ * @low_layer: Memory layer[2] position
+ * @msg: Message meaningful to the end users that
+ * explains the event
+ * @other_detail: Technical details about the event that
+ * may help hardware manufacturers and
+ * EDAC developers to analyse the event
+ * @arch_log: Architecture-specific struct that can
+ * be used to add extended information to the
+ * tracepoint, like dumping MCE registers.
+ */
void edac_mc_handle_error(const enum hw_event_mc_err_type type,
struct mem_ctl_info *mci,
const unsigned long page_frame_number,
const unsigned long offset_in_page,
const unsigned long syndrome,
- const int layer0,
- const int layer1,
- const int layer2,
+ const int top_layer,
+ const int mid_layer,
+ const int low_layer,
const char *msg,
const char *other_detail,
- const void *mcelog)
+ const void *arch_log)
{
/* FIXME: too much for stack: move it to some pre-alocated area */
char detail[80], location[80];
char label[(EDAC_MC_LABEL_LEN + 1 + sizeof(OTHER_LABEL)) * mci->tot_dimms];
char *p;
int row = -1, chan = -1;
- int pos[EDAC_MAX_LAYERS] = { layer0, layer1, layer2 };
+ int pos[EDAC_MAX_LAYERS] = { top_layer, mid_layer, low_layer };
int i;
- u32 grain;
+ long grain;
bool enable_per_layer_report = false;
+ u16 error_count; /* FIXME: make it a parameter */
+ u8 grain_bits;
debugf3("MC%d: %s()\n", mci->mc_idx, __func__);
@@ -1045,11 +1074,11 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
for (i = 0; i < mci->tot_dimms; i++) {
struct dimm_info *dimm = &mci->dimms[i];
- if (layer0 >= 0 && layer0 != dimm->location[0])
+ if (top_layer >= 0 && top_layer != dimm->location[0])
continue;
- if (layer1 >= 0 && layer1 != dimm->location[1])
+ if (mid_layer >= 0 && mid_layer != dimm->location[1])
continue;
- if (layer2 >= 0 && layer2 != dimm->location[2])
+ if (low_layer >= 0 && low_layer != dimm->location[2])
continue;
/* get the max grain, over the error match range */
@@ -1120,11 +1149,22 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
edac_layer_name[mci->layers[i].type],
pos[i]);
}
+ if (p > location)
+ *(p - 1) = '\0';
+
+ /* Report the error via the trace interface */
+
+ error_count = 1; /* FIXME: allow change it */
+ grain_bits = fls_long(grain) + 1;
+ trace_mc_event(type, msg, label, error_count,
+ mci->mc_idx, top_layer, mid_layer, low_layer,
+ PAGES_TO_MiB(page_frame_number) | offset_in_page,
+ grain_bits, syndrome, other_detail);
/* Memory type dependent details about the error */
if (type == HW_EVENT_ERR_CORRECTED) {
snprintf(detail, sizeof(detail),
- "page:0x%lx offset:0x%lx grain:%d syndrome:0x%lx",
+ "page:0x%lx offset:0x%lx grain:%ld syndrome:0x%lx",
page_frame_number, offset_in_page,
grain, syndrome);
edac_ce_error(mci, pos, msg, location, label, detail,
@@ -1132,7 +1172,7 @@ void edac_mc_handle_error(const enum hw_event_mc_err_type type,
page_frame_number, offset_in_page, grain);
} else {
snprintf(detail, sizeof(detail),
- "page:0x%lx offset:0x%lx grain:%d",
+ "page:0x%lx offset:0x%lx grain:%ld",
page_frame_number, offset_in_page, grain);
edac_ue_error(mci, pos, msg, location, label, detail,
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
new file mode 100644
index 000000000000..260470e72483
--- /dev/null
+++ b/include/ras/ras_event.h
@@ -0,0 +1,102 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ras
+#define TRACE_INCLUDE_FILE ras_event
+
+#if !defined(_TRACE_HW_EVENT_MC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HW_EVENT_MC_H
+
+#include <linux/tracepoint.h>
+#include <linux/edac.h>
+#include <linux/ktime.h>
+
+/*
+ * Hardware Events Report
+ *
+ * Those events are generated when hardware detected a corrected or
+ * uncorrected event, and are meant to replace the current API to report
+ * errors defined on both EDAC and MCE subsystems.
+ *
+ * FIXME: Add events for handling memory errors originated from the
+ * MCE subsystem.
+ */
+
+/*
+ * Hardware-independent Memory Controller specific events
+ */
+
+/*
+ * Default error mechanisms for Memory Controller errors (CE and UE)
+ */
+TRACE_EVENT(mc_event,
+
+ TP_PROTO(const unsigned int err_type,
+ const char *error_msg,
+ const char *label,
+ const int error_count,
+ const u8 mc_index,
+ const s8 top_layer,
+ const s8 mid_layer,
+ const s8 low_layer,
+ unsigned long address,
+ const u8 grain_bits,
+ unsigned long syndrome,
+ const char *driver_detail),
+
+ TP_ARGS(err_type, error_msg, label, error_count, mc_index,
+ top_layer, mid_layer, low_layer, address, grain_bits,
+ syndrome, driver_detail),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, error_type )
+ __string( msg, error_msg )
+ __string( label, label )
+ __field( u16, error_count )
+ __field( u8, mc_index )
+ __field( s8, top_layer )
+ __field( s8, middle_layer )
+ __field( s8, lower_layer )
+ __field( long, address )
+ __field( u8, grain_bits )
+ __field( long, syndrome )
+ __string( driver_detail, driver_detail )
+ ),
+
+ TP_fast_assign(
+ __entry->error_type = err_type;
+ __assign_str(msg, error_msg);
+ __assign_str(label, label);
+ __entry->error_count = error_count;
+ __entry->mc_index = mc_index;
+ __entry->top_layer = top_layer;
+ __entry->middle_layer = mid_layer;
+ __entry->lower_layer = low_layer;
+ __entry->address = address;
+ __entry->grain_bits = grain_bits;
+ __entry->syndrome = syndrome;
+ __assign_str(driver_detail, driver_detail);
+ ),
+
+ TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
+ __entry->error_count,
+ (__entry->error_type == HW_EVENT_ERR_CORRECTED) ? "Corrected" :
+ ((__entry->error_type == HW_EVENT_ERR_FATAL) ?
+ "Fatal" : "Uncorrected"),
+ __entry->error_count > 1 ? "s" : "",
+ ((char *)__get_str(msg))[0] ? " " : "",
+ __get_str(msg),
+ __get_str(label),
+ __entry->mc_index,
+ __entry->top_layer,
+ __entry->middle_layer,
+ __entry->lower_layer,
+ __entry->address,
+ 1 << __entry->grain_bits,
+ __entry->syndrome,
+ ((char *)__get_str(driver_detail))[0] ? " " : "",
+ __get_str(driver_detail))
+);
+
+#endif /* _TRACE_HW_EVENT_MC_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
OpenPOWER on IntegriCloud