summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/x86/x86_64/boot-options.txt5
-rw-r--r--arch/x86/include/asm/acpi.h2
-rw-r--r--arch/x86/include/asm/mce.h16
-rw-r--r--arch/x86/kernel/acpi/boot.c5
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h3
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c28
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c42
-rw-r--r--drivers/acpi/apei/erst.c51
-rw-r--r--drivers/acpi/apei/ghes.c38
-rw-r--r--drivers/acpi/apei/hest.c39
-rw-r--r--include/linux/mm.h1
-rw-r--r--mm/memory-failure.c5
12 files changed, 185 insertions, 50 deletions
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index e9e8ddbbf376..1228b22e142b 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -176,6 +176,11 @@ ACPI
acpi=noirq Don't route interrupts
+ acpi=nocmcff Disable firmware first mode for corrected errors. This
+ disables parsing the HEST CMC error source to check if
+ firmware has set the FF flag. This may result in
+ duplicate corrected error reports.
+
PCI
pci=off Don't use PCI
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 2dfac58f3b11..b1977bad5435 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -86,6 +86,7 @@ extern int acpi_pci_disabled;
extern int acpi_skip_timer_override;
extern int acpi_use_timer_override;
extern int acpi_fix_pin2_polarity;
+extern int acpi_disable_cmcff;
extern u8 acpi_sci_flags;
extern int acpi_sci_override_gsi;
@@ -168,6 +169,7 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
#define acpi_lapic 0
#define acpi_ioapic 0
+#define acpi_disable_cmcff 0
static inline void acpi_noirq_set(void) { }
static inline void acpi_disable_pci(void) { }
static inline void disable_acpi(void) { }
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 29e3093bbd21..cbe6b9e404ce 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -32,11 +32,20 @@
#define MCI_STATUS_PCC (1ULL<<57) /* processor context corrupt */
#define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */
#define MCI_STATUS_AR (1ULL<<55) /* Action required */
-#define MCACOD 0xffff /* MCA Error Code */
+
+/*
+ * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
+ * bits 15:0. But bit 12 is the 'F' bit, defined for corrected
+ * errors to indicate that errors are being filtered by hardware.
+ * We should mask out bit 12 when looking for specific signatures
+ * of uncorrected errors - so the F bit is deliberately skipped
+ * in this #define.
+ */
+#define MCACOD 0xefff /* MCA Error Code */
/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */
#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */
-#define MCACOD_SCRUBMSK 0xfff0
+#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */
#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */
#define MCACOD_DATA 0x0134 /* Data Load */
#define MCACOD_INSTR 0x0150 /* Instruction Fetch */
@@ -188,6 +197,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
const char __user *ubuf,
size_t usize, loff_t *off));
+/* Disable CMCI/polling for MCA bank claimed by firmware */
+extern void mce_disable_bank(int bank);
+
/*
* Exception handler
*/
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2627a81253ee..fead83292e6e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -67,6 +67,7 @@ EXPORT_SYMBOL(acpi_pci_disabled);
int acpi_lapic;
int acpi_ioapic;
int acpi_strict;
+int acpi_disable_cmcff;
u8 acpi_sci_flags __initdata;
int acpi_sci_override_gsi __initdata;
@@ -1626,6 +1627,10 @@ static int __init parse_acpi(char *arg)
/* "acpi=copy_dsdt" copys DSDT */
else if (strcmp(arg, "copy_dsdt") == 0) {
acpi_gbl_copy_dsdt_locally = 1;
+ }
+ /* "acpi=nocmcff" disables FF mode for corrected errors */
+ else if (strcmp(arg, "nocmcff") == 0) {
+ acpi_disable_cmcff = 1;
} else {
/* Core will printk when we return error. */
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 5b7d4fa5d3b7..09edd0b65fef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -25,15 +25,18 @@ int mce_severity(struct mce *a, int tolerant, char **msg);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
+extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
unsigned long mce_intel_adjust_timer(unsigned long interval);
void mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
+void cmci_disable_bank(int bank);
#else
# define mce_intel_adjust_timer mce_adjust_timer_default
static inline void mce_intel_cmci_poll(void) { }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void cmci_disable_bank(int bank) { }
#endif
void mce_timer_kick(unsigned long interval);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 87a65c939bcd..b3218cdee95f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -97,6 +97,15 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
+/*
+ * MCA banks controlled through firmware first for corrected errors.
+ * This is a global list of banks for which we won't enable CMCI and we
+ * won't poll. Firmware controls these banks and is responsible for
+ * reporting corrected errors through GHES. Uncorrected/recoverable
+ * errors are still notified through a machine check.
+ */
+mce_banks_t mce_banks_ce_disabled;
+
static DEFINE_PER_CPU(struct work_struct, mce_work);
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
@@ -1935,6 +1944,25 @@ static struct miscdevice mce_chrdev_device = {
&mce_chrdev_ops,
};
+static void __mce_disable_bank(void *arg)
+{
+ int bank = *((int *)arg);
+ __clear_bit(bank, __get_cpu_var(mce_poll_banks));
+ cmci_disable_bank(bank);
+}
+
+void mce_disable_bank(int bank)
+{
+ if (bank >= mca_cfg.banks) {
+ pr_warn(FW_BUG
+ "Ignoring request to disable invalid MCA bank %d.\n",
+ bank);
+ return;
+ }
+ set_bit(bank, mce_banks_ce_disabled);
+ on_each_cpu(__mce_disable_bank, &bank, 1);
+}
+
/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index d56405309dc1..4cfe0458ca66 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -203,6 +203,10 @@ static void cmci_discover(int banks)
if (test_bit(i, owned))
continue;
+ /* Skip banks in firmware first mode */
+ if (test_bit(i, mce_banks_ce_disabled))
+ continue;
+
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
@@ -271,6 +275,19 @@ void cmci_recheck(void)
local_irq_restore(flags);
}
+/* Caller must hold the lock on cmci_discover_lock */
+static void __cmci_disable_bank(int bank)
+{
+ u64 val;
+
+ if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
+ return;
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ val &= ~MCI_CTL2_CMCI_EN;
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ __clear_bit(bank, __get_cpu_var(mce_banks_owned));
+}
+
/*
* Disable CMCI on this CPU for all banks it owns when it goes down.
* This allows other CPUs to claim the banks on rediscovery.
@@ -280,20 +297,12 @@ void cmci_clear(void)
unsigned long flags;
int i;
int banks;
- u64 val;
if (!cmci_supported(&banks))
return;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- for (i = 0; i < banks; i++) {
- if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
- continue;
- /* Disable CMCI */
- rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(i), val);
- __clear_bit(i, __get_cpu_var(mce_banks_owned));
- }
+ for (i = 0; i < banks; i++)
+ __cmci_disable_bank(i);
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
@@ -327,6 +336,19 @@ void cmci_reenable(void)
cmci_discover(banks);
}
+void cmci_disable_bank(int bank)
+{
+ int banks;
+ unsigned long flags;
+
+ if (!cmci_supported(&banks))
+ return;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ __cmci_disable_bank(bank);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
static void intel_init_cmci(void)
{
int banks;
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 88d0b0f9f92b..40ffcdafbc6f 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -39,7 +39,8 @@
#include "apei-internal.h"
-#define ERST_PFX "ERST: "
+#undef pr_fmt
+#define pr_fmt(fmt) "ERST: " fmt
/* ERST command status */
#define ERST_STATUS_SUCCESS 0x0
@@ -109,8 +110,7 @@ static inline int erst_errno(int command_status)
static int erst_timedout(u64 *t, u64 spin_unit)
{
if ((s64)*t < spin_unit) {
- pr_warning(FW_WARN ERST_PFX
- "Firmware does not respond in time\n");
+ pr_warn(FW_WARN "Firmware does not respond in time.\n");
return 1;
}
*t -= spin_unit;
@@ -186,8 +186,8 @@ static int erst_exec_stall(struct apei_exec_context *ctx,
if (ctx->value > FIRMWARE_MAX_STALL) {
if (!in_nmi())
- pr_warning(FW_WARN ERST_PFX
- "Too long stall time for stall instruction: %llx.\n",
+ pr_warn(FW_WARN
+ "Too long stall time for stall instruction: 0x%llx.\n",
ctx->value);
stall_time = FIRMWARE_MAX_STALL;
} else
@@ -206,8 +206,8 @@ static int erst_exec_stall_while_true(struct apei_exec_context *ctx,
if (ctx->var1 > FIRMWARE_MAX_STALL) {
if (!in_nmi())
- pr_warning(FW_WARN ERST_PFX
- "Too long stall time for stall while true instruction: %llx.\n",
+ pr_warn(FW_WARN
+ "Too long stall time for stall while true instruction: 0x%llx.\n",
ctx->var1);
stall_time = FIRMWARE_MAX_STALL;
} else
@@ -271,8 +271,7 @@ static int erst_exec_move_data(struct apei_exec_context *ctx,
/* ioremap does not work in interrupt context */
if (in_interrupt()) {
- pr_warning(ERST_PFX
- "MOVE_DATA can not be used in interrupt context");
+ pr_warn("MOVE_DATA can not be used in interrupt context.\n");
return -EBUSY;
}
@@ -522,8 +521,7 @@ retry:
ERST_RECORD_ID_CACHE_SIZE_MAX);
if (new_size <= erst_record_id_cache.size) {
if (printk_ratelimit())
- pr_warning(FW_WARN ERST_PFX
- "too many record ID!\n");
+ pr_warn(FW_WARN "too many record IDs!\n");
return 0;
}
alloc_size = new_size * sizeof(entries[0]);
@@ -759,8 +757,7 @@ static int __erst_clear_from_storage(u64 record_id)
static void pr_unimpl_nvram(void)
{
if (printk_ratelimit())
- pr_warning(ERST_PFX
- "NVRAM ERST Log Address Range is not implemented yet\n");
+ pr_warn("NVRAM ERST Log Address Range not implemented yet.\n");
}
static int __erst_write_to_nvram(const struct cper_record_header *record)
@@ -1120,7 +1117,7 @@ static int __init erst_init(void)
goto err;
if (erst_disable) {
- pr_info(ERST_PFX
+ pr_info(
"Error Record Serialization Table (ERST) support is disabled.\n");
goto err;
}
@@ -1131,14 +1128,14 @@ static int __init erst_init(void)
goto err;
else if (ACPI_FAILURE(status)) {
const char *msg = acpi_format_exception(status);
- pr_err(ERST_PFX "Failed to get table, %s\n", msg);
+ pr_err("Failed to get table, %s\n", msg);
rc = -EINVAL;
goto err;
}
rc = erst_check_table(erst_tab);
if (rc) {
- pr_err(FW_BUG ERST_PFX "ERST table is invalid\n");
+ pr_err(FW_BUG "ERST table is invalid.\n");
goto err;
}
@@ -1156,21 +1153,19 @@ static int __init erst_init(void)
rc = erst_get_erange(&erst_erange);
if (rc) {
if (rc == -ENODEV)
- pr_info(ERST_PFX
+ pr_info(
"The corresponding hardware device or firmware implementation "
"is not available.\n");
else
- pr_err(ERST_PFX
- "Failed to get Error Log Address Range.\n");
+ pr_err("Failed to get Error Log Address Range.\n");
goto err_unmap_reg;
}
r = request_mem_region(erst_erange.base, erst_erange.size, "APEI ERST");
if (!r) {
- pr_err(ERST_PFX
- "Can not request iomem region <0x%16llx-0x%16llx> for ERST.\n",
- (unsigned long long)erst_erange.base,
- (unsigned long long)erst_erange.base + erst_erange.size);
+ pr_err("Can not request [mem %#010llx-%#010llx] for ERST.\n",
+ (unsigned long long)erst_erange.base,
+ (unsigned long long)erst_erange.base + erst_erange.size - 1);
rc = -EIO;
goto err_unmap_reg;
}
@@ -1180,7 +1175,7 @@ static int __init erst_init(void)
if (!erst_erange.vaddr)
goto err_release_erange;
- pr_info(ERST_PFX
+ pr_info(
"Error Record Serialization Table (ERST) support is initialized.\n");
buf = kmalloc(erst_erange.size, GFP_KERNEL);
@@ -1192,15 +1187,15 @@ static int __init erst_init(void)
rc = pstore_register(&erst_info);
if (rc) {
if (rc != -EPERM)
- pr_info(ERST_PFX
- "Could not register with persistent store\n");
+ pr_info(
+ "Could not register with persistent store.\n");
erst_info.buf = NULL;
erst_info.bufsize = 0;
kfree(buf);
}
} else
- pr_err(ERST_PFX
- "Failed to allocate %lld bytes for persistent store error log\n",
+ pr_err(
+ "Failed to allocate %lld bytes for persistent store error log.\n",
erst_erange.size);
return 0;
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index ec9b57d428a1..8ec37bbdd699 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -409,6 +409,34 @@ static void ghes_clear_estatus(struct ghes *ghes)
ghes->flags &= ~GHES_TO_CLEAR;
}
+static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev)
+{
+#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
+ unsigned long pfn;
+ int sec_sev = ghes_severity(gdata->error_severity);
+ struct cper_sec_mem_err *mem_err;
+ mem_err = (struct cper_sec_mem_err *)(gdata + 1);
+
+ if (sec_sev == GHES_SEV_CORRECTED &&
+ (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) &&
+ (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) {
+ pfn = mem_err->physical_addr >> PAGE_SHIFT;
+ if (pfn_valid(pfn))
+ memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE);
+ else if (printk_ratelimit())
+ pr_warn(FW_WARN GHES_PFX
+ "Invalid address in generic error data: %#llx\n",
+ mem_err->physical_addr);
+ }
+ if (sev == GHES_SEV_RECOVERABLE &&
+ sec_sev == GHES_SEV_RECOVERABLE &&
+ mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
+ pfn = mem_err->physical_addr >> PAGE_SHIFT;
+ memory_failure_queue(pfn, 0, 0);
+ }
+#endif
+}
+
static void ghes_do_proc(struct ghes *ghes,
const struct acpi_hest_generic_status *estatus)
{
@@ -428,15 +456,7 @@ static void ghes_do_proc(struct ghes *ghes,
apei_mce_report_mem_error(sev == GHES_SEV_CORRECTED,
mem_err);
#endif
-#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
- if (sev == GHES_SEV_RECOVERABLE &&
- sec_sev == GHES_SEV_RECOVERABLE &&
- mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS) {
- unsigned long pfn;
- pfn = mem_err->physical_addr >> PAGE_SHIFT;
- memory_failure_queue(pfn, 0, 0);
- }
-#endif
+ ghes_handle_memory_failure(gdata, sev);
}
#ifdef CONFIG_ACPI_APEI_PCIEAER
else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index f5ef5d54e4ac..f5e37f32c71f 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -36,6 +36,7 @@
#include <linux/io.h>
#include <linux/platform_device.h>
#include <acpi/apei.h>
+#include <asm/mce.h>
#include "apei-internal.h"
@@ -121,6 +122,41 @@ int apei_hest_parse(apei_hest_func_t func, void *data)
}
EXPORT_SYMBOL_GPL(apei_hest_parse);
+/*
+ * Check if firmware advertises firmware first mode. We need FF bit to be set
+ * along with a set of MC banks which work in FF mode.
+ */
+static int __init hest_parse_cmc(struct acpi_hest_header *hest_hdr, void *data)
+{
+#ifdef CONFIG_X86_MCE
+ int i;
+ struct acpi_hest_ia_corrected *cmc;
+ struct acpi_hest_ia_error_bank *mc_bank;
+
+ if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+ return 0;
+
+ cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
+ if (!cmc->enabled)
+ return 0;
+
+ /*
+ * We expect HEST to provide a list of MC banks that report errors
+ * in firmware first mode. Otherwise, return non-zero value to
+ * indicate that we are done parsing HEST.
+ */
+ if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || !cmc->num_hardware_banks)
+ return 1;
+
+ pr_info(HEST_PFX "Enabling Firmware First mode for corrected errors.\n");
+
+ mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+ for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+ mce_disable_bank(mc_bank->bank_number);
+#endif
+ return 1;
+}
+
struct ghes_arr {
struct platform_device **ghes_devs;
unsigned int count;
@@ -227,6 +263,9 @@ void __init acpi_hest_init(void)
goto err;
}
+ if (!acpi_disable_cmcff)
+ apei_hest_parse(hest_parse_cmc, NULL);
+
if (!ghes_disable) {
rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);
if (rc)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f0224608d15e..d2d59b4149d0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1798,6 +1798,7 @@ enum mf_flags {
MF_COUNT_INCREASED = 1 << 0,
MF_ACTION_REQUIRED = 1 << 1,
MF_MUST_KILL = 1 << 2,
+ MF_SOFT_OFFLINE = 1 << 3,
};
extern int memory_failure(unsigned long pfn, int trapno, int flags);
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2c13aa7a0164..55d7c8026ab0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1286,7 +1286,10 @@ static void memory_failure_work_func(struct work_struct *work)
spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
if (!gotten)
break;
- memory_failure(entry.pfn, entry.trapno, entry.flags);
+ if (entry.flags & MF_SOFT_OFFLINE)
+ soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+ else
+ memory_failure(entry.pfn, entry.trapno, entry.flags);
}
}
OpenPOWER on IntegriCloud