Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile                        |   2
-rw-r--r--  arch/x86/kernel/acpi/Makefile                   |   1
-rw-r--r--  arch/x86/kernel/acpi/apei.c                     |  62
-rw-r--r--  arch/x86/kernel/acpi/boot.c                     |   4
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c                   |  18
-rw-r--r--  arch/x86/kernel/apm_32.c                        |   1
-rw-r--r--  arch/x86/kernel/cpu/amd.c                       | 348
-rw-r--r--  arch/x86/kernel/cpu/common.c                    |  30
-rw-r--r--  arch/x86/kernel/cpu/intel.c                     |  52
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c           |  16
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c                |  25
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c            |   4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c          |  18
-rw-r--r--  arch/x86/kernel/cpu/mkcapflags.sh               |  51
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c                |   3
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h                |  12
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd_uncore.c     | 111
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c          |  78
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c       |   6
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_uncore.c   |  16
-rw-r--r--  arch/x86/kernel/cpu/proc.c                      |   8
-rw-r--r--  arch/x86/kernel/cpu/scattered.c                 |   1
-rw-r--r--  arch/x86/kernel/crash.c                         | 563
-rw-r--r--  arch/x86/kernel/entry_32.S                      |  22
-rw-r--r--  arch/x86/kernel/entry_64.S                      |  80
-rw-r--r--  arch/x86/kernel/espfix_64.c                     |   5
-rw-r--r--  arch/x86/kernel/ftrace.c                        |   3
-rw-r--r--  arch/x86/kernel/i387.c                          |   2
-rw-r--r--  arch/x86/kernel/kexec-bzimage64.c               | 553
-rw-r--r--  arch/x86/kernel/kprobes/core.c                  |   3
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c              | 239
-rw-r--r--  arch/x86/kernel/mcount_64.S                     |  13
-rw-r--r--  arch/x86/kernel/paravirt_patch_64.c             |   2
-rw-r--r--  arch/x86/kernel/pmc_atom.c                      | 321
-rw-r--r--  arch/x86/kernel/process.c                       |   1
-rw-r--r--  arch/x86/kernel/resource.c                      |   8
-rw-r--r--  arch/x86/kernel/signal.c                        |   2
-rw-r--r--  arch/x86/kernel/traps.c                         |   7
-rw-r--r--  arch/x86/kernel/tsc.c                           |  32
-rw-r--r--  arch/x86/kernel/vsyscall_64.c                   |   8
-rw-r--r--  arch/x86/kernel/vsyscall_gtod.c                 |  23
-rw-r--r--  arch/x86/kernel/xsave.c                         | 118
42 files changed, 2421 insertions(+), 451 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 047f9ff2e36c..b5ea75c4a4b4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
+obj-$(CONFIG_PMC_ATOM) += pmc_atom.o
###
# 64 bit specific files
@@ -117,4 +118,5 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
+ obj-$(CONFIG_KEXEC) += kexec-bzimage64.o
endif
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 163b22581472..3242e591fa82 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,5 +1,6 @@
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_APEI) += apei.o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
new file mode 100644
index 000000000000..c280df6b2aa2
--- /dev/null
+++ b/arch/x86/kernel/acpi/apei.c
@@ -0,0 +1,62 @@
+/*
+ * Arch-specific APEI-related functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <acpi/apei.h>
+
+#include <asm/mce.h>
+#include <asm/tlbflush.h>
+
+int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
+{
+#ifdef CONFIG_X86_MCE
+ int i;
+ struct acpi_hest_ia_corrected *cmc;
+ struct acpi_hest_ia_error_bank *mc_bank;
+
+ if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+ return 0;
+
+ cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
+ if (!cmc->enabled)
+ return 0;
+
+ /*
+ * We expect HEST to provide a list of MC banks that report errors
+ * in firmware first mode. Otherwise, return non-zero value to
+ * indicate that we are done parsing HEST.
+ */
+ if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
+ !cmc->num_hardware_banks)
+ return 1;
+
+ pr_info("HEST: Enabling Firmware First mode for corrected errors.\n");
+
+ mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+ for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+ mce_disable_bank(mc_bank->bank_number);
+#endif
+ return 1;
+}
+
+void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
+{
+#ifdef CONFIG_X86_MCE
+ apei_mce_report_mem_error(sev, mem_err);
+#endif
+}
+
+void arch_apei_flush_tlb_one(unsigned long addr)
+{
+ __flush_tlb_one(addr);
+}
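
For context, the generic APEI side invokes arch_apei_enable_cmcff() while walking the HEST table; a minimal sketch of such a caller, assuming the apei_hest_parse() iterator from drivers/acpi/apei/hest.c (names outside this diff, shown for illustration only):

/* Hypothetical caller sketch -- not part of this patch. */
static int hest_parse_cmc(struct acpi_hest_header *hest_hdr, void *data)
{
	return arch_apei_enable_cmcff(hest_hdr, data);
}

static void hest_ia32_init(void)
{
	/* Walk every HEST subtable; a non-zero return stops the walk. */
	apei_hest_parse(hest_parse_cmc, NULL);
}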
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 86281ffb96d6..a531f6564ed0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -74,10 +74,6 @@ int acpi_fix_pin2_polarity __initdata;
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif
-#ifndef __HAVE_ARCH_CMPXCHG
-#warning ACPI uses CMPXCHG, i486 and later hardware
-#endif
-
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index c3fcb5de5083..6a1e71bde323 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
/* "in progress" flag of arch_trigger_all_cpu_backtrace */
static unsigned long backtrace_flag;
-void arch_trigger_all_cpu_backtrace(void)
+void arch_trigger_all_cpu_backtrace(bool include_self)
{
int i;
+ int cpu = get_cpu();
- if (test_and_set_bit(0, &backtrace_flag))
+ if (test_and_set_bit(0, &backtrace_flag)) {
/*
* If there is already a trigger_all_cpu_backtrace() in progress
* (backtrace_flag == 1), don't output double cpu dump infos.
*/
+ put_cpu();
return;
+ }
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+ if (!include_self)
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
+ if (!cpumask_empty(to_cpumask(backtrace_mask))) {
+ pr_info("sending NMI to %s CPUs:\n",
+ (include_self ? "all" : "other"));
+ apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
+ }
/* Wait for up to 10 seconds for all CPUs to do the backtrace */
for (i = 0; i < 10 * 1000; i++) {
if (cpumask_empty(to_cpumask(backtrace_mask)))
break;
mdelay(1);
+ touch_softlockup_watchdog();
}
clear_bit(0, &backtrace_flag);
smp_mb__after_atomic();
+ put_cpu();
}
static int
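
The new include_self flag lets callers exclude the requesting CPU from the backtrace. The companion wrappers in include/linux/nmi.h from the same series would look roughly like this (a sketch, assuming that header change):

/* Sketch of the generic wrappers, assuming the matching nmi.h change. */
static inline bool trigger_all_cpu_backtrace(void)
{
	arch_trigger_all_cpu_backtrace(true);	/* include the calling CPU */
	return true;
}

static inline bool trigger_allbutself_cpu_backtrace(void)
{
	arch_trigger_all_cpu_backtrace(false);	/* skip the calling CPU */
	return true;
}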
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index f3a1f04ed4cb..584874451414 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -841,7 +841,6 @@ static int apm_do_idle(void)
u32 eax;
u8 ret = 0;
int idled = 0;
- int polling;
int err = 0;
if (!need_resched()) {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ce8b8ff0e0ef..60e5497681f5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
+#include <asm/smp.h>
#include <asm/pci-direct.h>
#ifdef CONFIG_X86_64
@@ -50,7 +51,6 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
return wrmsr_safe_regs(gprs);
}
-#ifdef CONFIG_X86_32
/*
* B step AMD K6 before B 9730xxxx have hardware bugs that can cause
* misexecution of code under Linux. Owners of such processors should
@@ -70,6 +70,7 @@ __asm__(".globl vide\n\t.align 4\nvide: ret");
static void init_amd_k5(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
* of the Elan at 0x000df000. Unfortunately, one of the Linux
@@ -83,11 +84,12 @@ static void init_amd_k5(struct cpuinfo_x86 *c)
if (inl(CBAR) & CBAR_ENB)
outl(0 | CBAR_KEY, CBAR);
}
+#endif
}
-
static void init_amd_k6(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
u32 l, h;
int mbytes = get_num_physpages() >> (20-PAGE_SHIFT);
@@ -176,10 +178,44 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
/* placeholder for any needed mods */
return;
}
+#endif
}
-static void amd_k7_smp_check(struct cpuinfo_x86 *c)
+static void init_amd_k7(struct cpuinfo_x86 *c)
{
+#ifdef CONFIG_X86_32
+ u32 l, h;
+
+ /*
+ * Bit 15 of Athlon specific MSR 15, needs to be 0
+ * to enable SSE on Palomino/Morgan/Barton CPU's.
+ * If the BIOS didn't enable it already, enable it here.
+ */
+ if (c->x86_model >= 6 && c->x86_model <= 10) {
+ if (!cpu_has(c, X86_FEATURE_XMM)) {
+ printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+ msr_clear_bit(MSR_K7_HWCR, 15);
+ set_cpu_cap(c, X86_FEATURE_XMM);
+ }
+ }
+
+ /*
+ * It's been determined by AMD that Athlons since model 8 stepping 1
+ * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+ * As per AMD technical note 27212 0.2
+ */
+ if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
+ rdmsr(MSR_K7_CLK_CTL, l, h);
+ if ((l & 0xfff00000) != 0x20000000) {
+ printk(KERN_INFO
+ "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
+ l, ((l & 0x000fffff)|0x20000000));
+ wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+ }
+ }
+
+ set_cpu_cap(c, X86_FEATURE_K7);
+
/* are we being called from identify_secondary_cpu()? */
if (!c->cpu_index)
return;
@@ -207,7 +243,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
((c->x86_model == 7) && (c->x86_mask >= 1)) ||
(c->x86_model > 7))
- if (cpu_has_mp)
+ if (cpu_has(c, X86_FEATURE_MP))
return;
/* If we get here, not a certified SMP capable AMD system. */
@@ -219,45 +255,8 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c)
WARN_ONCE(1, "WARNING: This combination of AMD"
" processors is not suitable for SMP.\n");
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE);
-}
-
-static void init_amd_k7(struct cpuinfo_x86 *c)
-{
- u32 l, h;
-
- /*
- * Bit 15 of Athlon specific MSR 15, needs to be 0
- * to enable SSE on Palomino/Morgan/Barton CPU's.
- * If the BIOS didn't enable it already, enable it here.
- */
- if (c->x86_model >= 6 && c->x86_model <= 10) {
- if (!cpu_has(c, X86_FEATURE_XMM)) {
- printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
- msr_clear_bit(MSR_K7_HWCR, 15);
- set_cpu_cap(c, X86_FEATURE_XMM);
- }
- }
-
- /*
- * It's been determined by AMD that Athlons since model 8 stepping 1
- * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
- * As per AMD technical note 27212 0.2
- */
- if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
- rdmsr(MSR_K7_CLK_CTL, l, h);
- if ((l & 0xfff00000) != 0x20000000) {
- printk(KERN_INFO
- "CPU: CLK_CTL MSR was %x. Reprogramming to %x\n",
- l, ((l & 0x000fffff)|0x20000000));
- wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
- }
- }
-
- set_cpu_cap(c, X86_FEATURE_K7);
-
- amd_k7_smp_check(c);
-}
#endif
+}
#ifdef CONFIG_NUMA
/*
@@ -446,6 +445,26 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c)
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
+
+#ifdef CONFIG_X86_64
+ if (c->x86 >= 0xf) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+ unsigned long pfn = tseg >> PAGE_SHIFT;
+
+ printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+ if (pfn_range_is_mapped(pfn, pfn + 1))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
+ }
+#endif
+
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
if (c->x86 > 0x10 ||
@@ -515,101 +534,74 @@ static const int amd_erratum_383[];
static const int amd_erratum_400[];
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
-static void init_amd(struct cpuinfo_x86 *c)
+static void init_amd_k8(struct cpuinfo_x86 *c)
{
- u32 dummy;
- unsigned long long value;
+ u32 level;
+ u64 value;
-#ifdef CONFIG_SMP
- /*
- * Disable TLB flush filter by setting HWCR.FFDIS on K8
- * bit 6 of msr C001_0015
- *
- * Errata 63 for SH-B3 steppings
- * Errata 122 for all steppings (F+ have it disabled by default)
- */
- if (c->x86 == 0xf)
- msr_set_bit(MSR_K7_HWCR, 6);
-#endif
-
- early_init_amd(c);
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ * Some BIOSes incorrectly force this feature, but only K8 revision D
+ * (model = 0x14) and later actually support it.
+ * (AMD Erratum #110, docId: 25759).
*/
- clear_cpu_cap(c, 0*32+31);
-
-#ifdef CONFIG_X86_64
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- if (c->x86 == 0xf) {
- u32 level;
-
- level = cpuid_eax(1);
- if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- /*
- * Some BIOSes incorrectly force this feature, but only K8
- * revision D (model = 0x14) and later actually support it.
- * (AMD Erratum #110, docId: 25759).
- */
- if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
- clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
- if (!rdmsrl_amd_safe(0xc001100d, &value)) {
- value &= ~(1ULL << 32);
- wrmsrl_amd_safe(0xc001100d, value);
- }
+ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) {
+ clear_cpu_cap(c, X86_FEATURE_LAHF_LM);
+ if (!rdmsrl_amd_safe(0xc001100d, &value)) {
+ value &= ~BIT_64(32);
+ wrmsrl_amd_safe(0xc001100d, value);
}
-
}
- if (c->x86 >= 0x10)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- /* get apicid instead of initial apic id from cpuid */
- c->apicid = hard_smp_processor_id();
-#else
+ if (!c->x86_model_id[0])
+ strcpy(c->x86_model_id, "Hammer");
+}
+
+static void init_amd_gh(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_64
+ /* do this for boot cpu */
+ if (c == &boot_cpu_data)
+ check_enable_amd_mmconf_dmi();
+
+ fam10h_check_enable_mmcfg();
+#endif
/*
- * FIXME: We should handle the K5 here. Set up the write
- * range and also turn on MSR 83 bits 4 and 31 (write alloc,
- * no bus pipeline)
+ * Disable GART TLB Walk Errors on Fam10h. We do this here because this
+ * is always needed when GART is enabled, even in a kernel which has no
+ * MCE support built in. BIOS should disable GartTlbWlk Errors already.
+ * If it doesn't, we do it here as suggested by the BKDG.
+ *
+ * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
*/
+ msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
- switch (c->x86) {
- case 4:
- init_amd_k5(c);
- break;
- case 5:
- init_amd_k6(c);
- break;
- case 6: /* An Athlon/Duron */
- init_amd_k7(c);
- break;
- }
+ /*
+ * On family 10h BIOS may not have properly enabled WC+ support, causing
+ * it to be converted to CD memtype. This may result in performance
+ * degradation for certain nested-paging guests. Prevent this conversion
+ * by clearing bit 24 in MSR_AMD64_BU_CFG2.
+ *
+ * NOTE: we want to use the _safe accessors so as not to #GP kvm
+ * guests on older kvm hosts.
+ */
+ msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
- /* K6s reports MCEs but don't actually have all the MSRs */
- if (c->x86 < 6)
- clear_cpu_cap(c, X86_FEATURE_MCE);
-#endif
+ if (cpu_has_amd_erratum(c, amd_erratum_383))
+ set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
+}
- /* Enable workaround for FXSAVE leak */
- if (c->x86 >= 6)
- set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
- if (!c->x86_model_id[0]) {
- switch (c->x86) {
- case 0xf:
- /* Should distinguish Models here, but this is only
- a fallback anyways. */
- strcpy(c->x86_model_id, "Hammer");
- break;
- }
- }
+static void init_amd_bd(struct cpuinfo_x86 *c)
+{
+ u64 value;
/* re-enable TopologyExtensions if switched off by BIOS */
- if ((c->x86 == 0x15) &&
- (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
+ if ((c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
!cpu_has(c, X86_FEATURE_TOPOEXT)) {
if (msr_set_bit(0xc0011005, 54) > 0) {
@@ -625,14 +617,60 @@ static void init_amd(struct cpuinfo_x86 *c)
* The way access filter has a performance penalty on some workloads.
* Disable it on the affected CPUs.
*/
- if ((c->x86 == 0x15) &&
- (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
-
+ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
if (!rdmsrl_safe(0xc0011021, &value) && !(value & 0x1E)) {
value |= 0x1E;
wrmsrl_safe(0xc0011021, value);
}
}
+}
+
+static void init_amd(struct cpuinfo_x86 *c)
+{
+ u32 dummy;
+
+#ifdef CONFIG_SMP
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
+ * bit 6 of msr C001_0015
+ *
+ * Errata 63 for SH-B3 steppings
+ * Errata 122 for all steppings (F+ have it disabled by default)
+ */
+ if (c->x86 == 0xf)
+ msr_set_bit(MSR_K7_HWCR, 6);
+#endif
+
+ early_init_amd(c);
+
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
+ if (c->x86 >= 0x10)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+
+ /* get apicid instead of initial apic id from cpuid */
+ c->apicid = hard_smp_processor_id();
+
+ /* K6s reports MCEs but don't actually have all the MSRs */
+ if (c->x86 < 6)
+ clear_cpu_cap(c, X86_FEATURE_MCE);
+
+ switch (c->x86) {
+ case 4: init_amd_k5(c); break;
+ case 5: init_amd_k6(c); break;
+ case 6: init_amd_k7(c); break;
+ case 0xf: init_amd_k8(c); break;
+ case 0x10: init_amd_gh(c); break;
+ case 0x15: init_amd_bd(c); break;
+ }
+
+ /* Enable workaround for FXSAVE leak */
+ if (c->x86 >= 6)
+ set_cpu_bug(c, X86_BUG_FXSAVE_LEAK);
cpu_detect_cache_sizes(c);
@@ -656,33 +694,6 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
}
-#ifdef CONFIG_X86_64
- if (c->x86 == 0x10) {
- /* do this for boot cpu */
- if (c == &boot_cpu_data)
- check_enable_amd_mmconf_dmi();
-
- fam10h_check_enable_mmcfg();
- }
-
- if (c == &boot_cpu_data && c->x86 >= 0xf) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- unsigned long pfn = tseg >> PAGE_SHIFT;
-
- printk(KERN_DEBUG "tseg: %010llx\n", tseg);
- if (pfn_range_is_mapped(pfn, pfn + 1))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
- }
-#endif
-
/*
* Family 0x12 and above processors have APIC timer
* running in deep C states.
@@ -690,34 +701,6 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- if (c->x86 == 0x10) {
- /*
- * Disable GART TLB Walk Errors on Fam10h. We do this here
- * because this is always needed when GART is enabled, even in a
- * kernel which has no MCE support built in.
- * BIOS should disable GartTlbWlk Errors already. If
- * it doesn't, do it here as suggested by the BKDG.
- *
- * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
- */
- msr_set_bit(MSR_AMD64_MCx_MASK(4), 10);
-
- /*
- * On family 10h BIOS may not have properly enabled WC+ support,
- * causing it to be converted to CD memtype. This may result in
- * performance degradation for certain nested-paging guests.
- * Prevent this conversion by clearing bit 24 in
- * MSR_AMD64_BU_CFG2.
- *
- * NOTE: we want to use the _safe accessors so as not to #GP kvm
- * guests on older kvm hosts.
- */
- msr_clear_bit(MSR_AMD64_BU_CFG2, 24);
-
- if (cpu_has_amd_erratum(c, amd_erratum_383))
- set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH);
- }
-
if (cpu_has_amd_erratum(c, amd_erratum_400))
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
@@ -741,11 +724,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
}
#endif
-static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
-{
- tlb_flushall_shift = 6;
-}
-
static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
{
u32 ebx, eax, ecx, edx;
@@ -793,8 +771,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
tlb_lli_2m[ENTRIES] = eax & mask;
tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
-
- cpu_set_tlb_flushall_shift(c);
}
static const struct cpu_dev amd_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ef1b93f18ed1..e4ab2b42bd6f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -148,6 +148,7 @@ static int __init x86_xsave_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
return 1;
@@ -161,6 +162,13 @@ static int __init x86_xsaveopt_setup(char *s)
}
__setup("noxsaveopt", x86_xsaveopt_setup);
+static int __init x86_xsaves_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+ return 1;
+}
+__setup("noxsaves", x86_xsaves_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override = -1;
static int disable_x86_serial_nr = 1;
@@ -481,26 +489,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO];
u16 __read_mostly tlb_lld_4m[NR_INFO];
u16 __read_mostly tlb_lld_1g[NR_INFO];
-/*
- * tlb_flushall_shift shows the balance point in replacing cr3 write
- * with multiple 'invlpg'. It will do this replacement when
- * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
- * If tlb_flushall_shift is -1, means the replacement will be disabled.
- */
-s8 __read_mostly tlb_flushall_shift = -1;
-
void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
if (this_cpu->c_detect_tlb)
this_cpu->c_detect_tlb(c);
printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
- "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
- "tlb_flushall_shift: %d\n",
+ "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
- tlb_lld_1g[ENTRIES], tlb_flushall_shift);
+ tlb_lld_1g[ENTRIES]);
}
void detect_ht(struct cpuinfo_x86 *c)
@@ -634,6 +633,15 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[9] = ebx;
}
+ /* Extended state features: level 0x0000000d */
+ if (c->cpuid_level >= 0x0000000d) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+
+ c->x86_capability[10] = eax;
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
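
Capability word 10 filled in above holds CPUID.(EAX=0DH,ECX=1):EAX. In the cpufeature.h of this era the xsave-related bits map roughly as follows (a sketch assuming the usual word*32+bit encoding; the XSAVEOPT removal from scattered.c later in this series is the other half of this move):

/* Illustrative bit layout of capability word 10 (assumed from the SDM). */
#define X86_FEATURE_XSAVEOPT	(10*32 + 0)	/* XSAVEOPT instruction */
#define X86_FEATURE_XSAVEC	(10*32 + 1)	/* XSAVEC instruction */
#define X86_FEATURE_XGETBV1	(10*32 + 2)	/* XGETBV with ECX = 1 */
#define X86_FEATURE_XSAVES	(10*32 + 3)	/* XSAVES/XRSTORS instructions */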
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index a80029035bf2..74e804ddc5c7 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -253,7 +253,7 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
*/
if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
(c->x86_mask < 0x6 || c->x86_mask == 0xb))
- set_cpu_cap(c, X86_FEATURE_11AP);
+ set_cpu_bug(c, X86_BUG_11AP);
#ifdef CONFIG_X86_INTEL_USERCOPY
@@ -370,6 +370,17 @@ static void init_intel(struct cpuinfo_x86 *c)
*/
detect_extended_topology(c);
+ if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+ /*
+ * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+ * detection.
+ */
+ c->x86_max_cores = intel_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+ detect_ht(c);
+#endif
+ }
+
l2 = init_intel_cacheinfo(c);
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
@@ -391,7 +402,7 @@ static void init_intel(struct cpuinfo_x86 *c)
if (c->x86 == 6 && cpu_has_clflush &&
(c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
- set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
+ set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR);
#ifdef CONFIG_X86_64
if (c->x86 == 15)
@@ -438,17 +449,6 @@ static void init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_P3);
#endif
- if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
- /*
- * let's use the legacy cpuid vector 0x1 and 0x4 for topology
- * detection.
- */
- c->x86_max_cores = intel_num_cpu_cores(c);
-#ifdef CONFIG_X86_32
- detect_ht(c);
-#endif
- }
-
/* Work around errata */
srat_detect_node(c);
@@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsigned char desc)
}
}
-static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
-{
- switch ((c->x86 << 8) + c->x86_model) {
- case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 0x61d: /* six-core 45 nm xeon "Dunnington" */
- tlb_flushall_shift = -1;
- break;
- case 0x63a: /* Ivybridge */
- tlb_flushall_shift = 2;
- break;
- case 0x61a: /* 45 nm nehalem, "Bloomfield" */
- case 0x61e: /* 45 nm nehalem, "Lynnfield" */
- case 0x625: /* 32 nm nehalem, "Clarkdale" */
- case 0x62c: /* 32 nm nehalem, "Gulftown" */
- case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
- case 0x62f: /* 32 nm Xeon E7 */
- case 0x62a: /* SandyBridge */
- case 0x62d: /* SandyBridge, "Romely-EP" */
- default:
- tlb_flushall_shift = 6;
- }
-}
-
static void intel_detect_tlb(struct cpuinfo_x86 *c)
{
int i, j, n;
@@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
for (j = 1 ; j < 16 ; j++)
intel_tlb_lookup(desc[j]);
}
- intel_tlb_flushall_shift_set(c);
}
static const struct cpu_dev intel_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a952e9c85b6f..c7035073dfc1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -461,7 +461,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- if (strict_strtoul(buf, 10, &val) < 0)
+ if (kstrtoul(buf, 10, &val) < 0)
return -EINVAL;
err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
@@ -511,7 +511,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
return -EINVAL;
- if (strict_strtoul(buf, 16, &val) < 0)
+ if (kstrtoul(buf, 16, &val) < 0)
return -EINVAL;
if (amd_set_subcaches(cpu, val))
@@ -730,6 +730,18 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
#endif
}
+#ifdef CONFIG_X86_HT
+ /*
+ * If cpu_llc_id is not yet set, this means cpuid_level < 4, which in
+ * turn means that the only possibility is SMT (as indicated in
+ * cpuid1). Since cpuid2 doesn't specify shared caches, and we know
+ * that SMT shares all caches, we can unconditionally set cpu_llc_id to
+ * c->phys_proc_id.
+ */
+ if (per_cpu(cpu_llc_id, cpu) == BAD_APICID)
+ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+#endif
+
c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
return l2;
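
The strict_strtoul() to kstrtoul() conversions here and in the mcheck files below all follow the same sysfs store pattern; a minimal self-contained sketch:

/* Generic sysfs store sketch using kstrtoul() (illustrative only). */
static ssize_t store_value(struct device *dev, struct device_attribute *attr,
			   const char *buf, size_t count)
{
	unsigned long val;

	/* Base 10 here; pass 0 to auto-detect 0x/0 prefixes. */
	if (kstrtoul(buf, 10, &val) < 0)
		return -EINVAL;

	/* ... apply val to the device state ... */
	return count;
}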
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index bb92f38153b2..bd9ccda8087f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2136,7 +2136,7 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
attr_to_bank(attr)->ctl = new;
@@ -2174,7 +2174,7 @@ static ssize_t set_ignore_ce(struct device *s,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
if (mca_cfg.ignore_ce ^ !!new) {
@@ -2198,7 +2198,7 @@ static ssize_t set_cmci_disabled(struct device *s,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
if (mca_cfg.cmci_disabled ^ !!new) {
@@ -2385,6 +2385,10 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
threshold_cpu_callback(action, cpu);
mce_device_remove(cpu);
mce_intel_hcpu_update(cpu);
+
+ /* intentionally ignoring frozen here */
+ if (!(action & CPU_TASKS_FROZEN))
+ cmci_rediscover();
break;
case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
@@ -2396,11 +2400,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
}
- if (action == CPU_POST_DEAD) {
- /* intentionally ignoring frozen here */
- cmci_rediscover();
- }
-
return NOTIFY_OK;
}
@@ -2451,6 +2450,12 @@ static __init int mcheck_init_device(void)
for_each_online_cpu(i) {
err = mce_device_create(i);
if (err) {
+ /*
+ * Register notifier anyway (and do not unreg it) so
+ * that we don't leave undeleted timers, see notifier
+ * callback above.
+ */
+ __register_hotcpu_notifier(&mce_cpu_notifier);
cpu_notifier_register_done();
goto err_device_create;
}
@@ -2471,10 +2476,6 @@ static __init int mcheck_init_device(void)
err_register:
unregister_syscore_ops(&mce_syscore_ops);
- cpu_notifier_register_begin();
- __unregister_hotcpu_notifier(&mce_cpu_notifier);
- cpu_notifier_register_done();
-
err_device_create:
/*
* We didn't keep track of which devices were created above, but
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 603df4f74640..1e49f8f41276 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -353,7 +353,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
if (!b->interrupt_capable)
return -EINVAL;
- if (strict_strtoul(buf, 0, &new) < 0)
+ if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
b->interrupt_enable = !!new;
@@ -372,7 +372,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
struct thresh_restart tr;
unsigned long new;
- if (strict_strtoul(buf, 0, &new) < 0)
+ if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
if (new > THRESHOLD_MAX)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 9a316b21df8b..3bdb95ae8c43 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -42,7 +42,7 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
-static DEFINE_SPINLOCK(cmci_discover_lock);
+static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
@@ -144,14 +144,14 @@ static void cmci_storm_disable_banks(void)
int bank;
u64 val;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
owned = __get_cpu_var(mce_banks_owned);
for_each_set_bit(bank, owned, MAX_NR_BANKS) {
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
}
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static bool cmci_storm_detect(void)
@@ -211,7 +211,7 @@ static void cmci_discover(int banks)
int i;
int bios_wrong_thresh = 0;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
int bios_zero_thresh = 0;
@@ -266,7 +266,7 @@ static void cmci_discover(int banks)
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) {
pr_info_once(
"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
@@ -316,10 +316,10 @@ void cmci_clear(void)
if (!cmci_supported(&banks))
return;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++)
__cmci_disable_bank(i);
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void cmci_rediscover_work_func(void *arg)
@@ -360,9 +360,9 @@ void cmci_disable_bank(int bank)
if (!cmci_supported(&banks))
return;
- spin_lock_irqsave(&cmci_discover_lock, flags);
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
__cmci_disable_bank(bank);
- spin_unlock_irqrestore(&cmci_discover_lock, flags);
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
}
static void intel_init_cmci(void)
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 2bf616505499..e2b22df964cd 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -1,23 +1,25 @@
#!/bin/sh
#
-# Generate the x86_cap_flags[] array from include/asm/cpufeature.h
+# Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeature.h
#
IN=$1
OUT=$2
-TABS="$(printf '\t\t\t\t\t')"
-trap 'rm "$OUT"' EXIT
+dump_array()
+{
+ ARRAY=$1
+ SIZE=$2
+ PFX=$3
+ POSTFIX=$4
-(
- echo "#ifndef _ASM_X86_CPUFEATURE_H"
- echo "#include <asm/cpufeature.h>"
- echo "#endif"
- echo ""
- echo "const char * const x86_cap_flags[NCAPINTS*32] = {"
+ PFX_SZ=$(echo $PFX | wc -c)
+ TABS="$(printf '\t\t\t\t\t')"
+
+ echo "const char * const $ARRAY[$SIZE] = {"
- # Iterate through any input lines starting with #define X86_FEATURE_
- sed -n -e 's/\t/ /g' -e 's/^ *# *define *X86_FEATURE_//p' $IN |
+ # Iterate through any input lines starting with #define $PFX
+ sed -n -e 's/\t/ /g' -e "s/^ *# *define *$PFX//p" $IN |
while read i
do
# Name is everything up to the first whitespace
@@ -31,11 +33,32 @@ trap 'rm "$OUT"' EXIT
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
- TABCOUNT=$(( ( 5*8 - 14 - $(echo "$NAME" | wc -c) ) / 8 ))
- printf "\t[%s]%.*s = %s,\n" \
- "X86_FEATURE_$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ if [ -n "$POSTFIX" ]; then
+ T=$(( $PFX_SZ + $(echo $POSTFIX | wc -c) + 2 ))
+ TABS="$(printf '\t\t\t\t\t\t')"
+ TABCOUNT=$(( ( 6*8 - ($T + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s - %s]%.*s = %s,\n" "$PFX$NAME" "$POSTFIX" "$TABCOUNT" "$TABS" "$VALUE"
+ else
+ TABCOUNT=$(( ( 5*8 - ($PFX_SZ + 1) - $(echo "$NAME" | wc -c) ) / 8 ))
+ printf "\t[%s]%.*s = %s,\n" "$PFX$NAME" "$TABCOUNT" "$TABS" "$VALUE"
+ fi
done
echo "};"
+}
+
+trap 'rm "$OUT"' EXIT
+
+(
+ echo "#ifndef _ASM_X86_CPUFEATURE_H"
+ echo "#include <asm/cpufeature.h>"
+ echo "#endif"
+ echo ""
+
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
+ echo ""
+
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
+
) > $OUT
trap - EXIT
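
For reference, the generated file now contains two arrays of roughly this shape (entry names are illustrative; the exact padding comes from the tab arithmetic above, and the bug array is indexed relative to NCAPINTS*32 via the POSTFIX argument):

/* Sketch of the generated output -- names chosen for illustration. */
const char * const x86_cap_flags[NCAPINTS*32] = {
	[X86_FEATURE_FPU]		 = "fpu",
	/* ... */
};

const char * const x86_bug_flags[NBUGINTS*32] = {
	[X86_BUG_F00F - NCAPINTS*32]	 = "f00f",
	/* ... */
};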
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2bdfbff8a4f6..2879ecdaac43 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -118,6 +118,9 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
continue;
if (event->attr.config1 & ~er->valid_mask)
return -EINVAL;
+ /* Check if the extra MSRs can be safely accessed */
+ if (!er->extra_msr_access)
+ return -ENXIO;
reg->idx = er->idx;
reg->config = event->attr.config1;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 3b2f9bdd974b..8ade93111e03 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -295,14 +295,16 @@ struct extra_reg {
u64 config_mask;
u64 valid_mask;
int idx; /* per_xxx->regs[] reg index */
+ bool extra_msr_access;
};
#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
- .event = (e), \
- .msr = (ms), \
- .config_mask = (m), \
- .valid_mask = (vm), \
- .idx = EXTRA_REG_##i, \
+ .event = (e), \
+ .msr = (ms), \
+ .config_mask = (m), \
+ .valid_mask = (vm), \
+ .idx = EXTRA_REG_##i, \
+ .extra_msr_access = true, \
}
#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
index 3bbdf4cd38b9..30790d798e6b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c
@@ -294,31 +294,41 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
cpu_to_node(cpu));
}
-static void amd_uncore_cpu_up_prepare(unsigned int cpu)
+static int amd_uncore_cpu_up_prepare(unsigned int cpu)
{
- struct amd_uncore *uncore;
+ struct amd_uncore *uncore_nb = NULL, *uncore_l2;
if (amd_uncore_nb) {
- uncore = amd_uncore_alloc(cpu);
- uncore->cpu = cpu;
- uncore->num_counters = NUM_COUNTERS_NB;
- uncore->rdpmc_base = RDPMC_BASE_NB;
- uncore->msr_base = MSR_F15H_NB_PERF_CTL;
- uncore->active_mask = &amd_nb_active_mask;
- uncore->pmu = &amd_nb_pmu;
- *per_cpu_ptr(amd_uncore_nb, cpu) = uncore;
+ uncore_nb = amd_uncore_alloc(cpu);
+ if (!uncore_nb)
+ goto fail;
+ uncore_nb->cpu = cpu;
+ uncore_nb->num_counters = NUM_COUNTERS_NB;
+ uncore_nb->rdpmc_base = RDPMC_BASE_NB;
+ uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
+ uncore_nb->active_mask = &amd_nb_active_mask;
+ uncore_nb->pmu = &amd_nb_pmu;
+ *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
}
if (amd_uncore_l2) {
- uncore = amd_uncore_alloc(cpu);
- uncore->cpu = cpu;
- uncore->num_counters = NUM_COUNTERS_L2;
- uncore->rdpmc_base = RDPMC_BASE_L2;
- uncore->msr_base = MSR_F16H_L2I_PERF_CTL;
- uncore->active_mask = &amd_l2_active_mask;
- uncore->pmu = &amd_l2_pmu;
- *per_cpu_ptr(amd_uncore_l2, cpu) = uncore;
+ uncore_l2 = amd_uncore_alloc(cpu);
+ if (!uncore_l2)
+ goto fail;
+ uncore_l2->cpu = cpu;
+ uncore_l2->num_counters = NUM_COUNTERS_L2;
+ uncore_l2->rdpmc_base = RDPMC_BASE_L2;
+ uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL;
+ uncore_l2->active_mask = &amd_l2_active_mask;
+ uncore_l2->pmu = &amd_l2_pmu;
+ *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2;
}
+
+ return 0;
+
+fail:
+ kfree(uncore_nb);
+ return -ENOMEM;
}
static struct amd_uncore *
@@ -441,7 +451,7 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
if (!--uncore->refcnt)
kfree(uncore);
- *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
+ *per_cpu_ptr(uncores, cpu) = NULL;
}
static void amd_uncore_cpu_dead(unsigned int cpu)
@@ -461,7 +471,8 @@ amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action,
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- amd_uncore_cpu_up_prepare(cpu);
+ if (amd_uncore_cpu_up_prepare(cpu))
+ return notifier_from_errno(-ENOMEM);
break;
case CPU_STARTING:
@@ -501,20 +512,33 @@ static void __init init_cpu_already_online(void *dummy)
amd_uncore_cpu_online(cpu);
}
+static void cleanup_cpu_online(void *dummy)
+{
+ unsigned int cpu = smp_processor_id();
+
+ amd_uncore_cpu_dead(cpu);
+}
+
static int __init amd_uncore_init(void)
{
- unsigned int cpu;
+ unsigned int cpu, cpu2;
int ret = -ENODEV;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
- return -ENODEV;
+ goto fail_nodev;
if (!cpu_has_topoext)
- return -ENODEV;
+ goto fail_nodev;
if (cpu_has_perfctr_nb) {
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
- perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+ if (!amd_uncore_nb) {
+ ret = -ENOMEM;
+ goto fail_nb;
+ }
+ ret = perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1);
+ if (ret)
+ goto fail_nb;
printk(KERN_INFO "perf: AMD NB counters detected\n");
ret = 0;
@@ -522,20 +546,28 @@ static int __init amd_uncore_init(void)
if (cpu_has_perfctr_l2) {
amd_uncore_l2 = alloc_percpu(struct amd_uncore *);
- perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+ if (!amd_uncore_l2) {
+ ret = -ENOMEM;
+ goto fail_l2;
+ }
+ ret = perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1);
+ if (ret)
+ goto fail_l2;
printk(KERN_INFO "perf: AMD L2I counters detected\n");
ret = 0;
}
if (ret)
- return -ENODEV;
+ goto fail_nodev;
cpu_notifier_register_begin();
/* init cpus already online before registering for hotplug notifier */
for_each_online_cpu(cpu) {
- amd_uncore_cpu_up_prepare(cpu);
+ ret = amd_uncore_cpu_up_prepare(cpu);
+ if (ret)
+ goto fail_online;
smp_call_function_single(cpu, init_cpu_already_online, NULL, 1);
}
@@ -543,5 +575,30 @@ static int __init amd_uncore_init(void)
cpu_notifier_register_done();
return 0;
+
+
+fail_online:
+ for_each_online_cpu(cpu2) {
+ if (cpu2 == cpu)
+ break;
+ smp_call_function_single(cpu2, cleanup_cpu_online, NULL, 1);
+ }
+ cpu_notifier_register_done();
+
+ /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */
+ amd_uncore_nb = amd_uncore_l2 = NULL;
+ if (cpu_has_perfctr_l2)
+ perf_pmu_unregister(&amd_l2_pmu);
+fail_l2:
+ if (cpu_has_perfctr_nb)
+ perf_pmu_unregister(&amd_nb_pmu);
+ if (amd_uncore_l2)
+ free_percpu(amd_uncore_l2);
+fail_nb:
+ if (amd_uncore_nb)
+ free_percpu(amd_uncore_nb);
+
+fail_nodev:
+ return ret;
}
device_initcall(amd_uncore_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index adb02aa62af5..2502d0d9d246 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1382,6 +1382,15 @@ again:
intel_pmu_lbr_read();
/*
+ * The CondChgd bit (63) doesn't indicate any overflow status;
+ * ignore it and just clear the bit.
+ */
+ if (__test_and_clear_bit(63, (unsigned long *)&status)) {
+ if (!status)
+ goto done;
+ }
+
+ /*
* PEBS overflow sets bit 62 in the global status register
*/
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
@@ -2173,6 +2182,41 @@ static void intel_snb_check_microcode(void)
}
}
+/*
+ * Under certain circumstances, accessing certain MSRs may cause a #GP.
+ * This function tests whether the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+ u64 val_old, val_new, val_tmp;
+
+ /*
+ * Read the current value, change it and read it back to see if it
+ * matches, this is needed to detect certain hardware emulators
+ * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+ */
+ if (rdmsrl_safe(msr, &val_old))
+ return false;
+
+ /*
+ * Only change the bits which can be updated by wrmsrl.
+ */
+ val_tmp = val_old ^ mask;
+ if (wrmsrl_safe(msr, val_tmp) ||
+ rdmsrl_safe(msr, &val_new))
+ return false;
+
+ if (val_new != val_tmp)
+ return false;
+
+ /* At this point the MSR is known to be safely accessible.
+ * Restore the old value and return.
+ */
+ wrmsrl(msr, val_old);
+
+ return true;
+}
+
static __init void intel_sandybridge_quirk(void)
{
x86_pmu.check_microcode = intel_snb_check_microcode;
@@ -2262,7 +2306,8 @@ __init int intel_pmu_init(void)
union cpuid10_ebx ebx;
struct event_constraint *c;
unsigned int unused;
- int version;
+ struct extra_reg *er;
+ int version, i;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -2465,6 +2510,9 @@ __init int intel_pmu_init(void)
case 62: /* IvyBridge EP */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ /* dTLB-load-misses on IVB is different from SNB */
+ hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
sizeof(hw_cache_extra_regs));
@@ -2565,6 +2613,34 @@ __init int intel_pmu_init(void)
}
}
+ /*
+ * Accessing LBR MSRs may cause a #GP under certain circumstances,
+ * e.g. KVM doesn't support the LBR MSRs.
+ * Check all LBR MSRs here and disable LBR access if any of them
+ * cannot be accessed.
+ */
+ if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+ x86_pmu.lbr_nr = 0;
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+ check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+ x86_pmu.lbr_nr = 0;
+ }
+
+ /*
+ * Accessing an extra MSR may cause a #GP under certain circumstances,
+ * e.g. KVM doesn't support offcore events.
+ * Check all extra_regs here.
+ */
+ if (x86_pmu.extra_regs) {
+ for (er = x86_pmu.extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+ }
+
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
x86_pmu.max_period = x86_pmu.cntval_mask;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 980970cb744d..696ade311ded 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -311,9 +311,11 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node);
- if (unlikely(!buffer))
+ buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ if (unlikely(!buffer)) {
+ WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
+ }
max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
thresh = max / 16;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 65bbbea38b9c..cfc6f9dfcd90 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -550,16 +550,16 @@ static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
@@ -1222,6 +1222,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+
SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
@@ -1245,7 +1246,7 @@ static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
@@ -2946,10 +2947,7 @@ again:
* extra registers. If we failed to take an extra
* register, try the alternative.
*/
- if (idx % 2)
- idx--;
- else
- idx++;
+ idx ^= 1;
if (idx != reg1->idx % 6) {
if (idx == 2)
config1 >>= 8;
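
The idx ^= 1 rewrite above is behavior-preserving: XOR with 1 flips the low bit, so it toggles within each even/odd index pair exactly as the removed if/else did. A tiny equivalence sketch:

/* Equivalence sketch: both forms toggle within an even/odd pair. */
static inline int alt_idx(int idx)
{
	return idx ^ 1;	/* 0<->1, 2<->3, ... == old "if (idx % 2) idx--; else idx++;" */
}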
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 06fe3ed8b851..5433658e598d 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -97,6 +97,14 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+ seq_printf(m, "\nbugs\t\t:");
+ for (i = 0; i < 32*NBUGINTS; i++) {
+ unsigned int bug_bit = 32*NCAPINTS + i;
+
+ if (cpu_has_bug(c, bug_bit) && x86_bug_flags[i])
+ seq_printf(m, " %s", x86_bug_flags[i]);
+ }
+
seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
c->loops_per_jiffy/(500000/HZ),
(c->loops_per_jiffy/(5000/HZ)) % 100);
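
The indexing in the bugs loop works because bug bits live directly above the NCAPINTS capability words; the relevant definitions look roughly like this (a sketch of the cpufeature.h side, which is not part of this hunk):

/* Assumed cpufeature.h definitions backing the loop above. */
#define X86_BUG(x)		(NCAPINTS*32 + (x))
#define cpu_has_bug(c, bit)	cpu_has(c, (bit))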
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index b6f794aa1693..4a8013d55947 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -38,7 +38,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 507de8066594..0553a34fa0df 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -4,9 +4,14 @@
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
*
* Copyright (C) IBM Corporation, 2004. All rights reserved.
+ * Copyright (C) Red Hat Inc., 2014. All rights reserved.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
*
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
@@ -16,6 +21,7 @@
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -28,6 +34,45 @@
#include <asm/reboot.h>
#include <asm/virtext.h>
+/* Alignment required for elf header segment */
+#define ELF_CORE_HEADER_ALIGN 4096
+
+/* This primarily represents number of split ranges due to exclusion */
+#define CRASH_MAX_RANGES 16
+
+struct crash_mem_range {
+ u64 start, end;
+};
+
+struct crash_mem {
+ unsigned int nr_ranges;
+ struct crash_mem_range ranges[CRASH_MAX_RANGES];
+};
+
+/* Misc data about ram ranges needed to prepare elf headers */
+struct crash_elf_data {
+ struct kimage *image;
+ /*
+ * Total number of ram ranges we have after various adjustments for
+ * GART, crash reserved region etc.
+ */
+ unsigned int max_nr_ranges;
+ unsigned long gart_start, gart_end;
+
+ /* Pointer to elf header */
+ void *ehdr;
+ /* Pointer to next phdr */
+ void *bufp;
+ struct crash_mem mem;
+};
+
+/* Used while preparing memory map entries for second kernel */
+struct crash_memmap_data {
+ struct boot_params *params;
+ /* Type of memory */
+ unsigned int type;
+};
+
int in_crash_kexec;
/*
@@ -39,6 +84,7 @@ int in_crash_kexec;
*/
crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+unsigned long crash_zero_bytes;
static inline void cpu_crash_vmclear_loaded_vmcss(void)
{
@@ -135,3 +181,520 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#endif
crash_save_cpu(regs, safe_smp_processor_id());
}
+
+#ifdef CONFIG_X86_64
+
+static int get_nr_ram_ranges_callback(unsigned long start_pfn,
+ unsigned long nr_pfn, void *arg)
+{
+ int *nr_ranges = arg;
+
+ (*nr_ranges)++;
+ return 0;
+}
+
+static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_elf_data *ced = arg;
+
+ ced->gart_start = start;
+ ced->gart_end = end;
+
+ /* Not expecting more than 1 gart aperture */
+ return 1;
+}
+
+
+/* Gather all the required information to prepare elf headers for ram regions */
+static void fill_up_crash_elf_data(struct crash_elf_data *ced,
+ struct kimage *image)
+{
+ unsigned int nr_ranges = 0;
+
+ ced->image = image;
+
+ walk_system_ram_range(0, -1, &nr_ranges,
+ get_nr_ram_ranges_callback);
+
+ ced->max_nr_ranges = nr_ranges;
+
+ /*
+ * We don't create ELF headers for the GART aperture, as an attempt
+ * to dump this memory in the second kernel leads to a hang/crash.
+ * If a GART aperture is present, that region needs to be excluded,
+ * which could require an extra phdr.
+ */
+ walk_iomem_res("GART", IORESOURCE_MEM, 0, -1,
+ ced, get_gart_ranges_callback);
+
+ /*
+ * If we have a GART region, excluding it could potentially split
+ * a memory range, resulting in an extra header. Account for that.
+ */
+ if (ced->gart_end)
+ ced->max_nr_ranges++;
+
+ /* Exclusion of crash region could split memory ranges */
+ ced->max_nr_ranges++;
+
+ /* If crashk_low_res is not 0, another range split possible */
+ if (crashk_low_res.end != 0)
+ ced->max_nr_ranges++;
+}
+
+static int exclude_mem_range(struct crash_mem *mem,
+ unsigned long long mstart, unsigned long long mend)
+{
+ int i, j;
+ unsigned long long start, end;
+ struct crash_mem_range temp_range = {0, 0};
+
+ for (i = 0; i < mem->nr_ranges; i++) {
+ start = mem->ranges[i].start;
+ end = mem->ranges[i].end;
+
+ if (mstart > end || mend < start)
+ continue;
+
+ /* Truncate any area outside of range */
+ if (mstart < start)
+ mstart = start;
+ if (mend > end)
+ mend = end;
+
+ /* Found completely overlapping range */
+ if (mstart == start && mend == end) {
+ mem->ranges[i].start = 0;
+ mem->ranges[i].end = 0;
+ if (i < mem->nr_ranges - 1) {
+ /* Shift rest of the ranges to left */
+ for (j = i; j < mem->nr_ranges - 1; j++) {
+ mem->ranges[j].start =
+ mem->ranges[j+1].start;
+ mem->ranges[j].end =
+ mem->ranges[j+1].end;
+ }
+ }
+ mem->nr_ranges--;
+ return 0;
+ }
+
+ if (mstart > start && mend < end) {
+ /* Split original range */
+ mem->ranges[i].end = mstart - 1;
+ temp_range.start = mend + 1;
+ temp_range.end = end;
+ } else if (mstart != start)
+ mem->ranges[i].end = mstart - 1;
+ else
+ mem->ranges[i].start = mend + 1;
+ break;
+ }
+
+ /* If a split happened, add the new range to the array */
+ if (!temp_range.end)
+ return 0;
+
+ /* Split happened */
+ if (i == CRASH_MAX_RANGES - 1) {
+ pr_err("Too many crash ranges after split\n");
+ return -ENOMEM;
+ }
+
+ /* Location where new range should go */
+ j = i + 1;
+ if (j < mem->nr_ranges) {
+ /* Move over all ranges one slot towards the end */
+ for (i = mem->nr_ranges - 1; i >= j; i--)
+ mem->ranges[i + 1] = mem->ranges[i];
+ }
+
+ mem->ranges[j].start = temp_range.start;
+ mem->ranges[j].end = temp_range.end;
+ mem->nr_ranges++;
+ return 0;
+}
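
A worked example clarifies the splitting logic: excluding the middle of a single range produces two ranges. A minimal sketch of driving exclude_mem_range() directly (standalone usage for illustration; in this patch it is only ever called on ced->mem):

/* Illustration: exclude [0xc000, 0xdfff] from [0x8000, 0xffff]. */
struct crash_mem m = {
	.nr_ranges = 1,
	.ranges = { { .start = 0x8000, .end = 0xffff } },
};

exclude_mem_range(&m, 0xc000, 0xdfff);
/* Result: m.nr_ranges == 2, with [0x8000, 0xbfff] and [0xe000, 0xffff]. */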
+
+/*
+ * Look for any unwanted ranges between mstart and mend, and remove them.
+ * This might lead to splits; the resulting ranges are put in the
+ * ced->mem.ranges[] array.
+ */
+static int elf_header_exclude_ranges(struct crash_elf_data *ced,
+ unsigned long long mstart, unsigned long long mend)
+{
+ struct crash_mem *cmem = &ced->mem;
+ int ret = 0;
+
+ memset(cmem->ranges, 0, sizeof(cmem->ranges));
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude crashkernel region */
+ ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+ if (ret)
+ return ret;
+
+ ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end);
+ if (ret)
+ return ret;
+
+ /* Exclude GART region */
+ if (ced->gart_end) {
+ ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
+static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_elf_data *ced = arg;
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long mstart, mend;
+ struct kimage *image = ced->image;
+ struct crash_mem *cmem;
+ int ret, i;
+
+ ehdr = ced->ehdr;
+
+ /* Exclude unwanted mem ranges */
+ ret = elf_header_exclude_ranges(ced, start, end);
+ if (ret)
+ return ret;
+
+ /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */
+ cmem = &ced->mem;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ mstart = cmem->ranges[i].start;
+ mend = cmem->ranges[i].end;
+
+ phdr = ced->bufp;
+ ced->bufp += sizeof(Elf64_Phdr);
+
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_offset = mstart;
+
+ /*
+ * If a range matches backup region, adjust offset to backup
+ * segment.
+ */
+ if (mstart == image->arch.backup_src_start &&
+ (mend - mstart + 1) == image->arch.backup_src_sz)
+ phdr->p_offset = image->arch.backup_load_addr;
+
+ phdr->p_paddr = mstart;
+ phdr->p_vaddr = (unsigned long long) __va(mstart);
+ phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
+ phdr->p_align = 0;
+ ehdr->e_phnum++;
+ pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+ phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+ ehdr->e_phnum, phdr->p_offset);
+ }
+
+ return ret;
+}
+
+static int prepare_elf64_headers(struct crash_elf_data *ced,
+ void **addr, unsigned long *sz)
+{
+ Elf64_Ehdr *ehdr;
+ Elf64_Phdr *phdr;
+ unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+ unsigned char *buf, *bufp;
+ unsigned int cpu;
+ unsigned long long notes_addr;
+ int ret;
+
+ /* extra phdr for vmcoreinfo elf note */
+ nr_phdr = nr_cpus + 1;
+ nr_phdr += ced->max_nr_ranges;
+
+ /*
+ * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+ * area on x86_64 (ffffffff80000000 - ffffffffa0000000).
+ * This appears to be required by tools like gdb, so the same physical
+ * memory ends up mapped in two ELF headers: one with kernel-text
+ * virtual addresses, the other with __va(physical) addresses.
+ */
+
+ nr_phdr++;
+ elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+ elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+ buf = vzalloc(elf_sz);
+ if (!buf)
+ return -ENOMEM;
+
+ bufp = buf;
+ ehdr = (Elf64_Ehdr *)bufp;
+ bufp += sizeof(Elf64_Ehdr);
+ memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+ ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+ ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+ ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+ ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+ ehdr->e_type = ET_CORE;
+ ehdr->e_machine = ELF_ARCH;
+ ehdr->e_version = EV_CURRENT;
+ ehdr->e_phoff = sizeof(Elf64_Ehdr);
+ ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+ ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+ /* Prepare one phdr of type PT_NOTE for each present cpu */
+ for_each_present_cpu(cpu) {
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_NOTE;
+ notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+ phdr->p_offset = phdr->p_paddr = notes_addr;
+ phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+ (ehdr->e_phnum)++;
+ }
+
+ /* Prepare one PT_NOTE header for vmcoreinfo */
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_NOTE;
+ phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+ phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
+ (ehdr->e_phnum)++;
+
+#ifdef CONFIG_X86_64
+ /* Prepare PT_LOAD type program header for kernel text region */
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = PT_LOAD;
+ phdr->p_flags = PF_R|PF_W|PF_X;
+ phdr->p_vaddr = (Elf64_Addr)_text;
+ phdr->p_filesz = phdr->p_memsz = _end - _text;
+ phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+ (ehdr->e_phnum)++;
+#endif
+
+ /* Prepare PT_LOAD headers for system ram chunks. */
+ ced->ehdr = ehdr;
+ ced->bufp = bufp;
+ ret = walk_system_ram_res(0, -1, ced,
+ prepare_elf64_ram_headers_callback);
+ if (ret < 0)
+ return ret;
+
+ *addr = buf;
+ *sz = elf_sz;
+ return 0;
+}
+
+/* Prepare elf headers. Return addr and size */
+static int prepare_elf_headers(struct kimage *image, void **addr,
+ unsigned long *sz)
+{
+ struct crash_elf_data *ced;
+ int ret;
+
+ ced = kzalloc(sizeof(*ced), GFP_KERNEL);
+ if (!ced)
+ return -ENOMEM;
+
+ fill_up_crash_elf_data(ced, image);
+
+ /* By default prepare 64bit headers */
+ ret = prepare_elf64_headers(ced, addr, sz);
+ kfree(ced);
+ return ret;
+}
+
+static int add_e820_entry(struct boot_params *params, struct e820entry *entry)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = params->e820_entries;
+ if (nr_e820_entries >= E820MAX)
+ return 1;
+
+ memcpy(&params->e820_map[nr_e820_entries], entry,
+ sizeof(struct e820entry));
+ params->e820_entries++;
+ return 0;
+}
+
+static int memmap_entry_callback(u64 start, u64 end, void *arg)
+{
+ struct crash_memmap_data *cmd = arg;
+ struct boot_params *params = cmd->params;
+ struct e820entry ei;
+
+ ei.addr = start;
+ ei.size = end - start + 1;
+ ei.type = cmd->type;
+ add_e820_entry(params, &ei);
+
+ return 0;
+}
+
+static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
+ unsigned long long mstart,
+ unsigned long long mend)
+{
+ unsigned long start, end;
+ int ret = 0;
+
+ cmem->ranges[0].start = mstart;
+ cmem->ranges[0].end = mend;
+ cmem->nr_ranges = 1;
+
+ /* Exclude Backup region */
+ start = image->arch.backup_load_addr;
+ end = start + image->arch.backup_src_sz - 1;
+ ret = exclude_mem_range(cmem, start, end);
+ if (ret)
+ return ret;
+
+ /* Exclude elf header region */
+ start = image->arch.elf_load_addr;
+ end = start + image->arch.elf_headers_sz - 1;
+ return exclude_mem_range(cmem, start, end);
+}
+
+/* Prepare memory map for crash dump kernel */
+int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
+{
+ int i, ret = 0;
+ unsigned long flags;
+ struct e820entry ei;
+ struct crash_memmap_data cmd;
+ struct crash_mem *cmem;
+
+ cmem = vzalloc(sizeof(struct crash_mem));
+ if (!cmem)
+ return -ENOMEM;
+
+ memset(&cmd, 0, sizeof(struct crash_memmap_data));
+ cmd.params = params;
+
+ /* Add first 640K segment */
+ ei.addr = image->arch.backup_src_start;
+ ei.size = image->arch.backup_src_sz;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+
+ /* Add ACPI tables */
+ cmd.type = E820_ACPI;
+ flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add ACPI Non-volatile Storage */
+ cmd.type = E820_NVS;
+ walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
+ memmap_entry_callback);
+
+ /* Add crashk_low_res region */
+ if (crashk_low_res.end) {
+ ei.addr = crashk_low_res.start;
+ ei.size = crashk_low_res.end - crashk_low_res.start + 1;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+ /* Exclude some ranges from crashk_res and add rest to memmap */
+ ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
+ crashk_res.end);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < cmem->nr_ranges; i++) {
+ ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;
+
+ /* If entry is less than a page, skip it */
+ if (ei.size < PAGE_SIZE)
+ continue;
+ ei.addr = cmem->ranges[i].start;
+ ei.type = E820_RAM;
+ add_e820_entry(params, &ei);
+ }
+
+out:
+ vfree(cmem);
+ return ret;
+}
+
+static int determine_backup_region(u64 start, u64 end, void *arg)
+{
+ struct kimage *image = arg;
+
+ image->arch.backup_src_start = start;
+ image->arch.backup_src_sz = end - start + 1;
+
+ /* Expecting only one range for backup region */
+ return 1;
+}
+
+int crash_load_segments(struct kimage *image)
+{
+ unsigned long src_start, src_sz, elf_sz;
+ void *elf_addr;
+ int ret;
+
+ /*
+ * Determine and load a segment for the backup area. The first 640K
+ * RAM region is the backup source.
+ */
+
+ ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
+ image, determine_backup_region);
+
+ /* Zero or positive return values are ok */
+ if (ret < 0)
+ return ret;
+
+ src_start = image->arch.backup_src_start;
+ src_sz = image->arch.backup_src_sz;
+
+ /* Add backup segment. */
+ if (src_sz) {
+ /*
+ * Ideally there is no source for the backup segment, as it is
+ * copied over in purgatory after the crash. Just add a zero-filled
+ * segment for now to make sure the checksum logic works fine.
+ */
+ ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
+ sizeof(crash_zero_bytes), src_sz,
+ PAGE_SIZE, 0, -1, 0,
+ &image->arch.backup_load_addr);
+ if (ret)
+ return ret;
+ pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
+ image->arch.backup_load_addr, src_start, src_sz);
+ }
+
+ /* Prepare elf headers and add a segment */
+ ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
+ if (ret)
+ return ret;
+
+ image->arch.elf_headers = elf_addr;
+ image->arch.elf_headers_sz = elf_sz;
+
+ ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
+ ELF_CORE_HEADER_ALIGN, 0, -1, 0,
+ &image->arch.elf_load_addr);
+ if (ret) {
+ vfree((void *)image->arch.elf_headers);
+ return ret;
+ }
+ pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ image->arch.elf_load_addr, elf_sz, elf_sz);
+
+ return ret;
+}
+
+#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index f0da82b8e634..47c410d99f5d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -423,8 +423,9 @@ sysenter_past_esp:
jnz sysenter_audit
sysenter_do_call:
cmpl $(NR_syscalls), %eax
- jae syscall_badsys
+ jae sysenter_badsys
call *sys_call_table(,%eax,4)
+sysenter_after_call:
movl %eax,PT_EAX(%esp)
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
@@ -501,6 +502,7 @@ ENTRY(system_call)
jae syscall_badsys
syscall_call:
call *sys_call_table(,%eax,4)
+syscall_after_call:
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
@@ -674,8 +676,13 @@ syscall_fault:
END(syscall_fault)
syscall_badsys:
- movl $-ENOSYS,PT_EAX(%esp)
- jmp resume_userspace
+ movl $-ENOSYS,%eax
+ jmp syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+ movl $-ENOSYS,%eax
+ jmp sysenter_after_call
END(syscall_badsys)
CFI_ENDPROC
@@ -1052,9 +1059,6 @@ ENTRY(mcount)
END(mcount)
ENTRY(ftrace_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
pushl %eax
pushl %ecx
pushl %edx
@@ -1086,8 +1090,6 @@ END(ftrace_caller)
ENTRY(ftrace_regs_caller)
pushf /* push flags before compare (in cs location) */
- cmpl $0, function_trace_stop
- jne ftrace_restore_flags
/*
* i386 does not save SS and ESP when coming from kernel.
@@ -1146,7 +1148,6 @@ GLOBAL(ftrace_regs_call)
popf /* Pop flags at end (no addl to corrupt flags) */
jmp ftrace_ret
-ftrace_restore_flags:
popf
jmp ftrace_stub
#else /* ! CONFIG_DYNAMIC_FTRACE */
@@ -1155,9 +1156,6 @@ ENTRY(mcount)
cmpl $__PAGE_OFFSET, %esp
jb ftrace_stub /* Paging not enabled yet? */
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
cmpl $ftrace_stub, ftrace_trace_function
jnz trace
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b25ca969edd2..2fac1343a90b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -207,7 +207,6 @@ ENDPROC(native_usergs_sysret64)
*/
.macro XCPT_FRAME start=1 offset=0
INTR_FRAME \start, RIP+\offset-ORIG_RAX
- /*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
.endm
/*
@@ -287,21 +286,21 @@ ENDPROC(native_usergs_sysret64)
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
+ movq %rdi, RDI+8(%rsp)
+ movq %rsi, RSI+8(%rsp)
movq_cfi rdx, RDX+8
movq_cfi rcx, RCX+8
movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
+ movq %r8, R8+8(%rsp)
+ movq %r9, R9+8(%rsp)
+ movq %r10, R10+8(%rsp)
+ movq %r11, R11+8(%rsp)
movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
+ movq %rbp, RBP+8(%rsp)
+ movq %r12, R12+8(%rsp)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
movl $1,%ebx
movl $MSR_GS_BASE,%ecx
rdmsr
@@ -830,27 +829,24 @@ restore_args:
RESTORE_ARGS 1,8,1
irq_return:
+ INTERRUPT_RETURN
+
+ENTRY(native_iret)
/*
* Are we returning to a stack segment from the LDT? Note: in
* 64-bit mode SS:RSP on the exception stack is always valid.
*/
#ifdef CONFIG_X86_ESPFIX64
testb $4,(SS-RIP)(%rsp)
- jnz irq_return_ldt
+ jnz native_irq_return_ldt
#endif
-irq_return_iret:
- INTERRUPT_RETURN
- _ASM_EXTABLE(irq_return_iret, bad_iret)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
+native_irq_return_iret:
iretq
- _ASM_EXTABLE(native_iret, bad_iret)
-#endif
+ _ASM_EXTABLE(native_irq_return_iret, bad_iret)
#ifdef CONFIG_X86_ESPFIX64
-irq_return_ldt:
+native_irq_return_ldt:
pushq_cfi %rax
pushq_cfi %rdi
SWAPGS
@@ -872,7 +868,7 @@ irq_return_ldt:
SWAPGS
movq %rax,%rsp
popq_cfi %rax
- jmp irq_return_iret
+ jmp native_irq_return_iret
#endif
.section .fixup,"ax"
@@ -956,13 +952,8 @@ __do_double_fault:
cmpl $__KERNEL_CS,CS(%rdi)
jne do_double_fault
movq RIP(%rdi),%rax
- cmpq $irq_return_iret,%rax
-#ifdef CONFIG_PARAVIRT
- je 1f
- cmpq $native_iret,%rax
-#endif
+ cmpq $native_irq_return_iret,%rax
jne do_double_fault /* This shouldn't happen... */
-1:
movq PER_CPU_VAR(kernel_stack),%rax
subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */
movq %rax,RSP(%rdi)
@@ -1395,21 +1386,21 @@ ENTRY(error_entry)
CFI_ADJUST_CFA_OFFSET 15*8
/* oldrax contains error code */
cld
- movq_cfi rdi, RDI+8
- movq_cfi rsi, RSI+8
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq_cfi r8, R8+8
- movq_cfi r9, R9+8
- movq_cfi r10, R10+8
- movq_cfi r11, R11+8
+ movq %rdi, RDI+8(%rsp)
+ movq %rsi, RSI+8(%rsp)
+ movq %rdx, RDX+8(%rsp)
+ movq %rcx, RCX+8(%rsp)
+ movq %rax, RAX+8(%rsp)
+ movq %r8, R8+8(%rsp)
+ movq %r9, R9+8(%rsp)
+ movq %r10, R10+8(%rsp)
+ movq %r11, R11+8(%rsp)
movq_cfi rbx, RBX+8
- movq_cfi rbp, RBP+8
- movq_cfi r12, R12+8
- movq_cfi r13, R13+8
- movq_cfi r14, R14+8
- movq_cfi r15, R15+8
+ movq %rbp, RBP+8(%rsp)
+ movq %r12, R12+8(%rsp)
+ movq %r13, R13+8(%rsp)
+ movq %r14, R14+8(%rsp)
+ movq %r15, R15+8(%rsp)
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1427,8 +1418,9 @@ error_sti:
* compat mode. Check for these here too.
*/
error_kernelspace:
+ CFI_REL_OFFSET rcx, RCX+8
incl %ebx
- leaq irq_return_iret(%rip),%rcx
+ leaq native_irq_return_iret(%rip),%rcx
cmpq %rcx,RIP+8(%rsp)
je error_swapgs
movl %ecx,%eax /* zero extend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 6afbb16e9b79..94d857fb1033 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -175,7 +175,7 @@ void init_espfix_ap(void)
if (!pud_present(pud)) {
pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
- paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PUD_CLONES; n++)
set_pud(&pud_p[n], pud);
}
@@ -185,7 +185,7 @@ void init_espfix_ap(void)
if (!pmd_present(pmd)) {
pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
- paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
+ paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PMD_CLONES; n++)
set_pmd(&pmd_p[n], pmd);
}
@@ -193,7 +193,6 @@ void init_espfix_ap(void)
pte_p = pte_offset_kernel(&pmd, addr);
stack_page = (void *)__get_free_page(GFP_KERNEL);
pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
- paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cbc4a91b131e..3386dc9aa333 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -703,6 +703,9 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
unsigned long return_hooker = (unsigned long)
&return_to_handler;
+ if (unlikely(ftrace_graph_is_dead()))
+ return;
+
if (unlikely(atomic_read(&current->tracing_graph_pause)))
return;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5dd80814419..a9a4229f6161 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -375,7 +375,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
/*
* These bits must be zero.
*/
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+ memset(xsave_hdr->reserved, 0, 48);
return ret;
}
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
new file mode 100644
index 000000000000..9642b9b33655
--- /dev/null
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -0,0 +1,553 @@
+/*
+ * Kexec bzImage loader
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec-bzImage64: " fmt
+
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/verify_pefile.h>
+#include <keys/system_keyring.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+#include <asm/crash.h>
+#include <asm/efi.h>
+
+#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+
+/*
+ * Defines the lowest physical address for various segments. Not sure where
+ * exactly these limits came from. The current bzimage64 loader in kexec-tools
+ * uses these, so I am retaining them. They can be changed over time as we
+ * gain more insight.
+ */
+#define MIN_PURGATORY_ADDR 0x3000
+#define MIN_BOOTPARAM_ADDR 0x3000
+#define MIN_KERNEL_LOAD_ADDR 0x100000
+#define MIN_INITRD_LOAD_ADDR 0x1000000
+
+/*
+ * This is a placeholder for all boot loader specific data structures that
+ * get allocated in one call but freed much later, during cleanup time.
+ * Right now there is only one field, but it can grow as needed.
+ */
+struct bzimage64_data {
+ /*
+ * Temporary buffer holding the boot params. It should be
+ * freed once the bootparam segment has been loaded.
+ */
+ void *bootparams_buf;
+};
+
+static int setup_initrd(struct boot_params *params,
+ unsigned long initrd_load_addr, unsigned long initrd_len)
+{
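+ /*
+ * The legacy ramdisk_image/ramdisk_size header fields are 32 bits
+ * wide; the ext_* fields (added, I believe, with boot protocol 2.12)
+ * carry the high 32 bits so a full 64-bit address/size can be passed.
+ */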
+ params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL;
+ params->hdr.ramdisk_size = initrd_len & 0xffffffffUL;
+
+ params->ext_ramdisk_image = initrd_load_addr >> 32;
+ params->ext_ramdisk_size = initrd_len >> 32;
+
+ return 0;
+}
+
+static int setup_cmdline(struct kimage *image, struct boot_params *params,
+ unsigned long bootparams_load_addr,
+ unsigned long cmdline_offset, char *cmdline,
+ unsigned long cmdline_len)
+{
+ char *cmdline_ptr = ((char *)params) + cmdline_offset;
+ unsigned long cmdline_ptr_phys, len;
+ uint32_t cmdline_low_32, cmdline_ext_32;
+
+ memcpy(cmdline_ptr, cmdline, cmdline_len);
+ if (image->type == KEXEC_TYPE_CRASH) {
+ len = sprintf(cmdline_ptr + cmdline_len - 1,
+ " elfcorehdr=0x%lx", image->arch.elf_load_addr);
+ cmdline_len += len;
+ }
+ cmdline_ptr[cmdline_len - 1] = '\0';
+
+ pr_debug("Final command line is: %s\n", cmdline_ptr);
+ cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
+ cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
+ cmdline_ext_32 = cmdline_ptr_phys >> 32;
+
+ params->hdr.cmd_line_ptr = cmdline_low_32;
+ if (cmdline_ext_32)
+ params->ext_cmd_line_ptr = cmdline_ext_32;
+
+ return 0;
+}
+
+static int setup_e820_entries(struct boot_params *params)
+{
+ unsigned int nr_e820_entries;
+
+ nr_e820_entries = e820_saved.nr_map;
+
+ /* TODO: Pass more than E820MAX entries via bootparams setup data */
+ if (nr_e820_entries > E820MAX)
+ nr_e820_entries = E820MAX;
+
+ params->e820_entries = nr_e820_entries;
+ memcpy(&params->e820_map, &e820_saved.map,
+ nr_e820_entries * sizeof(struct e820entry));
+
+ return 0;
+}
+
+#ifdef CONFIG_EFI
+static int setup_efi_info_memmap(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset,
+ unsigned int efi_map_sz)
+{
+ void *efi_map = (void *)params + efi_map_offset;
+ unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_map_sz)
+ return 0;
+
+ efi_runtime_map_copy(efi_map, efi_map_sz);
+
+ ei->efi_memmap = efi_map_phys_addr & 0xffffffff;
+ ei->efi_memmap_hi = efi_map_phys_addr >> 32;
+ ei->efi_memmap_size = efi_map_sz;
+
+ return 0;
+}
+
+static int
+prepare_add_efi_setup_data(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned long setup_data_phys;
+ struct setup_data *sd = (void *)params + efi_setup_data_offset;
+ struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data);
+
+ esd->fw_vendor = efi.fw_vendor;
+ esd->runtime = efi.runtime;
+ esd->tables = efi.config_table;
+ esd->smbios = efi.smbios;
+
+ sd->type = SETUP_EFI;
+ sd->len = sizeof(struct efi_setup_data);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + efi_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+
+ return 0;
+}
+
+static int
+setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ struct efi_info *current_ei = &boot_params.efi_info;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!current_ei->efi_memmap_size)
+ return 0;
+
+ /*
+ * If the 1:1 mapping is not enabled, the second kernel cannot set up
+ * EFI or use EFI runtime services. User space will have to pass
+ * acpi_rsdp=<addr> on the kernel command line to make the second
+ * kernel boot without EFI.
+ */
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
+ ei->efi_loader_signature = current_ei->efi_loader_signature;
+ ei->efi_systab = current_ei->efi_systab;
+ ei->efi_systab_hi = current_ei->efi_systab_hi;
+
+ ei->efi_memdesc_version = current_ei->efi_memdesc_version;
+ ei->efi_memdesc_size = efi_get_runtime_map_desc_size();
+
+ setup_efi_info_memmap(params, params_load_addr, efi_map_offset,
+ efi_map_sz);
+ prepare_add_efi_setup_data(params, params_load_addr,
+ efi_setup_data_offset);
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
+static int
+setup_boot_parameters(struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned int nr_e820_entries;
+ unsigned long long mem_k, start, end;
+ int i, ret = 0;
+
+ /* Get subarch from existing bootparams */
+ params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
+
+ /* Copying screen_info will do? */
+ memcpy(&params->screen_info, &boot_params.screen_info,
+ sizeof(struct screen_info));
+
+ /* Fill in memsize later */
+ params->screen_info.ext_mem_k = 0;
+ params->alt_mem_k = 0;
+
+ /* Default APM info */
+ memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
+
+ /* Default drive info */
+ memset(&params->hd0_info, 0, sizeof(params->hd0_info));
+ memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+
+ /* Default sysdesc table */
+ params->sys_desc_table.length = 0;
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_setup_memmap_entries(image, params);
+ if (ret)
+ return ret;
+ } else
+ setup_e820_entries(params);
+
+ nr_e820_entries = params->e820_entries;
+
+ for (i = 0; i < nr_e820_entries; i++) {
+ if (params->e820_map[i].type != E820_RAM)
+ continue;
+ start = params->e820_map[i].addr;
+ end = params->e820_map[i].addr + params->e820_map[i].size - 1;
+
+ if ((start <= 0x100000) && end > 0x100000) {
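+ /*
+ * The legacy fields report memory above 1 MiB in KiB; ext_mem_k
+ * is only 16 bits wide, hence the 0xfc00 cap below.
+ */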
+ mem_k = (end >> 10) - (0x100000 >> 10);
+ params->screen_info.ext_mem_k = mem_k;
+ params->alt_mem_k = mem_k;
+ if (mem_k > 0xfc00)
+ params->screen_info.ext_mem_k = 0xfc00; /* 64M*/
+ if (mem_k > 0xffffffff)
+ params->alt_mem_k = 0xffffffff;
+ }
+ }
+
+#ifdef CONFIG_EFI
+ /* Setup EFI state */
+ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+#endif
+
+ /* Setup EDD info */
+ memcpy(params->eddbuf, boot_params.eddbuf,
+ EDDMAXNR * sizeof(struct edd_info));
+ params->eddbuf_entries = boot_params.eddbuf_entries;
+
+ memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer,
+ EDD_MBR_SIG_MAX * sizeof(unsigned int));
+
+ return ret;
+}
+
+int bzImage64_probe(const char *buf, unsigned long len)
+{
+ int ret = -ENOEXEC;
+ struct setup_header *header;
+
+ /* kernel should be at least two sectors long */
+ if (len < 2 * 512) {
+ pr_err("File is too short to be a bzImage\n");
+ return ret;
+ }
+
+ header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr));
+ if (memcmp((char *)&header->header, "HdrS", 4) != 0) {
+ pr_err("Not a bzImage\n");
+ return ret;
+ }
+
+ if (header->boot_flag != 0xAA55) {
+ pr_err("No x86 boot sector present\n");
+ return ret;
+ }
+
+ if (header->version < 0x020C) {
+ pr_err("Must be at least protocol version 2.12\n");
+ return ret;
+ }
+
+ if (!(header->loadflags & LOADED_HIGH)) {
+ pr_err("zImage not a bzImage\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_KERNEL_64)) {
+ pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) {
+ pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n");
+ return ret;
+ }
+
+ /*
+ * Can't handle 32-bit EFI, as it does not allow loading the kernel
+ * above 4G. This should be handled by the 32-bit bzImage loader.
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) {
+ pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n");
+ return ret;
+ }
+
+ /* I've got a bzImage */
+ pr_debug("It's a relocatable bzImage64\n");
+ ret = 0;
+
+ return ret;
+}
+
+void *bzImage64_load(struct kimage *image, char *kernel,
+ unsigned long kernel_len, char *initrd,
+ unsigned long initrd_len, char *cmdline,
+ unsigned long cmdline_len)
+{
+
+ struct setup_header *header;
+ int setup_sects, kern16_size, ret = 0;
+ unsigned long setup_header_size, params_cmdline_sz, params_misc_sz;
+ struct boot_params *params;
+ unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
+ unsigned long purgatory_load_addr;
+ unsigned long kernel_bufsz, kernel_memsz, kernel_align;
+ char *kernel_buf;
+ struct bzimage64_data *ldata;
+ struct kexec_entry64_regs regs64;
+ void *stack;
+ unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
+ unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+
+ header = (struct setup_header *)(kernel + setup_hdr_offset);
+ setup_sects = header->setup_sects;
+ if (setup_sects == 0)
+ setup_sects = 4;
+
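+ /*
+ * The real-mode part of the image is the boot sector plus
+ * setup_sects sectors of 512 bytes each; the protected-mode
+ * kernel follows immediately after it.
+ */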
+ kern16_size = (setup_sects + 1) * 512;
+ if (kernel_len < kern16_size) {
+ pr_err("bzImage truncated\n");
+ return ERR_PTR(-ENOEXEC);
+ }
+
+ if (cmdline_len > header->cmdline_size) {
+ pr_err("Kernel command line too long\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /*
+ * In case of a crash dump, we will append elfcorehdr=<addr> to the
+ * command line. Make sure it does not overflow.
+ */
+ if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
+ pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* Allocate and load backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_load_segments(image);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ /*
+ * Load purgatory. For the 64-bit entry point, the purgatory code
+ * can be anywhere.
+ */
+ ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1,
+ &purgatory_load_addr);
+ if (ret) {
+ pr_err("Loading purgatory failed\n");
+ return ERR_PTR(ret);
+ }
+
+ pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr);
+
+
+ /*
+ * Load boot params, the command line, and space for EFI data.
+ *
+ * Allocate memory for these data structures together so that they
+ * all go in a single area/segment and we don't have to create a
+ * separate segment for each. Keeps things a little bit simpler.
+ */
+ efi_map_sz = efi_get_runtime_map_size();
+ efi_map_sz = ALIGN(efi_map_sz, 16);
+ params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
+ MAX_ELFCOREHDR_STR_LEN;
+ params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
+ params_misc_sz = params_cmdline_sz + efi_map_sz +
+ sizeof(struct setup_data) +
+ sizeof(struct efi_setup_data);
+
+ params = kzalloc(params_misc_sz, GFP_KERNEL);
+ if (!params)
+ return ERR_PTR(-ENOMEM);
+ efi_map_offset = params_cmdline_sz;
+ efi_setup_data_offset = efi_map_offset + efi_map_sz;
+
+ /* Copy setup header onto bootparams. Documentation/x86/boot.txt */
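+ /*
+ * Per the boot protocol, the byte at offset 0x0201 (the operand of
+ * the short jump at 0x0200) encodes where the setup header ends, so
+ * the header spans setup_hdr_offset..(0x0202 + kernel[0x0201]).
+ */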
+ setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset;
+
+ /* Is there a limit on setup header size? */
+ memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
+
+ ret = kexec_add_buffer(image, (char *)params, params_misc_sz,
+ params_misc_sz, 16, MIN_BOOTPARAM_ADDR,
+ ULONG_MAX, 1, &bootparam_load_addr);
+ if (ret)
+ goto out_free_params;
+ pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ bootparam_load_addr, params_misc_sz, params_misc_sz);
+
+ /* Load kernel */
+ kernel_buf = kernel + kern16_size;
+ kernel_bufsz = kernel_len - kern16_size;
+ kernel_memsz = PAGE_ALIGN(header->init_size);
+ kernel_align = header->kernel_alignment;
+
+ ret = kexec_add_buffer(image, kernel_buf,
+ kernel_bufsz, kernel_memsz, kernel_align,
+ MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1,
+ &kernel_load_addr);
+ if (ret)
+ goto out_free_params;
+
+ pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ kernel_load_addr, kernel_memsz, kernel_memsz);
+
+ /* Load initrd high */
+ if (initrd) {
+ ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
+ PAGE_SIZE, MIN_INITRD_LOAD_ADDR,
+ ULONG_MAX, 1, &initrd_load_addr);
+ if (ret)
+ goto out_free_params;
+
+ pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+ initrd_load_addr, initrd_len, initrd_len);
+
+ setup_initrd(params, initrd_load_addr, initrd_len);
+ }
+
+ setup_cmdline(image, params, bootparam_load_addr,
+ sizeof(struct boot_params), cmdline, cmdline_len);
+
+ /* bootloader info. Do we need a separate ID for kexec kernel loader? */
+ params->hdr.type_of_loader = 0x0D << 4;
+ params->hdr.loadflags = 0;
+
+ /* Setup purgatory regs for entry */
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 1);
+ if (ret)
+ goto out_free_params;
+
+ regs64.rbx = 0; /* Bootstrap Processor */
+ regs64.rsi = bootparam_load_addr;
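+ /*
+ * The 64-bit entry point (startup_64) is at offset 0x200 of the
+ * protected-mode kernel image, per the x86 boot protocol.
+ */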
+ regs64.rip = kernel_load_addr + 0x200;
+ stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+ if (IS_ERR(stack)) {
+ pr_err("Could not find address of symbol stack_end\n");
+ ret = -EINVAL;
+ goto out_free_params;
+ }
+
+ regs64.rsp = (unsigned long)stack;
+ ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+ sizeof(regs64), 0);
+ if (ret)
+ goto out_free_params;
+
+ ret = setup_boot_parameters(image, params, bootparam_load_addr,
+ efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+ if (ret)
+ goto out_free_params;
+
+ /* Allocate loader specific data */
+ ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
+ if (!ldata) {
+ ret = -ENOMEM;
+ goto out_free_params;
+ }
+
+ /*
+ * Store the pointer to params so that it can be freed after the
+ * params segment has been loaded and its contents have been copied
+ * somewhere else.
+ */
+ ldata->bootparams_buf = params;
+ return ldata;
+
+out_free_params:
+ kfree(params);
+ return ERR_PTR(ret);
+}
+
+/* This cleanup function is called after various segments have been loaded */
+int bzImage64_cleanup(void *loader_data)
+{
+ struct bzimage64_data *ldata = loader_data;
+
+ if (!ldata)
+ return 0;
+
+ kfree(ldata->bootparams_buf);
+ ldata->bootparams_buf = NULL;
+
+ return 0;
+}
+
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
+{
+ bool trusted;
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ system_trusted_keyring, &trusted);
+ if (ret < 0)
+ return ret;
+ if (!trusted)
+ return -EKEYREJECTED;
+ return 0;
+}
+#endif
+
+struct kexec_file_ops kexec_bzImage64_ops = {
+ .probe = bzImage64_probe,
+ .load = bzImage64_load,
+ .cleanup = bzImage64_cleanup,
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+ .verify_sig = bzImage64_verify_sig,
+#endif
+};
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 7596df664901..67e6d19ef1be 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -574,6 +574,9 @@ int kprobe_int3_handler(struct pt_regs *regs)
struct kprobe *p;
struct kprobe_ctlblk *kcb;
+ if (user_mode_vm(regs))
+ return 0;
+
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
/*
* We don't want to be preempted for the entire
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 679cef0791cd..8b04018e5d1f 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -6,6 +6,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
@@ -21,6 +23,11 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/debugreg.h>
+#include <asm/kexec-bzimage64.h>
+
+static struct kexec_file_ops *kexec_file_loaders[] = {
+ &kexec_bzImage64_ops,
+};
static void free_transition_pgtable(struct kimage *image)
{
@@ -171,6 +178,38 @@ static void load_segments(void)
);
}
+/* Update purgatory as needed after various image segments have been prepared */
+static int arch_update_purgatory(struct kimage *image)
+{
+ int ret = 0;
+
+ if (!image->file_mode)
+ return 0;
+
+ /* Setup copying of backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = kexec_purgatory_get_set_symbol(image, "backup_dest",
+ &image->arch.backup_load_addr,
+ sizeof(image->arch.backup_load_addr), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image, "backup_src",
+ &image->arch.backup_src_start,
+ sizeof(image->arch.backup_src_start), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image, "backup_sz",
+ &image->arch.backup_src_sz,
+ sizeof(image->arch.backup_src_sz), 0);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
int machine_kexec_prepare(struct kimage *image)
{
unsigned long start_pgtable;
@@ -184,6 +223,11 @@ int machine_kexec_prepare(struct kimage *image)
if (result)
return result;
+ /* update purgatory as needed */
+ result = arch_update_purgatory(image);
+ if (result)
+ return result;
+
return 0;
}
@@ -283,3 +327,198 @@ void arch_crash_save_vmcoreinfo(void)
(unsigned long)&_text - __START_KERNEL);
}
+/* arch-dependent functionality related to kexec file-based syscall */
+
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ int i, ret = -ENOEXEC;
+ struct kexec_file_ops *fops;
+
+ for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) {
+ fops = kexec_file_loaders[i];
+ if (!fops || !fops->probe)
+ continue;
+
+ ret = fops->probe(buf, buf_len);
+ if (!ret) {
+ image->fops = fops;
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+void *arch_kexec_kernel_image_load(struct kimage *image)
+{
+ vfree(image->arch.elf_headers);
+ image->arch.elf_headers = NULL;
+
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+
+ return image->fops->load(image, image->kernel_buf,
+ image->kernel_buf_len, image->initrd_buf,
+ image->initrd_buf_len, image->cmdline_buf,
+ image->cmdline_buf_len);
+}
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+ if (!image->fops || !image->fops->cleanup)
+ return 0;
+
+ return image->fops->cleanup(image->image_loader_data);
+}
+
+int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
+ unsigned long kernel_len)
+{
+ if (!image->fops || !image->fops->verify_sig) {
+ pr_debug("kernel loader does not support signature verification.");
+ return -EKEYREJECTED;
+ }
+
+ return image->fops->verify_sig(kernel, kernel_len);
+}
+
+/*
+ * Apply purgatory relocations.
+ *
+ * ehdr: Pointer to elf headers
+ * sechdrs: Pointer to section headers.
+ * relsec: section index of SHT_RELA section.
+ *
+ * TODO: Some of this code belongs in generic code. Move that to kexec.c.
+ */
+int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
+ Elf64_Shdr *sechdrs, unsigned int relsec)
+{
+ unsigned int i;
+ Elf64_Rela *rel;
+ Elf64_Sym *sym;
+ void *location;
+ Elf64_Shdr *section, *symtabsec;
+ unsigned long address, sec_base, value;
+ const char *strtab, *name, *shstrtab;
+
+ /*
+ * ->sh_offset has been modified to keep the pointer to section
+ * contents in memory
+ */
+ rel = (void *)sechdrs[relsec].sh_offset;
+
+ /* Section to which relocations apply */
+ section = &sechdrs[sechdrs[relsec].sh_info];
+
+ pr_debug("Applying relocate section %u to %u\n", relsec,
+ sechdrs[relsec].sh_info);
+
+ /* Associated symbol table */
+ symtabsec = &sechdrs[sechdrs[relsec].sh_link];
+
+ /* String table */
+ if (symtabsec->sh_link >= ehdr->e_shnum) {
+ /* Invalid strtab section number */
+ pr_err("Invalid string table section index %d\n",
+ symtabsec->sh_link);
+ return -ENOEXEC;
+ }
+
+ strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset;
+
+ /* section header string table */
+ shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset;
+
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+
+ /*
+ * rel[i].r_offset contains the byte offset from the beginning
+ * of the section to the storage unit affected.
+ *
+ * This is the location to update, inside the temporary buffer
+ * where the section is currently loaded (->sh_offset). The
+ * section will finally be loaded to a different address later,
+ * pointed to by ->sh_addr; kexec takes care of moving it
+ * (kexec_load_segment()).
+ */
+ location = (void *)(section->sh_offset + rel[i].r_offset);
+
+ /* Final address of the location */
+ address = section->sh_addr + rel[i].r_offset;
+
+ /*
+ * rel[i].r_info encodes the index of the symbol table entry
+ * against which the relocation must be made, and the type of
+ * relocation to apply. The ELF64_R_SYM() and ELF64_R_TYPE()
+ * macros extract these, respectively.
+ */
+ sym = (Elf64_Sym *)symtabsec->sh_offset +
+ ELF64_R_SYM(rel[i].r_info);
+
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
+ name, sym->st_info, sym->st_shndx, sym->st_value,
+ sym->st_size);
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_ABS)
+ sec_base = 0;
+ else if (sym->st_shndx >= ehdr->e_shnum) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
+ return -ENOEXEC;
+ } else
+ sec_base = sechdrs[sym->st_shndx].sh_addr;
+
+ value = sym->st_value;
+ value += sec_base;
+ value += rel[i].r_addend;
+
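+ /*
+ * value now holds S + A (symbol value plus addend). The
+ * PC-relative case below additionally subtracts P, the final
+ * address of the location being patched.
+ */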
+ switch (ELF64_R_TYPE(rel[i].r_info)) {
+ case R_X86_64_NONE:
+ break;
+ case R_X86_64_64:
+ *(u64 *)location = value;
+ break;
+ case R_X86_64_32:
+ *(u32 *)location = value;
+ if (value != *(u32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_32S:
+ *(s32 *)location = value;
+ if ((s64)value != *(s32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_PC32:
+ value -= (u64)address;
+ *(u32 *)location = value;
+ break;
+ default:
+ pr_err("Unknown rela relocation: %llu\n",
+ ELF64_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+
+overflow:
+ pr_err("Overflow in relocation type %d value 0x%lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), value);
+ return -ENOEXEC;
+}
diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S
index c050a0153168..c73aecf10d34 100644
--- a/arch/x86/kernel/mcount_64.S
+++ b/arch/x86/kernel/mcount_64.S
@@ -46,10 +46,6 @@ END(function_hook)
.endm
ENTRY(ftrace_caller)
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
ftrace_caller_setup
/* regs go into 4th parameter (but make it NULL) */
movq $0, %rcx
@@ -73,10 +69,6 @@ ENTRY(ftrace_regs_caller)
/* Save the current flags before compare (in SS location)*/
pushfq
- /* Check if tracing was disabled (quick check) */
- cmpl $0, function_trace_stop
- jne ftrace_restore_flags
-
/* skip=8 to skip flags saved in SS */
ftrace_caller_setup 8
@@ -131,7 +123,7 @@ GLOBAL(ftrace_regs_call)
popfq
jmp ftrace_return
-ftrace_restore_flags:
+
popfq
jmp ftrace_stub
@@ -141,9 +133,6 @@ END(ftrace_regs_caller)
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(function_hook)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
cmpq $ftrace_stub, ftrace_trace_function
jnz trace
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34f93eb..a1da6737ba5b 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -6,7 +6,6 @@ DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
-DEF_NATIVE(pv_cpu_ops, iret, "iretq");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -50,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_irq_ops, save_fl);
PATCH_SITE(pv_irq_ops, irq_enable);
PATCH_SITE(pv_irq_ops, irq_disable);
- PATCH_SITE(pv_cpu_ops, iret);
PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
PATCH_SITE(pv_cpu_ops, usergs_sysret32);
PATCH_SITE(pv_cpu_ops, usergs_sysret64);
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
new file mode 100644
index 000000000000..0c424a67985d
--- /dev/null
+++ b/arch/x86/kernel/pmc_atom.c
@@ -0,0 +1,321 @@
+/*
+ * Intel Atom SOC Power Management Controller Driver
+ * Copyright (c) 2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/io.h>
+
+#include <asm/pmc_atom.h>
+
+#define DRIVER_NAME KBUILD_MODNAME
+
+struct pmc_dev {
+ u32 base_addr;
+ void __iomem *regmap;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *dbgfs_dir;
+#endif /* CONFIG_DEBUG_FS */
+};
+
+static struct pmc_dev pmc_device;
+static u32 acpi_base_addr;
+
+struct pmc_dev_map {
+ const char *name;
+ u32 bit_mask;
+};
+
+static const struct pmc_dev_map dev_map[] = {
+ {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA},
+ {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1},
+ {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2},
+ {"3 - LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1},
+ {"4 - LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2},
+ {"5 - LPSS1_F5_SPI", BIT_LPSS1_F5_SPI},
+ {"6 - LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX},
+ {"7 - LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX},
+ {"8 - SCC_EMMC", BIT_SCC_EMMC},
+ {"9 - SCC_SDIO", BIT_SCC_SDIO},
+ {"10 - SCC_SDCARD", BIT_SCC_SDCARD},
+ {"11 - SCC_MIPI", BIT_SCC_MIPI},
+ {"12 - HDA", BIT_HDA},
+ {"13 - LPE", BIT_LPE},
+ {"14 - OTG", BIT_OTG},
+ {"15 - USH", BIT_USH},
+ {"16 - GBE", BIT_GBE},
+ {"17 - SATA", BIT_SATA},
+ {"18 - USB_EHCI", BIT_USB_EHCI},
+ {"19 - SEC", BIT_SEC},
+ {"20 - PCIE_PORT0", BIT_PCIE_PORT0},
+ {"21 - PCIE_PORT1", BIT_PCIE_PORT1},
+ {"22 - PCIE_PORT2", BIT_PCIE_PORT2},
+ {"23 - PCIE_PORT3", BIT_PCIE_PORT3},
+ {"24 - LPSS2_F0_DMA", BIT_LPSS2_F0_DMA},
+ {"25 - LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1},
+ {"26 - LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2},
+ {"27 - LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3},
+ {"28 - LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4},
+ {"29 - LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5},
+ {"30 - LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6},
+ {"31 - LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7},
+ {"32 - SMB", BIT_SMB},
+ {"33 - OTG_SS_PHY", BIT_OTG_SS_PHY},
+ {"34 - USH_SS_PHY", BIT_USH_SS_PHY},
+ {"35 - DFX", BIT_DFX},
+};
+
+static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
+{
+ return readl(pmc->regmap + reg_offset);
+}
+
+static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
+{
+ writel(val, pmc->regmap + reg_offset);
+}
+
+static void pmc_power_off(void)
+{
+ u16 pm1_cnt_port;
+ u32 pm1_cnt_value;
+
+ pr_info("Preparing to enter system sleep state S5\n");
+
+ pm1_cnt_port = acpi_base_addr + PM1_CNT;
+
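+ /*
+ * Per ACPI, writing SLP_TYP = S5 together with SLP_EN to the
+ * PM1_CNT register requests entry into the S5 (soft-off) state.
+ */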
+ pm1_cnt_value = inl(pm1_cnt_port);
+ pm1_cnt_value &= SLEEP_TYPE_MASK;
+ pm1_cnt_value |= SLEEP_TYPE_S5;
+ pm1_cnt_value |= SLEEP_ENABLE;
+
+ outl(pm1_cnt_value, pm1_cnt_port);
+}
+
+static void pmc_hw_reg_setup(struct pmc_dev *pmc)
+{
+ /*
+ * Disable PMC S0IX_WAKE_EN events coming from:
+ * - LPC clock run
+ * - GPIO_SUS ORed dedicated IRQs
+ * - GPIO_SCORE ORed dedicated IRQs
+ * - GPIO_SUS shared IRQ
+ * - GPIO_SCORE shared IRQ
+ */
+ pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING);
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int pmc_dev_state_show(struct seq_file *s, void *unused)
+{
+ struct pmc_dev *pmc = s->private;
+ u32 func_dis, func_dis_2, func_dis_index;
+ u32 d3_sts_0, d3_sts_1, d3_sts_index;
+ int dev_num, dev_index, reg_index;
+
+ func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
+ func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
+ d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
+ d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
+
+ dev_num = ARRAY_SIZE(dev_map);
+
+ for (dev_index = 0; dev_index < dev_num; dev_index++) {
+ reg_index = dev_index / PMC_REG_BIT_WIDTH;
+ if (reg_index) {
+ func_dis_index = func_dis_2;
+ d3_sts_index = d3_sts_1;
+ } else {
+ func_dis_index = func_dis;
+ d3_sts_index = d3_sts_0;
+ }
+
+ seq_printf(s, "Dev: %-32s\tState: %s [%s]\n",
+ dev_map[dev_index].name,
+ dev_map[dev_index].bit_mask & func_dis_index ?
+ "Disabled" : "Enabled ",
+ dev_map[dev_index].bit_mask & d3_sts_index ?
+ "D3" : "D0");
+ }
+ return 0;
+}
+
+static int pmc_dev_state_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmc_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_dev_state_ops = {
+ .open = pmc_dev_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
+{
+ struct pmc_dev *pmc = s->private;
+ u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr;
+
+ s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT;
+ s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT;
+ s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT;
+ s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT;
+ s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT;
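+
+ /*
+ * The residency counters apparently tick in units of
+ * 2^PMC_TMR_SHIFT microseconds, hence the shift above to convert
+ * the raw register values to usecs.
+ */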
+
+ seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr);
+ seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr);
+ seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr);
+ seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr);
+ seq_printf(s, "S0 Residency:\t%lldus\n", s0_tmr);
+ return 0;
+}
+
+static int pmc_sleep_tmr_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmc_sleep_tmr_show, inode->i_private);
+}
+
+static const struct file_operations pmc_sleep_tmr_ops = {
+ .open = pmc_sleep_tmr_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
+{
+ if (!pmc->dbgfs_dir)
+ return;
+
+ debugfs_remove_recursive(pmc->dbgfs_dir);
+ pmc->dbgfs_dir = NULL;
+}
+
+static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+{
+ struct dentry *dir, *f;
+
+ dir = debugfs_create_dir("pmc_atom", NULL);
+ if (!dir)
+ return -ENOMEM;
+
+ f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
+ dir, pmc, &pmc_dev_state_ops);
+ if (!f) {
+ dev_err(&pdev->dev, "dev_states register failed\n");
+ goto err;
+ }
+ f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
+ dir, pmc, &pmc_sleep_tmr_ops);
+ if (!f) {
+ dev_err(&pdev->dev, "sleep_state register failed\n");
+ goto err;
+ }
+ pmc->dbgfs_dir = dir;
+ return 0;
+err:
+ pmc_dbgfs_unregister(pmc);
+ return -ENODEV;
+}
+#endif /* CONFIG_DEBUG_FS */
+
+static int pmc_setup_dev(struct pci_dev *pdev)
+{
+ struct pmc_dev *pmc = &pmc_device;
+ int ret;
+
+ /* Obtain ACPI base address */
+ pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr);
+ acpi_base_addr &= ACPI_BASE_ADDR_MASK;
+
+ /* Install power off function */
+ if (acpi_base_addr != 0 && pm_power_off == NULL)
+ pm_power_off = pmc_power_off;
+
+ pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr);
+ pmc->base_addr &= PMC_BASE_ADDR_MASK;
+
+ pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN);
+ if (!pmc->regmap) {
+ dev_err(&pdev->dev, "error: ioremap failed\n");
+ return -ENOMEM;
+ }
+
+ /* PMC hardware registers setup */
+ pmc_hw_reg_setup(pmc);
+
+#ifdef CONFIG_DEBUG_FS
+ ret = pmc_dbgfs_register(pmc, pdev);
+ if (ret) {
+ iounmap(pmc->regmap);
+ return ret;
+ }
+#endif /* CONFIG_DEBUG_FS */
+ return 0;
+}
+
+/*
+ * Data for PCI driver interface
+ *
+ * This data only exists for exporting the supported
+ * PCI ids via MODULE_DEVICE_TABLE. We do not actually
+ * register a pci_driver, because lpc_ich will register
+ * a driver on the same PCI id.
+ */
+static const struct pci_device_id pmc_pci_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) },
+ { 0, },
+};
+
+MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
+
+static int __init pmc_atom_init(void)
+{
+ int err = -ENODEV;
+ struct pci_dev *pdev = NULL;
+ const struct pci_device_id *ent;
+
+ /* We look for our device - PCU PMC.
+ * We assume that there is at most one such device.
+ *
+ * We can't use the plain pci_driver mechanism, as the device is
+ * really a multi-function device; the main driver that binds to
+ * the pci_device is lpc_ich, so we have to find and bind to the
+ * device this way.
+ */
+ for_each_pci_dev(pdev) {
+ ent = pci_match_id(pmc_pci_ids, pdev);
+ if (ent) {
+ err = pmc_setup_dev(pdev);
+ goto out;
+ }
+ }
+ /* Device not found. */
+out:
+ return err;
+}
+
+module_init(pmc_atom_init);
+/* no module_exit, this driver shouldn't be unloaded */
+
+MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
+MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4505e2a950d8..f804dc935d2a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -93,6 +93,7 @@ void arch_task_cache_init(void)
kmem_cache_create("task_xstate", xstate_size,
__alignof__(union thread_xstate),
SLAB_PANIC | SLAB_NOTRACK, NULL);
+ setup_xstate_comp();
}
/*
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
index 2a26819bb6a8..80eab01c1a68 100644
--- a/arch/x86/kernel/resource.c
+++ b/arch/x86/kernel/resource.c
@@ -37,10 +37,12 @@ static void remove_e820_regions(struct resource *avail)
void arch_remove_reservations(struct resource *avail)
{
- /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
+ /*
+ * Trim out BIOS area (high 2MB) and E820 regions. We do not remove
+ * the low 1MB unconditionally, as this area is needed for some ISA
+ * cards requiring a memory range, e.g. the i82365 PCMCIA controller.
+ */
if (avail->flags & IORESOURCE_MEM) {
- if (avail->start < BIOS_END)
- avail->start = BIOS_END;
resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
remove_e820_regions(avail);
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index a0da58db43a8..2851d63c1202 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -363,7 +363,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
/* Set up to return from userspace. */
restorer = current->mm->context.vdso +
- selected_vdso32->sym___kernel_sigreturn;
+ selected_vdso32->sym___kernel_rt_sigreturn;
if (ksig->ka.sa.sa_flags & SA_RESTORER)
restorer = ksig->ka.sa.sa_restorer;
put_user_ex(restorer, &frame->pretcode);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c6eb418c5627..0d0e922fafc1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -343,6 +343,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
if (poke_int3_handler(regs))
return;
+ prev_state = exception_enter();
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
@@ -351,9 +352,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
#ifdef CONFIG_KPROBES
if (kprobe_int3_handler(regs))
- return;
+ goto exit;
#endif
- prev_state = exception_enter();
if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
@@ -433,6 +433,8 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
unsigned long dr6;
int si_code;
+ prev_state = exception_enter();
+
get_debugreg(dr6, 6);
/* Filter out all the reserved bits which are preset to 1 */
@@ -465,7 +467,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
if (kprobe_debug_handler(regs))
goto exit;
#endif
- prev_state = exception_enter();
if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
SIGTRAP) == NOTIFY_STOP)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 57e5ce126d5a..b6025f9e36c6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -234,9 +234,6 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc)
return ns;
}
-/* XXX surely we already have this someplace in the kernel?! */
-#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
-
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
unsigned long long tsc_now, ns_now;
@@ -259,7 +256,9 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
+ data->cyc2ns_mul =
+ DIV_ROUND_CLOSEST(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR,
+ cpu_khz);
data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
data->cyc2ns_offset = ns_now -
mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
@@ -920,9 +919,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- }
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu);
+ }
return 0;
}
@@ -951,7 +950,7 @@ core_initcall(cpufreq_tsc);
static struct clocksource clocksource_tsc;
/*
- * We compare the TSC to the cycle_last value in the clocksource
+ * We used to compare the TSC to the cycle_last value in the clocksource
* structure to avoid a nasty time-warp. This can be observed in a
* very small window right after one CPU updated cycle_last under
* xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
@@ -961,26 +960,23 @@ static struct clocksource clocksource_tsc;
* due to the unsigned delta calculation of the time keeping core
* code, which is necessary to support wrapping clocksources like pm
* timer.
+ *
+ * This sanity check is now done in the core timekeeping code,
+ * checking the result of read_tsc() - cycle_last for being negative.
+ * That works because CLOCKSOURCE_MASK(64) does not mask out any bit.
*/
static cycle_t read_tsc(struct clocksource *cs)
{
- cycle_t ret = (cycle_t)get_cycles();
-
- return ret >= clocksource_tsc.cycle_last ?
- ret : clocksource_tsc.cycle_last;
-}
-
-static void resume_tsc(struct clocksource *cs)
-{
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
- clocksource_tsc.cycle_last = 0;
+ return (cycle_t)get_cycles();
}
+/*
+ * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
+ */
static struct clocksource clocksource_tsc = {
.name = "tsc",
.rating = 300,
.read = read_tsc,
- .resume = resume_tsc,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index ea5b5709aa76..e1e1e80fc6a6 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -81,10 +81,10 @@ static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
if (!show_unhandled_signals)
return;
- pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
- level, current->comm, task_pid_nr(current),
- message, regs->ip, regs->cs,
- regs->sp, regs->ax, regs->si, regs->di);
+ printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+ level, current->comm, task_pid_nr(current),
+ message, regs->ip, regs->cs,
+ regs->sp, regs->ax, regs->si, regs->di);
}
static int addr_to_vsyscall_nr(unsigned long addr)
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
index 9531fbb123ba..c7d791f32b98 100644
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -31,29 +31,30 @@ void update_vsyscall(struct timekeeper *tk)
gtod_write_begin(vdata);
/* copy vsyscall data */
- vdata->vclock_mode = tk->clock->archdata.vclock_mode;
- vdata->cycle_last = tk->clock->cycle_last;
- vdata->mask = tk->clock->mask;
- vdata->mult = tk->mult;
- vdata->shift = tk->shift;
+ vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode;
+ vdata->cycle_last = tk->tkr.cycle_last;
+ vdata->mask = tk->tkr.mask;
+ vdata->mult = tk->tkr.mult;
+ vdata->shift = tk->tkr.shift;
vdata->wall_time_sec = tk->xtime_sec;
- vdata->wall_time_snsec = tk->xtime_nsec;
+ vdata->wall_time_snsec = tk->tkr.xtime_nsec;
vdata->monotonic_time_sec = tk->xtime_sec
+ tk->wall_to_monotonic.tv_sec;
- vdata->monotonic_time_snsec = tk->xtime_nsec
+ vdata->monotonic_time_snsec = tk->tkr.xtime_nsec
+ ((u64)tk->wall_to_monotonic.tv_nsec
- << tk->shift);
+ << tk->tkr.shift);
while (vdata->monotonic_time_snsec >=
- (((u64)NSEC_PER_SEC) << tk->shift)) {
+ (((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
vdata->monotonic_time_snsec -=
- ((u64)NSEC_PER_SEC) << tk->shift;
+ ((u64)NSEC_PER_SEC) << tk->tkr.shift;
vdata->monotonic_time_sec++;
}
vdata->wall_time_coarse_sec = tk->xtime_sec;
- vdata->wall_time_coarse_nsec = (long)(tk->xtime_nsec >> tk->shift);
+ vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
+ tk->tkr.shift);
vdata->monotonic_time_coarse_sec =
vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a4b451c6addf..940b142cc11f 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -8,6 +8,7 @@
#include <linux/bootmem.h>
#include <linux/compat.h>
+#include <linux/cpu.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
@@ -24,7 +25,9 @@ u64 pcntxt_mask;
struct xsave_struct *init_xstate_buf;
static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
-static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
+static unsigned int *xstate_offsets, *xstate_sizes;
+static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8];
+static unsigned int xstate_features;
/*
 * If a processor implementation discerns that a processor state component is
@@ -283,7 +286,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
if (use_xsave()) {
/* These bits must be zero. */
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+ memset(xsave_hdr->reserved, 0, 48);
/*
* Init the state that is not present in the memory
@@ -479,6 +482,52 @@ static void __init setup_xstate_features(void)
}
/*
+ * This function sets up the offsets and sizes of all extended states
+ * in the xsave area, for both the standard format and the compacted
+ * format of the xsave area.
+ *
+ * Input: void
+ * Output: void
+ */
+void setup_xstate_comp(void)
+{
+ unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8];
+ int i;
+
+ /*
+	 * The FP and SSE xstates are legacy states. They always sit at
+	 * fixed offsets in the xsave area, in either the compacted or
+	 * the standard form.
+ */
+ xstate_comp_offsets[0] = 0;
+ xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space);
+
+ if (!cpu_has_xsaves) {
+ for (i = 2; i < xstate_features; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+ xstate_comp_offsets[i] = xstate_offsets[i];
+ xstate_comp_sizes[i] = xstate_sizes[i];
+ }
+ }
+ return;
+ }
+
+ xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+ for (i = 2; i < xstate_features; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask))
+ xstate_comp_sizes[i] = xstate_sizes[i];
+ else
+ xstate_comp_sizes[i] = 0;
+
+ if (i > 2)
+ xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+ + xstate_comp_sizes[i-1];
+
+ }
+}
+
+/*
 * Set up the xstate image representing the init state
*/
static void __init setup_init_fpu_buf(void)
@@ -496,15 +545,21 @@ static void __init setup_init_fpu_buf(void)
setup_xstate_features();
+ if (cpu_has_xsaves) {
+ init_xstate_buf->xsave_hdr.xcomp_bv =
+ (u64)1 << 63 | pcntxt_mask;
+ init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask;
+ }
+
/*
 * Init all the feature states with header_bv being 0x0
*/
- xrstor_state(init_xstate_buf, -1);
+ xrstor_state_booting(init_xstate_buf, -1);
/*
* Dump the init state again. This is to identify the init state
 * of any feature which is not represented by all zeros.
*/
- xsave_state(init_xstate_buf, -1);
+ xsave_state_booting(init_xstate_buf, -1);
}
static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
@@ -520,6 +575,30 @@ static int __init eager_fpu_setup(char *s)
}
__setup("eagerfpu=", eager_fpu_setup);
+
+/*
+ * Calculate total size of enabled xstates in XCR0/pcntxt_mask.
+ */
+static void __init init_xstate_size(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ int i;
+
+ if (!cpu_has_xsaves) {
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+ xstate_size = ebx;
+ return;
+ }
+
+ xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+ for (i = 2; i < 64; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+ cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+ xstate_size += eax;
+ }
+ }
+}
+
/*
* Enable and initialize the xsave feature.
*/
@@ -551,8 +630,7 @@ static void __init xstate_enable_boot_cpu(void)
/*
* Recompute the context size for enabled features
*/
- cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xstate_size = ebx;
+ init_xstate_size();
update_regset_xstate_info(xstate_size, pcntxt_mask);
prepare_fx_sw_frame();
@@ -572,8 +650,9 @@ static void __init xstate_enable_boot_cpu(void)
}
}
- pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
- pcntxt_mask, xstate_size);
+ pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n",
+ pcntxt_mask, xstate_size,
+ cpu_has_xsaves ? "compacted form" : "standard form");
}
/*
@@ -635,3 +714,26 @@ void eager_fpu_init(void)
else
fxrstor_checking(&init_xstate_buf->i387);
}
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get the address of a given xstate
+ * in either the standard or the compacted format of the xsave area.
+ *
+ * Inputs:
+ * xsave: base address of the xsave area;
+ * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE,
+ * etc.)
+ * Output:
+ * address of the state in the xsave area.
+ */
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
+{
+ int feature = fls64(xstate) - 1;
+ if (!test_bit(feature, (unsigned long *)&pcntxt_mask))
+ return NULL;
+
+ return (void *)xsave + xstate_comp_offsets[feature];
+}
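
To make the compacted layout concrete: the legacy FXSAVE image (FXSAVE_SIZE, 512 bytes) and the xsave header (XSAVE_HDR_SIZE, 64 bytes) keep their fixed positions, so the first extended state always begins at byte 576, and the remaining enabled states pack back-to-back with no holes for disabled features — exactly what the xstate_comp_offsets[] loop above computes. A usage sketch of the new API, assuming the 3.x-era thread.fpu layout and the XSTATE_YMM bit from <asm/xsave.h>:

#include <linux/sched.h>
#include <asm/xsave.h>
#include <asm/fpu-internal.h>

/*
 * Look up a task's AVX (YMM) save area. This works for both formats
 * because get_xsave_addr() indexes xstate_comp_offsets[], which
 * setup_xstate_comp() filled either from CPUID leaves (standard
 * format) or by packing enabled states from offset 576 (compacted).
 */
static void *task_ymm_state(struct task_struct *tsk)
{
	struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;

	/* Returns NULL if XSTATE_YMM is not enabled in pcntxt_mask. */
	return get_xsave_addr(xsave, XSTATE_YMM);
}
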