summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorH. Peter Anvin <hpa@linux.intel.com>2012-10-19 07:54:24 -0700
committerH. Peter Anvin <hpa@linux.intel.com>2012-10-19 07:55:09 -0700
commit4533d86270d7986e00594495dde9a109d6be27ae (patch)
treec2473cac653f7b98e5bd5e6475e63734be4b7644 /arch/x86/kernel
parent21c5e50e15b1abd797e62f18fd7f90b9cc004cbd (diff)
parent5bc66170dc486556a1e36fd384463536573f4b82 (diff)
downloadblackbird-op-linux-4533d86270d7986e00594495dde9a109d6be27ae.tar.gz
blackbird-op-linux-4533d86270d7986e00594495dde9a109d6be27ae.zip
Merge commit '5bc66170dc486556a1e36fd384463536573f4b82' into x86/urgent
From Borislav Petkov <bp@amd64.org>: Below is a RAS fix which reverts the addition of a sysfs attribute which we agreed is not needed, post-factum. And this should go in now because that sysfs attribute is going to end up in 3.7 otherwise and thus exposed to userspace; removing it then would be a lot harder. This is done as a merge rather than a simple patch/cherry-pick since the baseline for this patch was not in the previous x86/urgent. Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile7
-rw-r--r--arch/x86/kernel/acpi/boot.c2
-rw-r--r--arch/x86/kernel/alternative.c111
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c4
-rw-r--r--arch/x86/kernel/asm-offsets.c3
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c67
-rw-r--r--arch/x86/kernel/cpu/common.c17
-rw-r--r--arch/x86/kernel/cpu/intel.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h12
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c88
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c168
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.pl5
-rw-r--r--arch/x86/kernel/cpu/perf_event.h4
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c73
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c27
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c14
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c3
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c36
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h6
-rw-r--r--arch/x86/kernel/cpu/perf_event_knc.c248
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/cpu/proc.c5
-rw-r--r--arch/x86/kernel/cpuid.c5
-rw-r--r--arch/x86/kernel/devicetree.c51
-rw-r--r--arch/x86/kernel/entry_32.S120
-rw-r--r--arch/x86/kernel/entry_64.S221
-rw-r--r--arch/x86/kernel/ftrace.c73
-rw-r--r--arch/x86/kernel/i8259.c2
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/kgdb.c2
-rw-r--r--arch/x86/kernel/kprobes.c67
-rw-r--r--arch/x86/kernel/kvm.c3
-rw-r--r--arch/x86/kernel/microcode_amd.c357
-rw-r--r--arch/x86/kernel/microcode_core.c70
-rw-r--r--arch/x86/kernel/microcode_intel.c3
-rw-r--r--arch/x86/kernel/msr.c5
-rw-r--r--arch/x86/kernel/perf_regs.c105
-rw-r--r--arch/x86/kernel/process.c65
-rw-r--r--arch/x86/kernel/process_32.c37
-rw-r--r--arch/x86/kernel/process_64.c35
-rw-r--r--arch/x86/kernel/ptrace.c5
-rw-r--r--arch/x86/kernel/rtc.c2
-rw-r--r--arch/x86/kernel/setup.c9
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--arch/x86/kernel/smpboot.c20
-rw-r--r--arch/x86/kernel/step.c53
-rw-r--r--arch/x86/kernel/sys_i386_32.c40
-rw-r--r--arch/x86/kernel/traps.c169
-rw-r--r--arch/x86/kernel/uprobes.c52
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/kernel/vsyscall_64.c49
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c6
-rw-r--r--arch/x86/kernel/x86_init.c4
56 files changed, 1827 insertions, 739 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8215e5652d97..91ce48f05f9f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,7 +23,7 @@ obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
-obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
+obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-y += syscall_$(BITS).o
obj-$(CONFIG_X86_64) += vsyscall_64.o
@@ -81,8 +81,7 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
-obj-$(CONFIG_KVM_GUEST) += kvm.o
-obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
+obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
@@ -100,6 +99,8 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_OF) += devicetree.o
obj-$(CONFIG_UPROBES) += uprobes.o
+obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b2297e58c6ed..e651f7a589ac 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -656,7 +656,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
acpi_register_lapic(physid, ACPI_MADT_ENABLED);
/*
- * If mp_register_lapic successfully generates a new logical cpu
+ * If acpi_register_lapic successfully generates a new logical cpu
* number, then the following will get us exactly what was mapped
*/
cpumask_andnot(new_map, cpu_present_mask, tmp_map);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ced4534baed5..ef5ccca79a6c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -23,19 +23,6 @@
#define MAX_PATCH_LEN (255-1)
-#ifdef CONFIG_HOTPLUG_CPU
-static int smp_alt_once;
-
-static int __init bootonly(char *str)
-{
- smp_alt_once = 1;
- return 1;
-}
-__setup("smp-alt-boot", bootonly);
-#else
-#define smp_alt_once 1
-#endif
-
static int __initdata_or_module debug_alternative;
static int __init debug_alt(char *str)
@@ -317,7 +304,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
/* turn DS segment override prefix into lock prefix */
if (*ptr == 0x3e)
text_poke(ptr, ((unsigned char []){0xf0}), 1);
- };
+ }
mutex_unlock(&text_mutex);
}
@@ -326,9 +313,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
{
const s32 *poff;
- if (noreplace_smp)
- return;
-
mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
@@ -338,7 +322,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
/* turn lock prefix into DS segment override prefix */
if (*ptr == 0xf0)
text_poke(ptr, ((unsigned char []){0x3E}), 1);
- };
+ }
mutex_unlock(&text_mutex);
}
@@ -359,7 +343,7 @@ struct smp_alt_module {
};
static LIST_HEAD(smp_alt_modules);
static DEFINE_MUTEX(smp_alt);
-static int smp_mode = 1; /* protected by smp_alt */
+static bool uniproc_patched = false; /* protected by smp_alt */
void __init_or_module alternatives_smp_module_add(struct module *mod,
char *name,
@@ -368,19 +352,18 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
{
struct smp_alt_module *smp;
- if (noreplace_smp)
- return;
+ mutex_lock(&smp_alt);
+ if (!uniproc_patched)
+ goto unlock;
- if (smp_alt_once) {
- if (boot_cpu_has(X86_FEATURE_UP))
- alternatives_smp_unlock(locks, locks_end,
- text, text_end);
- return;
- }
+ if (num_possible_cpus() == 1)
+ /* Don't bother remembering, we'll never have to undo it. */
+ goto smp_unlock;
smp = kzalloc(sizeof(*smp), GFP_KERNEL);
if (NULL == smp)
- return; /* we'll run the (safe but slow) SMP code then ... */
+ /* we'll run the (safe but slow) SMP code then ... */
+ goto unlock;
smp->mod = mod;
smp->name = name;
@@ -392,11 +375,10 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
__func__, smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
- mutex_lock(&smp_alt);
list_add_tail(&smp->next, &smp_alt_modules);
- if (boot_cpu_has(X86_FEATURE_UP))
- alternatives_smp_unlock(smp->locks, smp->locks_end,
- smp->text, smp->text_end);
+smp_unlock:
+ alternatives_smp_unlock(locks, locks_end, text, text_end);
+unlock:
mutex_unlock(&smp_alt);
}
@@ -404,24 +386,18 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
- if (smp_alt_once || noreplace_smp)
- return;
-
mutex_lock(&smp_alt);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
list_del(&item->next);
- mutex_unlock(&smp_alt);
- DPRINTK("%s: %s\n", __func__, item->name);
kfree(item);
- return;
+ break;
}
mutex_unlock(&smp_alt);
}
-bool skip_smp_alternatives;
-void alternatives_smp_switch(int smp)
+void alternatives_enable_smp(void)
{
struct smp_alt_module *mod;
@@ -436,34 +412,21 @@ void alternatives_smp_switch(int smp)
pr_info("lockdep: fixing up alternatives\n");
#endif
- if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
- return;
- BUG_ON(!smp && (num_online_cpus() > 1));
+ /* Why bother if there are no other CPUs? */
+ BUG_ON(num_possible_cpus() == 1);
mutex_lock(&smp_alt);
- /*
- * Avoid unnecessary switches because it forces JIT based VMs to
- * throw away all cached translations, which can be quite costly.
- */
- if (smp == smp_mode) {
- /* nothing */
- } else if (smp) {
+ if (uniproc_patched) {
pr_info("switching to SMP code\n");
+ BUG_ON(num_online_cpus() != 1);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
list_for_each_entry(mod, &smp_alt_modules, next)
alternatives_smp_lock(mod->locks, mod->locks_end,
mod->text, mod->text_end);
- } else {
- pr_info("switching to UP code\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
- set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
- list_for_each_entry(mod, &smp_alt_modules, next)
- alternatives_smp_unlock(mod->locks, mod->locks_end,
- mod->text, mod->text_end);
+ uniproc_patched = false;
}
- smp_mode = smp;
mutex_unlock(&smp_alt);
}
@@ -540,40 +503,22 @@ void __init alternative_instructions(void)
apply_alternatives(__alt_instructions, __alt_instructions_end);
- /* switch to patch-once-at-boottime-only mode and free the
- * tables in case we know the number of CPUs will never ever
- * change */
-#ifdef CONFIG_HOTPLUG_CPU
- if (num_possible_cpus() < 2)
- smp_alt_once = 1;
-#endif
-
#ifdef CONFIG_SMP
- if (smp_alt_once) {
- if (1 == num_possible_cpus()) {
- pr_info("switching to UP code\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
- set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
-
- alternatives_smp_unlock(__smp_locks, __smp_locks_end,
- _text, _etext);
- }
- } else {
+ /* Patch to UP if other cpus not imminent. */
+ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
+ uniproc_patched = true;
alternatives_smp_module_add(NULL, "core kernel",
__smp_locks, __smp_locks_end,
_text, _etext);
-
- /* Only switch to UP mode if we don't immediately boot others */
- if (num_present_cpus() == 1 || setup_max_cpus <= 1)
- alternatives_smp_switch(0);
}
-#endif
- apply_paravirt(__parainstructions, __parainstructions_end);
- if (smp_alt_once)
+ if (!uniproc_patched || num_possible_cpus() == 1)
free_init_pages("SMP alternatives",
(unsigned long)__smp_locks,
(unsigned long)__smp_locks_end);
+#endif
+
+ apply_paravirt(__parainstructions, __parainstructions_end);
restart_nmi();
}
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 24deb3082328..b17416e72fbd 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1934,7 +1934,7 @@ void smp_error_interrupt(struct pt_regs *regs)
apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
i++;
v1 >>= 1;
- };
+ }
apic_printk(APIC_DEBUG, KERN_CONT "\n");
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index bc552cff2578..a65829ac2b9a 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -30,7 +30,7 @@
static int numachip_system __read_mostly;
-static struct apic apic_numachip __read_mostly;
+static const struct apic apic_numachip __read_mostly;
static unsigned int get_apic_id(unsigned long x)
{
@@ -199,7 +199,7 @@ static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
return 0;
}
-static struct apic apic_numachip __refconst = {
+static const struct apic apic_numachip __refconst = {
.name = "NumaConnect system",
.probe = numachip_probe,
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 68de2dc962ec..28610822fb3c 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -69,4 +69,7 @@ void common(void) {
OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
OFFSET(BP_pref_address, boot_params, hdr.pref_address);
OFFSET(BP_code32_start, boot_params, hdr.code32_start);
+
+ BLANK();
+ DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d30a6a9a0121..a0e067d3d96c 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -32,7 +32,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
ifdef CONFIG_PERF_EVENTS
obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9d92e19039f0..f7e98a2c0d12 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -737,6 +737,72 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
}
#endif
+static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has_invlpg)
+ return;
+
+ tlb_flushall_shift = 5;
+
+ if (c->x86 <= 0x11)
+ tlb_flushall_shift = 4;
+}
+
+static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
+{
+ u32 ebx, eax, ecx, edx;
+ u16 mask = 0xfff;
+
+ if (c->x86 < 0xf)
+ return;
+
+ if (c->extended_cpuid_level < 0x80000006)
+ return;
+
+ cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
+
+ tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
+ tlb_lli_4k[ENTRIES] = ebx & mask;
+
+ /*
+ * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
+ * characteristics from the CPUID function 0x80000005 instead.
+ */
+ if (c->x86 == 0xf) {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ mask = 0xff;
+ }
+
+ /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!((eax >> 16) & mask)) {
+ u32 a, b, c, d;
+
+ cpuid(0x80000005, &a, &b, &c, &d);
+ tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
+ } else {
+ tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
+ }
+
+ /* a 4M entry uses two 2M entries */
+ tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
+
+ /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
+ if (!(eax & mask)) {
+ /* Erratum 658 */
+ if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
+ tlb_lli_2m[ENTRIES] = 1024;
+ } else {
+ cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
+ tlb_lli_2m[ENTRIES] = eax & 0xff;
+ }
+ } else
+ tlb_lli_2m[ENTRIES] = eax & mask;
+
+ tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
+
+ cpu_set_tlb_flushall_shift(c);
+}
+
static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
.c_vendor = "AMD",
.c_ident = { "AuthenticAMD" },
@@ -756,6 +822,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
.c_size_cache = amd_size_cache,
#endif
.c_early_init = early_init_amd,
+ .c_detect_tlb = cpu_detect_tlb_amd,
.c_bsp_init = bsp_init_amd,
.c_init = init_amd,
.c_x86_vendor = X86_VENDOR_AMD,
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index fefd9b7e93e1..7505f7b13e71 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -489,7 +489,7 @@ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
- "tlb_flushall_shift is 0x%x\n",
+ "tlb_flushall_shift: %d\n",
tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
@@ -955,8 +955,7 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
- if (boot_cpu_data.cpuid_level >= 2)
- cpu_detect_tlb(&boot_cpu_data);
+ cpu_detect_tlb(&boot_cpu_data);
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1036,14 +1035,16 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
printk(KERN_CONT "%s ", vendor);
if (c->x86_model_id[0])
- printk(KERN_CONT "%s", c->x86_model_id);
+ printk(KERN_CONT "%s", strim(c->x86_model_id));
else
printk(KERN_CONT "%d86", c->x86);
+ printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model);
+
if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT " stepping %02x\n", c->x86_mask);
+ printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);
else
- printk(KERN_CONT "\n");
+ printk(KERN_CONT ")\n");
print_cpu_msr(c);
}
@@ -1130,8 +1131,6 @@ void syscall_init(void)
X86_EFLAGS_IOPL|X86_EFLAGS_AC);
}
-unsigned long kernel_eflags;
-
/*
* Copies of the original ist values from the tss are only accessed during
* debugging, no special alignment required.
@@ -1312,8 +1311,6 @@ void __cpuinit cpu_init(void)
fpu_init();
- raw_local_save_flags(kernel_eflags);
-
if (is_uv_system())
uv_cpu_init();
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0a4ce2980a5a..198e019a531a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -648,6 +648,10 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)
int i, j, n;
unsigned int regs[4];
unsigned char *desc = (unsigned char *)regs;
+
+ if (c->cpuid_level < 2)
+ return;
+
/* Number of times to iterate */
n = cpuid_eax(2) & 0xFF;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index fc4beb393577..ddc72f839332 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
}
static cpumask_var_t mce_inject_cpumask;
+static DEFINE_MUTEX(mce_inject_mutex);
static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
{
@@ -194,7 +195,11 @@ static void raise_mce(struct mce *m)
put_online_cpus();
} else
#endif
+ {
+ preempt_disable();
raise_local();
+ preempt_enable();
+ }
}
/* Error injection interface */
@@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
* so do it a jiffie or two later everywhere.
*/
schedule_timeout(2);
+
+ mutex_lock(&mce_inject_mutex);
raise_mce(&m);
+ mutex_unlock(&mce_inject_mutex);
return usize;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index ed44c8a65858..6a05c1d327a9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,6 +28,18 @@ extern int mce_ser;
extern struct mce_bank *mce_banks;
+#ifdef CONFIG_X86_MCE_INTEL
+unsigned long mce_intel_adjust_timer(unsigned long interval);
+void mce_intel_cmci_poll(void);
+void mce_intel_hcpu_update(unsigned long cpu);
+#else
+# define mce_intel_adjust_timer mce_adjust_timer_default
+static inline void mce_intel_cmci_poll(void) { }
+static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+#endif
+
+void mce_timer_kick(unsigned long interval);
+
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
ssize_t apei_read_mce(struct mce *m, u64 *record_id);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 292d0258311c..46cbf8689692 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly;
int mce_cmci_disabled __read_mostly;
int mce_ignore_ce __read_mostly;
int mce_ser __read_mostly;
+int mce_bios_cmci_threshold __read_mostly;
struct mce_bank *mce_banks __read_mostly;
@@ -1266,6 +1267,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+ return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+ mce_adjust_timer_default;
+
static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
@@ -1276,6 +1285,7 @@ static void mce_timer_fn(unsigned long data)
if (mce_available(__this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
+ mce_intel_cmci_poll();
}
/*
@@ -1283,14 +1293,38 @@ static void mce_timer_fn(unsigned long data)
* polling interval, otherwise increase the polling interval.
*/
iv = __this_cpu_read(mce_next_interval);
- if (mce_notify_irq())
+ if (mce_notify_irq()) {
iv = max(iv / 2, (unsigned long) HZ/100);
- else
+ } else {
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+ iv = mce_adjust_timer(iv);
+ }
__this_cpu_write(mce_next_interval, iv);
+ /* Might have become 0 after CMCI storm subsided */
+ if (iv) {
+ t->expires = jiffies + iv;
+ add_timer_on(t, smp_processor_id());
+ }
+}
- t->expires = jiffies + iv;
- add_timer_on(t, smp_processor_id());
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long when = jiffies + interval;
+ unsigned long iv = __this_cpu_read(mce_next_interval);
+
+ if (timer_pending(t)) {
+ if (time_before(when, t->expires))
+ mod_timer_pinned(t, when);
+ } else {
+ t->expires = round_jiffies(when);
+ add_timer_on(t, smp_processor_id());
+ }
+ if (interval < iv)
+ __this_cpu_write(mce_next_interval, interval);
}
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1585,6 +1619,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
+ mce_adjust_timer = mce_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
@@ -1594,23 +1629,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
}
}
-static void __mcheck_cpu_init_timer(void)
+static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
- struct timer_list *t = &__get_cpu_var(mce_timer);
- unsigned long iv = check_interval * HZ;
+ unsigned long iv = mce_adjust_timer(check_interval * HZ);
- setup_timer(t, mce_timer_fn, smp_processor_id());
+ __this_cpu_write(mce_next_interval, iv);
- if (mce_ignore_ce)
+ if (mce_ignore_ce || !iv)
return;
- __this_cpu_write(mce_next_interval, iv);
- if (!iv)
- return;
t->expires = round_jiffies(jiffies + iv);
add_timer_on(t, smp_processor_id());
}
+static void __mcheck_cpu_init_timer(void)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned int cpu = smp_processor_id();
+
+ setup_timer(t, mce_timer_fn, cpu);
+ mce_start_timer(cpu, t);
+}
+
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
@@ -1907,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = {
* check, or 0 to not wait
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
+ * mce=bios_cmci_threshold Don't program the CMCI threshold
*/
static int __init mcheck_enable(char *str)
{
@@ -1926,6 +1967,8 @@ static int __init mcheck_enable(char *str)
mce_ignore_ce = 1;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
mce_bootlog = (str[0] == 'b');
+ else if (!strcmp(str, "bios_cmci_threshold"))
+ mce_bios_cmci_threshold = 1;
else if (isdigit(str[0])) {
get_option(&str, &tolerant);
if (*str == ',') {
@@ -2294,38 +2337,33 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
unsigned int cpu = (unsigned long)hcpu;
struct timer_list *t = &per_cpu(mce_timer, cpu);
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
mce_device_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
case CPU_DEAD:
- case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
mce_device_remove(cpu);
+ mce_intel_hcpu_update(cpu);
break;
case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ del_timer_sync(t);
break;
case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- if (!mce_ignore_ce && check_interval) {
- t->expires = round_jiffies(jiffies +
- per_cpu(mce_next_interval, cpu));
- add_timer_on(t, cpu);
- }
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+ mce_start_timer(cpu, t);
break;
- case CPU_POST_DEAD:
+ }
+
+ if (action == CPU_POST_DEAD) {
/* intentionally ignoring frozen here */
cmci_rediscover(cpu);
- break;
}
+
return NOTIFY_OK;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 38e49bc95ffc..5f88abf07e9c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -15,6 +15,8 @@
#include <asm/msr.h>
#include <asm/mce.h>
+#include "mce-internal.h"
+
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
@@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
*/
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
-#define CMCI_THRESHOLD 1
+#define CMCI_THRESHOLD 1
+#define CMCI_POLL_INTERVAL (30 * HZ)
+#define CMCI_STORM_INTERVAL (1 * HZ)
+#define CMCI_STORM_THRESHOLD 15
+
+static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+
+enum {
+ CMCI_STORM_NONE,
+ CMCI_STORM_ACTIVE,
+ CMCI_STORM_SUBSIDED,
+};
+
+static atomic_t cmci_storm_on_cpus;
static int cmci_supported(int *banks)
{
@@ -53,6 +70,93 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
+void mce_intel_cmci_poll(void)
+{
+ if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
+ return;
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+}
+
+void mce_intel_hcpu_update(unsigned long cpu)
+{
+ if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
+ atomic_dec(&cmci_storm_on_cpus);
+
+ per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
+}
+
+unsigned long mce_intel_adjust_timer(unsigned long interval)
+{
+ int r;
+
+ if (interval < CMCI_POLL_INTERVAL)
+ return interval;
+
+ switch (__this_cpu_read(cmci_storm_state)) {
+ case CMCI_STORM_ACTIVE:
+ /*
+ * We switch back to interrupt mode once the poll timer has
+ * silenced itself. That means no events recorded and the
+ * timer interval is back to our poll interval.
+ */
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
+ r = atomic_sub_return(1, &cmci_storm_on_cpus);
+ if (r == 0)
+ pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+ /* FALLTHROUGH */
+
+ case CMCI_STORM_SUBSIDED:
+ /*
+ * We wait for all cpus to go back to SUBSIDED
+ * state. When that happens we switch back to
+ * interrupt mode.
+ */
+ if (!atomic_read(&cmci_storm_on_cpus)) {
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
+ cmci_reenable();
+ cmci_recheck();
+ }
+ return CMCI_POLL_INTERVAL;
+ default:
+ /*
+ * We have shiny weather. Let the poll do whatever it
+ * thinks.
+ */
+ return interval;
+ }
+}
+
+static bool cmci_storm_detect(void)
+{
+ unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
+ unsigned long ts = __this_cpu_read(cmci_time_stamp);
+ unsigned long now = jiffies;
+ int r;
+
+ if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
+ return true;
+
+ if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
+ cnt++;
+ } else {
+ cnt = 1;
+ __this_cpu_write(cmci_time_stamp, now);
+ }
+ __this_cpu_write(cmci_storm_cnt, cnt);
+
+ if (cnt <= CMCI_STORM_THRESHOLD)
+ return false;
+
+ cmci_clear();
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
+ r = atomic_add_return(1, &cmci_storm_on_cpus);
+ mce_timer_kick(CMCI_POLL_INTERVAL);
+
+ if (r == 1)
+ pr_notice("CMCI storm detected: switching to poll mode\n");
+ return true;
+}
+
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
@@ -61,33 +165,28 @@ static int cmci_supported(int *banks)
*/
static void intel_threshold_interrupt(void)
{
+ if (cmci_storm_detect())
+ return;
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
mce_notify_irq();
}
-static void print_update(char *type, int *hdr, int num)
-{
- if (*hdr == 0)
- printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
- *hdr = 1;
- printk(KERN_CONT " %s:%d", type, num);
-}
-
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
* banks.
*/
-static void cmci_discover(int banks, int boot)
+static void cmci_discover(int banks)
{
unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
unsigned long flags;
- int hdr = 0;
int i;
+ int bios_wrong_thresh = 0;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
for (i = 0; i < banks; i++) {
u64 val;
+ int bios_zero_thresh = 0;
if (test_bit(i, owned))
continue;
@@ -96,29 +195,52 @@ static void cmci_discover(int banks, int boot)
/* Already owned by someone else? */
if (val & MCI_CTL2_CMCI_EN) {
- if (test_and_clear_bit(i, owned) && !boot)
- print_update("SHD", &hdr, i);
+ clear_bit(i, owned);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
}
- val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
- val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
+ if (!mce_bios_cmci_threshold) {
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= CMCI_THRESHOLD;
+ } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
+ /*
+ * If bios_cmci_threshold boot option was specified
+ * but the threshold is zero, we'll try to initialize
+ * it to 1.
+ */
+ bios_zero_thresh = 1;
+ val |= CMCI_THRESHOLD;
+ }
+
+ val |= MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & MCI_CTL2_CMCI_EN) {
- if (!test_and_set_bit(i, owned) && !boot)
- print_update("CMCI", &hdr, i);
+ set_bit(i, owned);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
+ /*
+ * We are able to set thresholds for some banks that
+ * had a threshold of 0. This means the BIOS has not
+ * set the thresholds properly or does not work with
+ * this boot option. Note down now and report later.
+ */
+ if (mce_bios_cmci_threshold && bios_zero_thresh &&
+ (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
+ bios_wrong_thresh = 1;
} else {
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
- if (hdr)
- printk(KERN_CONT "\n");
+ if (mce_bios_cmci_threshold && bios_wrong_thresh) {
+ pr_info_once(
+ "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
+ pr_info_once(
+ "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
+ }
}
/*
@@ -156,7 +278,7 @@ void cmci_clear(void)
continue;
/* Disable CMCI */
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
+ val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
__clear_bit(i, __get_cpu_var(mce_banks_owned));
}
@@ -186,7 +308,7 @@ void cmci_rediscover(int dying)
continue;
/* Recheck banks in case CPUs don't all have the same */
if (cmci_supported(&banks))
- cmci_discover(banks, 0);
+ cmci_discover(banks);
}
set_cpus_allowed_ptr(current, old);
@@ -200,7 +322,7 @@ void cmci_reenable(void)
{
int banks;
if (cmci_supported(&banks))
- cmci_discover(banks, 0);
+ cmci_discover(banks);
}
static void intel_init_cmci(void)
@@ -211,7 +333,7 @@ static void intel_init_cmci(void)
return;
mce_threshold_vector = intel_threshold_interrupt;
- cmci_discover(banks, 1);
+ cmci_discover(banks);
/*
* For CPU #0 this runs with still disabled APIC, but that's
* ok because only the vector is set up. We still do another
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
index c7b3fe2d72e0..091972ef49de 100644
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -8,7 +8,10 @@
open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
-print OUT "#include <asm/cpufeature.h>\n\n";
+print OUT "#ifndef _ASM_X86_CPUFEATURE_H\n";
+print OUT "#include <asm/cpufeature.h>\n";
+print OUT "#endif\n";
+print OUT "\n";
print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
%features = ();
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6605a81ba339..271d25700297 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -586,6 +586,8 @@ extern struct event_constraint intel_westmere_pebs_event_constraints[];
extern struct event_constraint intel_snb_pebs_event_constraints[];
+extern struct event_constraint intel_ivb_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_enable(struct perf_event *event);
@@ -624,6 +626,8 @@ int p4_pmu_init(void);
int p6_pmu_init(void);
+int knc_pmu_init(void);
+
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index 7bfb5bec8630..6336bcbd0618 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -41,17 +41,22 @@ struct cpu_perf_ibs {
};
struct perf_ibs {
- struct pmu pmu;
- unsigned int msr;
- u64 config_mask;
- u64 cnt_mask;
- u64 enable_mask;
- u64 valid_mask;
- u64 max_period;
- unsigned long offset_mask[1];
- int offset_max;
- struct cpu_perf_ibs __percpu *pcpu;
- u64 (*get_count)(u64 config);
+ struct pmu pmu;
+ unsigned int msr;
+ u64 config_mask;
+ u64 cnt_mask;
+ u64 enable_mask;
+ u64 valid_mask;
+ u64 max_period;
+ unsigned long offset_mask[1];
+ int offset_max;
+ struct cpu_perf_ibs __percpu *pcpu;
+
+ struct attribute **format_attrs;
+ struct attribute_group format_group;
+ const struct attribute_group *attr_groups[2];
+
+ u64 (*get_count)(u64 config);
};
struct perf_ibs_data {
@@ -209,6 +214,15 @@ static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
return -EOPNOTSUPP;
}
+static const struct perf_event_attr ibs_notsupp = {
+ .exclude_user = 1,
+ .exclude_kernel = 1,
+ .exclude_hv = 1,
+ .exclude_idle = 1,
+ .exclude_host = 1,
+ .exclude_guest = 1,
+};
+
static int perf_ibs_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
@@ -229,6 +243,9 @@ static int perf_ibs_init(struct perf_event *event)
if (event->pmu != &perf_ibs->pmu)
return -ENOENT;
+ if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
+ return -EINVAL;
+
if (config & ~perf_ibs->config_mask)
return -EINVAL;
@@ -434,6 +451,19 @@ static void perf_ibs_del(struct perf_event *event, int flags)
static void perf_ibs_read(struct perf_event *event) { }
+PMU_FORMAT_ATTR(rand_en, "config:57");
+PMU_FORMAT_ATTR(cnt_ctl, "config:19");
+
+static struct attribute *ibs_fetch_format_attrs[] = {
+ &format_attr_rand_en.attr,
+ NULL,
+};
+
+static struct attribute *ibs_op_format_attrs[] = {
+ NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
+ NULL,
+};
+
static struct perf_ibs perf_ibs_fetch = {
.pmu = {
.task_ctx_nr = perf_invalid_context,
@@ -453,6 +483,7 @@ static struct perf_ibs perf_ibs_fetch = {
.max_period = IBS_FETCH_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
.offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
+ .format_attrs = ibs_fetch_format_attrs,
.get_count = get_ibs_fetch_count,
};
@@ -476,6 +507,7 @@ static struct perf_ibs perf_ibs_op = {
.max_period = IBS_OP_MAX_CNT << 4,
.offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
.offset_max = MSR_AMD64_IBSOP_REG_COUNT,
+ .format_attrs = ibs_op_format_attrs,
.get_count = get_ibs_op_count,
};
@@ -585,6 +617,17 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
perf_ibs->pcpu = pcpu;
+ /* register attributes */
+ if (perf_ibs->format_attrs[0]) {
+ memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
+ perf_ibs->format_group.name = "format";
+ perf_ibs->format_group.attrs = perf_ibs->format_attrs;
+
+ memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
+ perf_ibs->attr_groups[0] = &perf_ibs->format_group;
+ perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
+ }
+
ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
if (ret) {
perf_ibs->pcpu = NULL;
@@ -596,13 +639,19 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
static __init int perf_event_ibs_init(void)
{
+ struct attribute **attr = ibs_op_format_attrs;
+
if (!ibs_caps)
return -ENODEV; /* ibs not supported by the cpu */
perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
- if (ibs_caps & IBS_CAPS_OPCNT)
+
+ if (ibs_caps & IBS_CAPS_OPCNT) {
perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
+ *attr++ = &format_attr_cnt_ctl.attr;
+ }
perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
+
register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7f2739e03e79..324bb523d9d9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1906,6 +1906,8 @@ __init int intel_pmu_init(void)
switch (boot_cpu_data.x86) {
case 0x6:
return p6_pmu_init();
+ case 0xb:
+ return knc_pmu_init();
case 0xf:
return p4_pmu_init();
}
@@ -2008,6 +2010,7 @@ __init int intel_pmu_init(void)
break;
case 28: /* Atom */
+ case 54: /* Cedariew */
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -2047,7 +2050,6 @@ __init int intel_pmu_init(void)
case 42: /* SandyBridge */
case 45: /* SandyBridge, "Romely-EP" */
x86_add_quirk(intel_sandybridge_quirk);
- case 58: /* IvyBridge */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -2072,6 +2074,29 @@ __init int intel_pmu_init(void)
pr_cont("SandyBridge events, ");
break;
+ case 58: /* IvyBridge */
+ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+
+ intel_pmu_lbr_init_snb();
+
+ x86_pmu.event_constraints = intel_snb_event_constraints;
+ x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ x86_pmu.extra_regs = intel_snb_extra_regs;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+ /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
+ X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
+
+ pr_cont("IvyBridge events, ");
+ break;
+
default:
switch (x86_pmu.version) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index e38d97bf4259..826054a4f2ee 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -407,6 +407,20 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_ivb_pebs_event_constraints[] = {
+ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
+ INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
+ INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *c;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 520b4265fcd2..da02e9cc3754 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -686,7 +686,8 @@ void intel_pmu_lbr_init_atom(void)
* to have an operational LBR which can freeze
* on PMU interrupt
*/
- if (boot_cpu_data.x86_mask < 10) {
+ if (boot_cpu_data.x86_model == 28
+ && boot_cpu_data.x86_mask < 10) {
pr_cont("LBR disabled due to erratum");
return;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 0a5571080e74..99d96a4978b5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -661,6 +661,11 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
}
}
+static struct uncore_event_desc snb_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ { /* end: all zeroes */ },
+};
+
static struct attribute *snb_uncore_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
@@ -704,6 +709,7 @@ static struct intel_uncore_type snb_uncore_cbox = {
.constraints = snb_uncore_cbox_constraints,
.ops = &snb_uncore_msr_ops,
.format_group = &snb_uncore_format_group,
+ .event_descs = snb_uncore_events,
};
static struct intel_uncore_type *snb_msr_uncores[] = {
@@ -1944,7 +1950,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp
static struct intel_uncore_box *
uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
{
- static struct intel_uncore_box *box;
+ struct intel_uncore_box *box;
box = *per_cpu_ptr(pmu->box, cpu);
if (box)
@@ -2341,6 +2347,27 @@ int uncore_pmu_event_init(struct perf_event *event)
return ret;
}
+static ssize_t uncore_get_attr_cpumask(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask);
+
+ buf[n++] = '\n';
+ buf[n] = '\0';
+ return n;
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
+
+static struct attribute *uncore_pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group uncore_pmu_attr_group = {
+ .attrs = uncore_pmu_attrs,
+};
+
static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
{
int ret;
@@ -2378,8 +2405,8 @@ static void __init uncore_type_exit(struct intel_uncore_type *type)
free_percpu(type->pmus[i].box);
kfree(type->pmus);
type->pmus = NULL;
- kfree(type->attr_groups[1]);
- type->attr_groups[1] = NULL;
+ kfree(type->events_group);
+ type->events_group = NULL;
}
static void __init uncore_types_exit(struct intel_uncore_type **types)
@@ -2431,9 +2458,10 @@ static int __init uncore_type_init(struct intel_uncore_type *type)
for (j = 0; j < i; j++)
attrs[j] = &type->event_descs[j].attr.attr;
- type->attr_groups[1] = events_group;
+ type->events_group = events_group;
}
+ type->pmu_group = &uncore_pmu_attr_group;
type->pmus = pmus;
return 0;
fail:
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 5b81c1856aac..e68a4550e952 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -369,10 +369,12 @@ struct intel_uncore_type {
struct intel_uncore_pmu *pmus;
struct intel_uncore_ops *ops;
struct uncore_event_desc *event_descs;
- const struct attribute_group *attr_groups[3];
+ const struct attribute_group *attr_groups[4];
};
-#define format_group attr_groups[0]
+#define pmu_group attr_groups[0]
+#define format_group attr_groups[1]
+#define events_group attr_groups[2]
struct intel_uncore_ops {
void (*init_box)(struct intel_uncore_box *);
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c
new file mode 100644
index 000000000000..7c46bfdbc373
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_knc.c
@@ -0,0 +1,248 @@
+/* Driver for Intel Xeon Phi "Knights Corner" PMU */
+
+#include <linux/perf_event.h>
+#include <linux/types.h>
+
+#include "perf_event.h"
+
+static const u64 knc_perfmon_event_map[] =
+{
+ [PERF_COUNT_HW_CPU_CYCLES] = 0x002a,
+ [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016,
+ [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0029,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b,
+};
+
+static __initconst u64 knc_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+ [ C(OP_READ) ] = {
+ /* On Xeon Phi event "0" is a valid DATA_READ */
+ /* (L1 Data Cache Reads) Instruction. */
+ /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */
+ /* bit will always be set in x86_pmu_hw_config(). */
+ [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+ /* DATA_READ */
+ [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
+ [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */
+ [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
+ [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0,
+ [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */
+ [ C(RESULT_MISS) ] = 0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */
+ [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
+ /* DATA_READ */
+ /* see note on L1 OP_READ */
+ [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
+ [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
+ [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */
+ [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+
+static u64 knc_pmu_event_map(int hw_event)
+{
+ return knc_perfmon_event_map[hw_event];
+}
+
+static struct event_constraint knc_event_constraints[] =
+{
+ INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */
+ INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */
+ INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */
+ INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */
+ INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */
+ INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */
+ INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */
+ INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
+ INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
+ INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */
+ INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */
+ INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */
+ INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */
+ INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */
+ INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */
+ INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */
+ INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */
+ INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */
+ INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */
+ EVENT_CONSTRAINT_END
+};
+
+#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d
+#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e
+#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f
+
+#define KNC_ENABLE_COUNTER0 0x00000001
+#define KNC_ENABLE_COUNTER1 0x00000002
+
+static void knc_pmu_disable_all(void)
+{
+ u64 val;
+
+ rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+ val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+ wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static void knc_pmu_enable_all(int added)
+{
+ u64 val;
+
+ rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+ val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
+ wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
+}
+
+static inline void
+knc_pmu_disable_event(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+ if (cpuc->enabled)
+ val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+static void knc_pmu_enable_event(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ u64 val;
+
+ val = hwc->config;
+ if (cpuc->enabled)
+ val |= ARCH_PERFMON_EVENTSEL_ENABLE;
+
+ (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
+}
+
+PMU_FORMAT_ATTR(event, "config:0-7" );
+PMU_FORMAT_ATTR(umask, "config:8-15" );
+PMU_FORMAT_ATTR(edge, "config:18" );
+PMU_FORMAT_ATTR(inv, "config:23" );
+PMU_FORMAT_ATTR(cmask, "config:24-31" );
+
+static struct attribute *intel_knc_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask.attr,
+ NULL,
+};
+
+static __initconst struct x86_pmu knc_pmu = {
+ .name = "knc",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = knc_pmu_disable_all,
+ .enable_all = knc_pmu_enable_all,
+ .enable = knc_pmu_enable_event,
+ .disable = knc_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_KNC_EVNTSEL0,
+ .perfctr = MSR_KNC_PERFCTR0,
+ .event_map = knc_pmu_event_map,
+ .max_events = ARRAY_SIZE(knc_perfmon_event_map),
+ .apic = 1,
+ .max_period = (1ULL << 31) - 1,
+ .version = 0,
+ .num_counters = 2,
+ /* in theory 40 bits, early silicon is buggy though */
+ .cntval_bits = 32,
+ .cntval_mask = (1ULL << 32) - 1,
+ .get_event_constraints = x86_get_event_constraints,
+ .event_constraints = knc_event_constraints,
+ .format_attrs = intel_knc_formats_attr,
+};
+
+__init int knc_pmu_init(void)
+{
+ x86_pmu = knc_pmu;
+
+ memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 966512b2cacf..2e8caf03f593 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -56,6 +56,8 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
switch (boot_cpu_data.x86) {
case 6:
return msr - MSR_P6_PERFCTR0;
+ case 11:
+ return msr - MSR_KNC_PERFCTR0;
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
@@ -82,6 +84,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
switch (boot_cpu_data.x86) {
case 6:
return msr - MSR_P6_EVNTSEL0;
+ case 11:
+ return msr - MSR_KNC_EVNTSEL0;
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 8022c6681485..fbd895562292 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -140,10 +140,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
static void *c_start(struct seq_file *m, loff_t *pos)
{
- if (*pos == 0) /* just in case, cpu 0 is not the first */
- *pos = cpumask_first(cpu_online_mask);
- else
- *pos = cpumask_next(*pos - 1, cpu_online_mask);
+ *pos = cpumask_next(*pos - 1, cpu_online_mask);
if ((*pos) < nr_cpu_ids)
return &cpu_data(*pos);
return NULL;
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 39472dd2323f..60c78917190c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -199,12 +199,14 @@ static int __init cpuid_init(void)
goto out_chrdev;
}
cpuid_class->devnode = cpuid_devnode;
+ get_online_cpus();
for_each_online_cpu(i) {
err = cpuid_device_create(i);
if (err != 0)
goto out_class;
}
register_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ put_online_cpus();
err = 0;
goto out;
@@ -214,6 +216,7 @@ out_class:
for_each_online_cpu(i) {
cpuid_device_destroy(i);
}
+ put_online_cpus();
class_destroy(cpuid_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
@@ -225,11 +228,13 @@ static void __exit cpuid_exit(void)
{
int cpu = 0;
+ get_online_cpus();
for_each_online_cpu(cpu)
cpuid_device_destroy(cpu);
class_destroy(cpuid_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
+ put_online_cpus();
}
module_init(cpuid_init);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 3ae2ced4a874..b1581527a236 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -342,6 +342,47 @@ const struct irq_domain_ops ioapic_irq_domain_ops = {
.xlate = ioapic_xlate,
};
+static void dt_add_ioapic_domain(unsigned int ioapic_num,
+ struct device_node *np)
+{
+ struct irq_domain *id;
+ struct mp_ioapic_gsi *gsi_cfg;
+ int ret;
+ int num;
+
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic_num);
+ num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
+
+ id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops,
+ (void *)ioapic_num);
+ BUG_ON(!id);
+ if (gsi_cfg->gsi_base == 0) {
+ /*
+ * The first NR_IRQS_LEGACY irq descs are allocated in
+ * early_irq_init() and need just a mapping. The
+ * remaining irqs need both. All of them are preallocated
+ * and assigned so we can keep the 1:1 mapping which the ioapic
+ * is having.
+ */
+ ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
+ if (ret)
+ pr_err("Error mapping legacy IRQs: %d\n", ret);
+
+ if (num > NR_IRQS_LEGACY) {
+ ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
+ NR_IRQS_LEGACY, num - NR_IRQS_LEGACY);
+ if (ret)
+ pr_err("Error creating mapping for the "
+ "remaining IRQs: %d\n", ret);
+ }
+ irq_set_default_host(id);
+ } else {
+ ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num);
+ if (ret)
+ pr_err("Error creating IRQ mapping: %d\n", ret);
+ }
+}
+
static void __init ioapic_add_ofnode(struct device_node *np)
{
struct resource r;
@@ -356,15 +397,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)
for (i = 0; i < nr_ioapics; i++) {
if (r.start == mpc_ioapic_addr(i)) {
- struct irq_domain *id;
- struct mp_ioapic_gsi *gsi_cfg;
-
- gsi_cfg = mp_ioapic_gsi_routing(i);
-
- id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0,
- &ioapic_irq_domain_ops,
- (void*)i);
- BUG_ON(!id);
+ dt_add_ioapic_domain(i, np);
return;
}
}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 9ebbecab6e92..a1193aef6d7d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -299,6 +299,21 @@ ENTRY(ret_from_fork)
CFI_ENDPROC
END(ret_from_fork)
+ENTRY(ret_from_kernel_thread)
+ CFI_STARTPROC
+ pushl_cfi %eax
+ call schedule_tail
+ GET_THREAD_INFO(%ebp)
+ popl_cfi %eax
+ pushl_cfi $0x0202 # Reset kernel eflags
+ popfl_cfi
+ movl PT_EBP(%esp),%eax
+ call *PT_EBX(%esp)
+ movl $0,PT_EAX(%esp)
+ jmp syscall_exit
+ CFI_ENDPROC
+ENDPROC(ret_from_kernel_thread)
+
/*
* Interrupt exit functions should be protected against kprobes
*/
@@ -323,8 +338,7 @@ ret_from_intr:
andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
#else
/*
- * We can be coming here from a syscall done in the kernel space,
- * e.g. a failed kernel_execve().
+ * We can be coming here from child spawned by kernel_thread().
*/
movl PT_CS(%esp), %eax
andl $SEGMENT_RPL_MASK, %eax
@@ -616,6 +630,10 @@ work_notifysig: # deal with pending signals and
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
+1:
+#else
+ movl %esp, %eax
+#endif
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
movb PT_CS(%esp), %bl
@@ -626,24 +644,15 @@ work_notifysig: # deal with pending signals and
call do_notify_resume
jmp resume_userspace
+#ifdef CONFIG_VM86
ALIGN
work_notifysig_v86:
pushl_cfi %ecx # save ti_flags for do_notify_resume
call save_v86_state # %eax contains pt_regs pointer
popl_cfi %ecx
movl %eax, %esp
-#else
- movl %esp, %eax
+ jmp 1b
#endif
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- movb PT_CS(%esp), %bl
- andb $SEGMENT_RPL_MASK, %bl
- cmpb $USER_RPL, %bl
- jb resume_kernel
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace
END(work_pending)
# perform syscall exit tracing
@@ -732,7 +741,6 @@ ENDPROC(ptregs_##name)
PTREGSCALL1(iopl)
PTREGSCALL0(fork)
PTREGSCALL0(vfork)
-PTREGSCALL3(execve)
PTREGSCALL2(sigaltstack)
PTREGSCALL0(sigreturn)
PTREGSCALL0(rt_sigreturn)
@@ -1015,16 +1023,6 @@ END(spurious_interrupt_bug)
*/
.popsection
-ENTRY(kernel_thread_helper)
- pushl $0 # fake return address for unwinder
- CFI_STARTPROC
- movl %edi,%eax
- call *%esi
- call do_exit
- ud2 # padding for call trace
- CFI_ENDPROC
-ENDPROC(kernel_thread_helper)
-
#ifdef CONFIG_XEN
/* Xen doesn't set %esp to be precisely what the normal sysenter
entrypoint expects, so fix it up before using the normal path. */
@@ -1130,17 +1128,21 @@ ENTRY(ftrace_caller)
pushl %eax
pushl %ecx
pushl %edx
- movl 0xc(%esp), %eax
+ pushl $0 /* Pass NULL as regs pointer */
+ movl 4*4(%esp), %eax
movl 0x4(%ebp), %edx
+ leal function_trace_op, %ecx
subl $MCOUNT_INSN_SIZE, %eax
.globl ftrace_call
ftrace_call:
call ftrace_stub
+ addl $4,%esp /* skip NULL pointer */
popl %edx
popl %ecx
popl %eax
+ftrace_ret:
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.globl ftrace_graph_call
ftrace_graph_call:
@@ -1152,6 +1154,71 @@ ftrace_stub:
ret
END(ftrace_caller)
+ENTRY(ftrace_regs_caller)
+ pushf /* push flags before compare (in cs location) */
+ cmpl $0, function_trace_stop
+ jne ftrace_restore_flags
+
+ /*
+ * i386 does not save SS and ESP when coming from kernel.
+ * Instead, to get sp, &regs->sp is used (see ptrace.h).
+ * Unfortunately, that means eflags must be at the same location
+ * as the current return ip is. We move the return ip into the
+ * ip location, and move flags into the return ip location.
+ */
+ pushl 4(%esp) /* save return ip into ip slot */
+
+ pushl $0 /* Load 0 into orig_ax */
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ movl 13*4(%esp), %eax /* Get the saved flags */
+ movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
+ /* clobbering return ip */
+ movl $__KERNEL_CS,13*4(%esp)
+
+ movl 12*4(%esp), %eax /* Load ip (1st parameter) */
+ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
+ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
+ leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
+ pushl %esp /* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ addl $4, %esp /* Skip pt_regs */
+ movl 14*4(%esp), %eax /* Move flags back into cs */
+ movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
+ movl 12*4(%esp), %eax /* Get return ip from regs->ip */
+ movl %eax, 14*4(%esp) /* Put return ip back for ret */
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+ addl $8, %esp /* Skip orig_ax and ip */
+ popf /* Pop flags at end (no addl to corrupt flags) */
+ jmp ftrace_ret
+
+ftrace_restore_flags:
+ popf
+ jmp ftrace_stub
#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
@@ -1192,9 +1259,6 @@ END(mcount)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
pushl %eax
pushl %ecx
pushl %edx
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ce87e3d10c63..0c58952d64e8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -56,6 +56,7 @@
#include <asm/ftrace.h>
#include <asm/percpu.h>
#include <asm/asm.h>
+#include <asm/rcu.h>
#include <asm/smap.h>
#include <linux/err.h>
@@ -69,25 +70,51 @@
.section .entry.text, "ax"
#ifdef CONFIG_FUNCTION_TRACER
+
+#ifdef CC_USING_FENTRY
+# define function_hook __fentry__
+#else
+# define function_hook mcount
+#endif
+
#ifdef CONFIG_DYNAMIC_FTRACE
-ENTRY(mcount)
+
+ENTRY(function_hook)
retq
-END(mcount)
+END(function_hook)
+
+/* skip is set if stack has been adjusted */
+.macro ftrace_caller_setup skip=0
+ MCOUNT_SAVE_FRAME \skip
+
+ /* Load the ftrace_ops into the 3rd parameter */
+ leaq function_trace_op, %rdx
+
+ /* Load ip into the first parameter */
+ movq RIP(%rsp), %rdi
+ subq $MCOUNT_INSN_SIZE, %rdi
+ /* Load the parent_ip into the second parameter */
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
+ movq 8(%rbp), %rsi
+#endif
+.endm
ENTRY(ftrace_caller)
+ /* Check if tracing was disabled (quick check) */
cmpl $0, function_trace_stop
jne ftrace_stub
- MCOUNT_SAVE_FRAME
-
- movq 0x38(%rsp), %rdi
- movq 8(%rbp), %rsi
- subq $MCOUNT_INSN_SIZE, %rdi
+ ftrace_caller_setup
+ /* regs go into 4th parameter (but make it NULL) */
+ movq $0, %rcx
GLOBAL(ftrace_call)
call ftrace_stub
MCOUNT_RESTORE_FRAME
+ftrace_return:
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
GLOBAL(ftrace_graph_call)
@@ -98,8 +125,78 @@ GLOBAL(ftrace_stub)
retq
END(ftrace_caller)
+ENTRY(ftrace_regs_caller)
+ /* Save the current flags before compare (in SS location)*/
+ pushfq
+
+ /* Check if tracing was disabled (quick check) */
+ cmpl $0, function_trace_stop
+ jne ftrace_restore_flags
+
+ /* skip=8 to skip flags saved in SS */
+ ftrace_caller_setup 8
+
+ /* Save the rest of pt_regs */
+ movq %r15, R15(%rsp)
+ movq %r14, R14(%rsp)
+ movq %r13, R13(%rsp)
+ movq %r12, R12(%rsp)
+ movq %r11, R11(%rsp)
+ movq %r10, R10(%rsp)
+ movq %rbp, RBP(%rsp)
+ movq %rbx, RBX(%rsp)
+ /* Copy saved flags */
+ movq SS(%rsp), %rcx
+ movq %rcx, EFLAGS(%rsp)
+ /* Kernel segments */
+ movq $__KERNEL_DS, %rcx
+ movq %rcx, SS(%rsp)
+ movq $__KERNEL_CS, %rcx
+ movq %rcx, CS(%rsp)
+ /* Stack - skipping return address */
+ leaq SS+16(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ /* Copy flags back to SS, to restore them */
+ movq EFLAGS(%rsp), %rax
+ movq %rax, SS(%rsp)
+
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, SS+8(%rsp)
+
+ /* restore the rest of pt_regs */
+ movq R15(%rsp), %r15
+ movq R14(%rsp), %r14
+ movq R13(%rsp), %r13
+ movq R12(%rsp), %r12
+ movq R10(%rsp), %r10
+ movq RBP(%rsp), %rbp
+ movq RBX(%rsp), %rbx
+
+ /* skip=8 to skip flags saved in SS */
+ MCOUNT_RESTORE_FRAME 8
+
+ /* Restore flags */
+ popfq
+
+ jmp ftrace_return
+ftrace_restore_flags:
+ popfq
+ jmp ftrace_stub
+
+END(ftrace_regs_caller)
+
+
#else /* ! CONFIG_DYNAMIC_FTRACE */
-ENTRY(mcount)
+
+ENTRY(function_hook)
cmpl $0, function_trace_stop
jne ftrace_stub
@@ -120,8 +217,12 @@ GLOBAL(ftrace_stub)
trace:
MCOUNT_SAVE_FRAME
- movq 0x38(%rsp), %rdi
+ movq RIP(%rsp), %rdi
+#ifdef CC_USING_FENTRY
+ movq SS+16(%rsp), %rsi
+#else
movq 8(%rbp), %rsi
+#endif
subq $MCOUNT_INSN_SIZE, %rdi
call *ftrace_trace_function
@@ -129,20 +230,22 @@ trace:
MCOUNT_RESTORE_FRAME
jmp ftrace_stub
-END(mcount)
+END(function_hook)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
- cmpl $0, function_trace_stop
- jne ftrace_stub
-
MCOUNT_SAVE_FRAME
+#ifdef CC_USING_FENTRY
+ leaq SS+16(%rsp), %rdi
+ movq $0, %rdx /* No framepointers needed */
+#else
leaq 8(%rbp), %rdi
- movq 0x38(%rsp), %rsi
movq (%rbp), %rdx
+#endif
+ movq RIP(%rsp), %rsi
subq $MCOUNT_INSN_SIZE, %rsi
call prepare_ftrace_return
@@ -343,15 +446,15 @@ ENDPROC(native_usergs_sysret64)
.macro SAVE_ARGS_IRQ
cld
/* start from rbp in pt_regs and jump over */
- movq_cfi rdi, RDI-RBP
- movq_cfi rsi, RSI-RBP
- movq_cfi rdx, RDX-RBP
- movq_cfi rcx, RCX-RBP
- movq_cfi rax, RAX-RBP
- movq_cfi r8, R8-RBP
- movq_cfi r9, R9-RBP
- movq_cfi r10, R10-RBP
- movq_cfi r11, R11-RBP
+ movq_cfi rdi, (RDI-RBP)
+ movq_cfi rsi, (RSI-RBP)
+ movq_cfi rdx, (RDX-RBP)
+ movq_cfi rcx, (RCX-RBP)
+ movq_cfi rax, (RAX-RBP)
+ movq_cfi r8, (R8-RBP)
+ movq_cfi r9, (R9-RBP)
+ movq_cfi r10, (R10-RBP)
+ movq_cfi r11, (R11-RBP)
/* Save rbp so that we can unwind from get_irq_regs() */
movq_cfi rbp, 0
@@ -385,7 +488,7 @@ ENDPROC(native_usergs_sysret64)
.endm
ENTRY(save_rest)
- PARTIAL_FRAME 1 REST_SKIP+8
+ PARTIAL_FRAME 1 (REST_SKIP+8)
movq 5*8+16(%rsp), %r11 /* save return address */
movq_cfi rbx, RBX+16
movq_cfi rbp, RBP+16
@@ -441,7 +544,7 @@ ENTRY(ret_from_fork)
LOCK ; btr $TIF_FORK,TI_flags(%r8)
- pushq_cfi kernel_eflags(%rip)
+ pushq_cfi $0x0002
popfq_cfi # reset kernel eflags
call schedule_tail # rdi: 'prev' task parameter
@@ -451,7 +554,7 @@ ENTRY(ret_from_fork)
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- jz retint_restore_args
+ jz 1f
testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
@@ -459,6 +562,14 @@ ENTRY(ret_from_fork)
RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
jmp ret_from_sys_call # go to the SYSRET fastpath
+1:
+ subq $REST_SKIP, %rsp # leave space for volatiles
+ CFI_ADJUST_CFA_OFFSET REST_SKIP
+ movq %rbp, %rdi
+ call *%rbx
+ movl $0, RAX(%rsp)
+ RESTORE_REST
+ jmp int_ret_from_sys_call
CFI_ENDPROC
END(ret_from_fork)
@@ -567,7 +678,7 @@ sysret_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
jmp sysret_check
@@ -680,7 +791,7 @@ int_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -759,7 +870,6 @@ ENTRY(stub_execve)
PARTIAL_FRAME 0
SAVE_REST
FIXUP_TOP_OF_STACK %r11
- movq %rsp, %rcx
call sys_execve
RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
@@ -809,8 +919,7 @@ ENTRY(stub_x32_execve)
PARTIAL_FRAME 0
SAVE_REST
FIXUP_TOP_OF_STACK %r11
- movq %rsp, %rcx
- call sys32_execve
+ call compat_sys_execve
RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
RESTORE_REST
@@ -977,7 +1086,7 @@ retint_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
pushq_cfi %rdi
- call schedule
+ SCHEDULE_USER
popq_cfi %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -1215,52 +1324,6 @@ bad_gs:
jmp 2b
.previous
-ENTRY(kernel_thread_helper)
- pushq $0 # fake return address
- CFI_STARTPROC
- /*
- * Here we are in the child and the registers are set as they were
- * at kernel_thread() invocation in the parent.
- */
- call *%rsi
- # exit
- mov %eax, %edi
- call do_exit
- ud2 # padding for call trace
- CFI_ENDPROC
-END(kernel_thread_helper)
-
-/*
- * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
- *
- * C extern interface:
- * extern long execve(const char *name, char **argv, char **envp)
- *
- * asm input arguments:
- * rdi: name, rsi: argv, rdx: envp
- *
- * We want to fallback into:
- * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
- *
- * do_sys_execve asm fallback arguments:
- * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
- */
-ENTRY(kernel_execve)
- CFI_STARTPROC
- FAKE_STACK_FRAME $0
- SAVE_ALL
- movq %rsp,%rcx
- call sys_execve
- movq %rax, RAX(%rsp)
- RESTORE_REST
- testq %rax,%rax
- je int_ret_from_sys_call
- RESTORE_ARGS
- UNFAKE_STACK_FRAME
- ret
- CFI_ENDPROC
-END(kernel_execve)
-
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
CFI_STARTPROC
@@ -1458,7 +1521,7 @@ paranoid_userspace:
paranoid_schedule:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
+ SCHEDULE_USER
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
jmp paranoid_userspace
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index c3a7cb4bf6e6..1d414029f1d8 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -206,6 +206,21 @@ static int
ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
unsigned const char *new_code);
+/*
+ * Should never be called:
+ * As it is only called by __ftrace_replace_code() which is called by
+ * ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
+ * which is called to turn mcount into nops or nops into function calls
+ * but not to convert a function from not using regs to one that uses
+ * regs, which ftrace_modify_call() is for.
+ */
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+ unsigned long addr)
+{
+ WARN_ON(1);
+ return -EINVAL;
+}
+
int ftrace_update_ftrace_func(ftrace_func_t func)
{
unsigned long ip = (unsigned long)(&ftrace_call);
@@ -220,6 +235,14 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
ret = ftrace_modify_code(ip, old, new);
+ /* Also update the regs callback function */
+ if (!ret) {
+ ip = (unsigned long)(&ftrace_regs_call);
+ memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ ret = ftrace_modify_code(ip, old, new);
+ }
+
atomic_dec(&modifying_ftrace_code);
return ret;
@@ -299,6 +322,32 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)
return add_break(rec->ip, old);
}
+/*
+ * If the record has the FTRACE_FL_REGS set, that means that it
+ * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
+ * is not not set, then it wants to convert to the normal callback.
+ */
+static unsigned long get_ftrace_addr(struct dyn_ftrace *rec)
+{
+ if (rec->flags & FTRACE_FL_REGS)
+ return (unsigned long)FTRACE_REGS_ADDR;
+ else
+ return (unsigned long)FTRACE_ADDR;
+}
+
+/*
+ * The FTRACE_FL_REGS_EN is set when the record already points to
+ * a function that saves all the regs. Basically the '_EN' version
+ * represents the current state of the function.
+ */
+static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec)
+{
+ if (rec->flags & FTRACE_FL_REGS_EN)
+ return (unsigned long)FTRACE_REGS_ADDR;
+ else
+ return (unsigned long)FTRACE_ADDR;
+}
+
static int add_breakpoints(struct dyn_ftrace *rec, int enable)
{
unsigned long ftrace_addr;
@@ -306,7 +355,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
ret = ftrace_test_record(rec, enable);
- ftrace_addr = (unsigned long)FTRACE_ADDR;
+ ftrace_addr = get_ftrace_addr(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
@@ -316,6 +365,10 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)
/* converting nop to call */
return add_brk_on_nop(rec);
+ case FTRACE_UPDATE_MODIFY_CALL_REGS:
+ case FTRACE_UPDATE_MODIFY_CALL:
+ ftrace_addr = get_ftrace_old_addr(rec);
+ /* fall through */
case FTRACE_UPDATE_MAKE_NOP:
/* converting a call to a nop */
return add_brk_on_call(rec, ftrace_addr);
@@ -360,13 +413,21 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
* If not, don't touch the breakpoint, we make just create
* a disaster.
*/
- ftrace_addr = (unsigned long)FTRACE_ADDR;
+ ftrace_addr = get_ftrace_addr(rec);
+ nop = ftrace_call_replace(ip, ftrace_addr);
+
+ if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
+ goto update;
+
+ /* Check both ftrace_addr and ftrace_old_addr */
+ ftrace_addr = get_ftrace_old_addr(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
return -EINVAL;
}
+ update:
return probe_kernel_write((void *)ip, &nop[0], 1);
}
@@ -405,12 +466,14 @@ static int add_update(struct dyn_ftrace *rec, int enable)
ret = ftrace_test_record(rec, enable);
- ftrace_addr = (unsigned long)FTRACE_ADDR;
+ ftrace_addr = get_ftrace_addr(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
+ case FTRACE_UPDATE_MODIFY_CALL_REGS:
+ case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
return add_update_call(rec, ftrace_addr);
@@ -455,12 +518,14 @@ static int finish_update(struct dyn_ftrace *rec, int enable)
ret = ftrace_update_record(rec, enable);
- ftrace_addr = (unsigned long)FTRACE_ADDR;
+ ftrace_addr = get_ftrace_addr(rec);
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
+ case FTRACE_UPDATE_MODIFY_CALL_REGS:
+ case FTRACE_UPDATE_MODIFY_CALL:
case FTRACE_UPDATE_MAKE_CALL:
/* converting nop to call */
return finish_update_call(rec, ftrace_addr);
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 36d1853e91af..9a5c460404dc 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -263,7 +263,7 @@ static void i8259A_shutdown(void)
* out of.
*/
outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
- outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
+ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
}
static struct syscore_ops i8259_syscore_ops = {
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d44f7829968e..e4595f105910 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -92,7 +92,8 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, " Rescheduling interrupts\n");
seq_printf(p, "%*s: ", prec, "CAL");
for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+ seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
+ irq_stats(j)->irq_tlb_count);
seq_printf(p, " Function call interrupts\n");
seq_printf(p, "%*s: ", prec, "TLB");
for_each_online_cpu(j)
@@ -147,7 +148,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
#ifdef CONFIG_SMP
sum += irq_stats(cpu)->irq_resched_count;
sum += irq_stats(cpu)->irq_call_count;
- sum += irq_stats(cpu)->irq_tlb_count;
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
sum += irq_stats(cpu)->irq_thermal_count;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 3f61904365cf..836f8322960e 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -746,7 +746,9 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
+#ifdef CONFIG_DEBUG_RODATA
char opc[BREAK_INSTR_SIZE];
+#endif /* CONFIG_DEBUG_RODATA */
bpt->type = BP_BREAKPOINT;
err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index e2f751efb7b1..57916c0d3cf6 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -541,6 +541,23 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb
return 1;
}
+#ifdef KPROBES_CAN_USE_FTRACE
+static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ /*
+ * Emulate singlestep (and also recover regs->ip)
+ * as if there is a 5byte nop
+ */
+ regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ __this_cpu_write(current_kprobe, NULL);
+}
+#endif
+
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled throughout this function.
@@ -599,6 +616,12 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
} else if (kprobe_running()) {
p = __this_cpu_read(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
+#ifdef KPROBES_CAN_USE_FTRACE
+ if (kprobe_ftrace(p)) {
+ skip_singlestep(p, regs, kcb);
+ return 1;
+ }
+#endif
setup_singlestep(p, regs, kcb, 0);
return 1;
}
@@ -1052,6 +1075,50 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
+#ifdef KPROBES_CAN_USE_FTRACE
+/* Ftrace callback handler for kprobes */
+void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+ unsigned long flags;
+
+ /* Disable irq for emulating a breakpoint and avoiding preempt */
+ local_irq_save(flags);
+
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (unlikely(!p) || kprobe_disabled(p))
+ goto end;
+
+ kcb = get_kprobe_ctlblk();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ } else {
+ /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */
+ regs->ip = ip + sizeof(kprobe_opcode_t);
+
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ if (!p->pre_handler || !p->pre_handler(p, regs))
+ skip_singlestep(p, regs, kcb);
+ /*
+ * If pre_handler returns !0, it sets regs->ip and
+ * resets current kprobe.
+ */
+ }
+end:
+ local_irq_restore(flags);
+}
+
+int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ p->ainsn.boostable = -1;
+ return 0;
+}
+#endif
+
int __init arch_init_kprobes(void)
{
return arch_init_optprobes();
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index c1d61ee4b4f1..b3e5e51bc907 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -354,6 +354,7 @@ static void kvm_pv_guest_cpu_reboot(void *unused)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
wrmsrl(MSR_KVM_PV_EOI_EN, 0);
kvm_pv_disable_apf();
+ kvm_disable_steal_time();
}
static int kvm_pv_reboot_notify(struct notifier_block *nb,
@@ -396,9 +397,7 @@ void kvm_disable_steal_time(void)
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
-#ifdef CONFIG_KVM_CLOCK
WARN_ON(kvm_register_clock("primary cpu clock"));
-#endif
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
}
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 82746f942cd8..7720ff5a9ee2 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -75,20 +75,113 @@ struct microcode_amd {
static struct equiv_cpu_entry *equiv_cpu_table;
-/* page-sized ucode patch buffer */
-void *patch;
+struct ucode_patch {
+ struct list_head plist;
+ void *data;
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+static LIST_HEAD(pcache);
+
+static u16 find_equiv_id(unsigned int cpu)
+{
+ struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ int i = 0;
+
+ if (!equiv_cpu_table)
+ return 0;
+
+ while (equiv_cpu_table[i].installed_cpu != 0) {
+ if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu)
+ return equiv_cpu_table[i].equiv_cpu;
+
+ i++;
+ }
+ return 0;
+}
+
+static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu)
+{
+ int i = 0;
+
+ BUG_ON(!equiv_cpu_table);
+
+ while (equiv_cpu_table[i].equiv_cpu != 0) {
+ if (equiv_cpu == equiv_cpu_table[i].equiv_cpu)
+ return equiv_cpu_table[i].installed_cpu;
+ i++;
+ }
+ return 0;
+}
+
+/*
+ * a small, trivial cache of per-family ucode patches
+ */
+static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
+{
+ struct ucode_patch *p;
+
+ list_for_each_entry(p, &pcache, plist)
+ if (p->equiv_cpu == equiv_cpu)
+ return p;
+ return NULL;
+}
+
+static void update_cache(struct ucode_patch *new_patch)
+{
+ struct ucode_patch *p;
+
+ list_for_each_entry(p, &pcache, plist) {
+ if (p->equiv_cpu == new_patch->equiv_cpu) {
+ if (p->patch_id >= new_patch->patch_id)
+ /* we already have the latest patch */
+ return;
+
+ list_replace(&p->plist, &new_patch->plist);
+ kfree(p->data);
+ kfree(p);
+ return;
+ }
+ }
+ /* no patch found, add it */
+ list_add_tail(&new_patch->plist, &pcache);
+}
+
+static void free_cache(void)
+{
+ struct ucode_patch *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &pcache, plist) {
+ __list_del(p->plist.prev, p->plist.next);
+ kfree(p->data);
+ kfree(p);
+ }
+}
+
+static struct ucode_patch *find_patch(unsigned int cpu)
+{
+ u16 equiv_id;
+
+ equiv_id = find_equiv_id(cpu);
+ if (!equiv_id)
+ return NULL;
+
+ return cache_find_patch(equiv_id);
+}
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ csig->sig = cpuid_eax(0x00000001);
csig->rev = c->microcode;
pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
return 0;
}
-static unsigned int verify_ucode_size(int cpu, u32 patch_size,
+static unsigned int verify_patch_size(int cpu, u32 patch_size,
unsigned int size)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -118,95 +211,37 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size,
return patch_size;
}
-static u16 find_equiv_id(void)
+static int apply_microcode_amd(int cpu)
{
- unsigned int current_cpu_id, i = 0;
-
- BUG_ON(equiv_cpu_table == NULL);
-
- current_cpu_id = cpuid_eax(0x00000001);
-
- while (equiv_cpu_table[i].installed_cpu != 0) {
- if (current_cpu_id == equiv_cpu_table[i].installed_cpu)
- return equiv_cpu_table[i].equiv_cpu;
-
- i++;
- }
- return 0;
-}
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct microcode_amd *mc_amd;
+ struct ucode_cpu_info *uci;
+ struct ucode_patch *p;
+ u32 rev, dummy;
-/*
- * we signal a good patch is found by returning its size > 0
- */
-static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
- unsigned int leftover_size, int rev,
- unsigned int *current_size)
-{
- struct microcode_header_amd *mc_hdr;
- unsigned int actual_size, patch_size;
- u16 equiv_cpu_id;
+ BUG_ON(raw_smp_processor_id() != cpu);
- /* size of the current patch we're staring at */
- patch_size = *(u32 *)(ucode_ptr + 4);
- *current_size = patch_size + SECTION_HDR_SIZE;
+ uci = ucode_cpu_info + cpu;
- equiv_cpu_id = find_equiv_id();
- if (!equiv_cpu_id)
+ p = find_patch(cpu);
+ if (!p)
return 0;
- /*
- * let's look at the patch header itself now
- */
- mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE);
+ mc_amd = p->data;
+ uci->mc = p->data;
- if (mc_hdr->processor_rev_id != equiv_cpu_id)
- return 0;
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- /* ucode might be chipset specific -- currently we don't support this */
- if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
- pr_err("CPU%d: chipset specific code not yet supported\n",
- cpu);
+ /* need to apply patch? */
+ if (rev >= mc_amd->hdr.patch_id) {
+ c->microcode = rev;
return 0;
}
- if (mc_hdr->patch_id <= rev)
- return 0;
-
- /*
- * now that the header looks sane, verify its size
- */
- actual_size = verify_ucode_size(cpu, patch_size, leftover_size);
- if (!actual_size)
- return 0;
-
- /* clear the patch buffer */
- memset(patch, 0, PAGE_SIZE);
-
- /* all looks ok, get the binary patch */
- get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size);
-
- return actual_size;
-}
-
-static int apply_microcode_amd(int cpu)
-{
- u32 rev, dummy;
- int cpu_num = raw_smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- struct microcode_amd *mc_amd = uci->mc;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- /* We should bind the task to the CPU */
- BUG_ON(cpu_num != cpu);
-
- if (mc_amd == NULL)
- return 0;
-
wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
- /* get patch id after patching */
- rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- /* check current patch id and patch's id for match */
+ /* verify patch application was successful */
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
if (rev != mc_amd->hdr.patch_id) {
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
cpu, mc_amd->hdr.patch_id);
@@ -238,7 +273,7 @@ static int install_equiv_cpu_table(const u8 *buf)
return -ENOMEM;
}
- get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
+ memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
/* add header length */
return size + CONTAINER_HDR_SZ;
@@ -250,61 +285,113 @@ static void free_equiv_cpu_table(void)
equiv_cpu_table = NULL;
}
-static enum ucode_state
-generic_load_microcode(int cpu, const u8 *data, size_t size)
+static void cleanup(void)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- struct microcode_header_amd *mc_hdr = NULL;
- unsigned int mc_size, leftover, current_size = 0;
+ free_equiv_cpu_table();
+ free_cache();
+}
+
+/*
+ * We return the current size even if some of the checks failed so that
+ * we can skip over the next patch. If we return a negative value, we
+ * signal a grave error like a memory allocation has failed and the
+ * driver cannot continue functioning normally. In such cases, we tear
+ * down everything we've used up so far and exit.
+ */
+static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ struct microcode_header_amd *mc_hdr;
+ struct ucode_patch *patch;
+ unsigned int patch_size, crnt_size, ret;
+ u32 proc_fam;
+ u16 proc_id;
+
+ patch_size = *(u32 *)(fw + 4);
+ crnt_size = patch_size + SECTION_HDR_SIZE;
+ mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
+ proc_id = mc_hdr->processor_rev_id;
+
+ proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
+ if (!proc_fam) {
+ pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
+ return crnt_size;
+ }
+
+ /* check if patch is for the current family */
+ proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
+ if (proc_fam != c->x86)
+ return crnt_size;
+
+ if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
+ pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
+ mc_hdr->patch_id);
+ return crnt_size;
+ }
+
+ ret = verify_patch_size(cpu, patch_size, leftover);
+ if (!ret) {
+ pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
+ return crnt_size;
+ }
+
+ patch = kzalloc(sizeof(*patch), GFP_KERNEL);
+ if (!patch) {
+ pr_err("Patch allocation failure.\n");
+ return -EINVAL;
+ }
+
+ patch->data = kzalloc(patch_size, GFP_KERNEL);
+ if (!patch->data) {
+ pr_err("Patch data allocation failure.\n");
+ kfree(patch);
+ return -EINVAL;
+ }
+
+ /* All looks ok, copy patch... */
+ memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
+ INIT_LIST_HEAD(&patch->plist);
+ patch->patch_id = mc_hdr->patch_id;
+ patch->equiv_cpu = proc_id;
+
+ /* ... and add to cache. */
+ update_cache(patch);
+
+ return crnt_size;
+}
+
+static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size)
+{
+ enum ucode_state ret = UCODE_ERROR;
+ unsigned int leftover;
+ u8 *fw = (u8 *)data;
+ int crnt_size = 0;
int offset;
- const u8 *ucode_ptr = data;
- void *new_mc = NULL;
- unsigned int new_rev = uci->cpu_sig.rev;
- enum ucode_state state = UCODE_ERROR;
- offset = install_equiv_cpu_table(ucode_ptr);
+ offset = install_equiv_cpu_table(data);
if (offset < 0) {
pr_err("failed to create equivalent cpu table\n");
- goto out;
+ return ret;
}
- ucode_ptr += offset;
+ fw += offset;
leftover = size - offset;
- if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) {
+ if (*(u32 *)fw != UCODE_UCODE_TYPE) {
pr_err("invalid type field in container file section header\n");
- goto free_table;
+ free_equiv_cpu_table();
+ return ret;
}
while (leftover) {
- mc_size = get_matching_microcode(cpu, ucode_ptr, leftover,
- new_rev, &current_size);
- if (mc_size) {
- mc_hdr = patch;
- new_mc = patch;
- new_rev = mc_hdr->patch_id;
- goto out_ok;
- }
-
- ucode_ptr += current_size;
- leftover -= current_size;
- }
+ crnt_size = verify_and_add_patch(cpu, fw, leftover);
+ if (crnt_size < 0)
+ return ret;
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto free_table;
+ fw += crnt_size;
+ leftover -= crnt_size;
}
-out_ok:
- uci->mc = new_mc;
- state = UCODE_OK;
- pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
- cpu, uci->cpu_sig.rev, new_rev);
-
-free_table:
- free_equiv_cpu_table();
-
-out:
- return state;
+ return UCODE_OK;
}
/*
@@ -315,7 +402,7 @@ out:
*
* This legacy file is always smaller than 2K in size.
*
- * Starting at family 15h they are in family specific firmware files:
+ * Beginning with family 15h, they are in family-specific firmware files:
*
* amd-ucode/microcode_amd_fam15h.bin
* amd-ucode/microcode_amd_fam16h.bin
@@ -323,12 +410,17 @@ out:
*
* These might be larger than 2K.
*/
-static enum ucode_state request_microcode_amd(int cpu, struct device *device)
+static enum ucode_state request_microcode_amd(int cpu, struct device *device,
+ bool refresh_fw)
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
- const struct firmware *fw;
- enum ucode_state ret = UCODE_NFOUND;
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ enum ucode_state ret = UCODE_NFOUND;
+ const struct firmware *fw;
+
+ /* reload ucode container only on the boot cpu */
+ if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+ return UCODE_OK;
if (c->x86 >= 0x15)
snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
@@ -344,12 +436,17 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)
goto fw_release;
}
- ret = generic_load_microcode(cpu, fw->data, fw->size);
+ /* free old equiv table */
+ free_equiv_cpu_table();
+
+ ret = load_microcode_amd(cpu, fw->data, fw->size);
+ if (ret != UCODE_OK)
+ cleanup();
-fw_release:
+ fw_release:
release_firmware(fw);
-out:
+ out:
return ret;
}
@@ -383,14 +480,10 @@ struct microcode_ops * __init init_amd_microcode(void)
return NULL;
}
- patch = (void *)get_zeroed_page(GFP_KERNEL);
- if (!patch)
- return NULL;
-
return &microcode_amd_ops;
}
void __exit exit_amd_microcode(void)
{
- free_page((unsigned long)patch);
+ cleanup();
}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 4873e62db6a1..3a04b224d0c0 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -225,6 +225,9 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
if (do_microcode_update(buf, len) == 0)
ret = (ssize_t)len;
+ if (ret > 0)
+ perf_check_microcode();
+
mutex_unlock(&microcode_mutex);
put_online_cpus();
@@ -276,19 +279,18 @@ static struct platform_device *microcode_pdev;
static int reload_for_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ enum ucode_state ustate;
int err = 0;
- if (uci->valid) {
- enum ucode_state ustate;
-
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
- if (ustate == UCODE_OK)
- apply_microcode_on_target(cpu);
- else
- if (ustate == UCODE_ERROR)
- err = -EINVAL;
- }
+ if (!uci->valid)
+ return err;
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
+ if (ustate == UCODE_OK)
+ apply_microcode_on_target(cpu);
+ else
+ if (ustate == UCODE_ERROR)
+ err = -EINVAL;
return err;
}
@@ -370,18 +372,15 @@ static void microcode_fini_cpu(int cpu)
static enum ucode_state microcode_resume_cpu(int cpu)
{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- if (!uci->mc)
- return UCODE_NFOUND;
-
pr_debug("CPU%d updated upon resume\n", cpu);
- apply_microcode_on_target(cpu);
+
+ if (apply_microcode_on_target(cpu))
+ return UCODE_ERROR;
return UCODE_OK;
}
-static enum ucode_state microcode_init_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
{
enum ucode_state ustate;
@@ -392,7 +391,8 @@ static enum ucode_state microcode_init_cpu(int cpu)
if (system_state != SYSTEM_RUNNING)
return UCODE_NFOUND;
- ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+ ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev,
+ refresh_fw);
if (ustate == UCODE_OK) {
pr_debug("CPU%d updated upon init\n", cpu);
@@ -405,14 +405,11 @@ static enum ucode_state microcode_init_cpu(int cpu)
static enum ucode_state microcode_update_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- enum ucode_state ustate;
if (uci->valid)
- ustate = microcode_resume_cpu(cpu);
- else
- ustate = microcode_init_cpu(cpu);
+ return microcode_resume_cpu(cpu);
- return ustate;
+ return microcode_init_cpu(cpu, false);
}
static int mc_device_add(struct device *dev, struct subsys_interface *sif)
@@ -428,7 +425,7 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
if (err)
return err;
- if (microcode_init_cpu(cpu) == UCODE_ERROR)
+ if (microcode_init_cpu(cpu, true) == UCODE_ERROR)
return -EINVAL;
return err;
@@ -477,34 +474,41 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
struct device *dev;
dev = get_cpu_device(cpu);
- switch (action) {
+
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
microcode_update_cpu(cpu);
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
pr_debug("CPU%d added\n", cpu);
+ /*
+ * "break" is missing on purpose here because we want to fall
+ * through in order to create the sysfs group.
+ */
+
+ case CPU_DOWN_FAILED:
if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
break;
+
case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
break;
/*
+ * case CPU_DEAD:
+ *
* When a CPU goes offline, don't free up or invalidate the copy of
* the microcode in kernel memory, so that we can reuse it when the
* CPU comes back online without unnecessarily requesting the userspace
* for it again.
*/
- case CPU_UP_CANCELED_FROZEN:
- /* The CPU refused to come up during a system resume */
- microcode_fini_cpu(cpu);
- break;
}
+
+ /* The CPU refused to come up during a system resume */
+ if (action == CPU_UP_CANCELED_FROZEN)
+ microcode_fini_cpu(cpu);
+
return NOTIFY_OK;
}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0327e2b3c408..3544aed39338 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -405,7 +405,8 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
return 0;
}
-static enum ucode_state request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device,
+ bool refresh_fw)
{
char name[30];
struct cpuinfo_x86 *c = &cpu_data(cpu);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index eb113693f043..a7c5661f8496 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -257,12 +257,14 @@ static int __init msr_init(void)
goto out_chrdev;
}
msr_class->devnode = msr_devnode;
+ get_online_cpus();
for_each_online_cpu(i) {
err = msr_device_create(i);
if (err != 0)
goto out_class;
}
register_hotcpu_notifier(&msr_class_cpu_notifier);
+ put_online_cpus();
err = 0;
goto out;
@@ -271,6 +273,7 @@ out_class:
i = 0;
for_each_online_cpu(i)
msr_device_destroy(i);
+ put_online_cpus();
class_destroy(msr_class);
out_chrdev:
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
@@ -281,11 +284,13 @@ out:
static void __exit msr_exit(void)
{
int cpu = 0;
+ get_online_cpus();
for_each_online_cpu(cpu)
msr_device_destroy(cpu);
class_destroy(msr_class);
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
unregister_hotcpu_notifier(&msr_class_cpu_notifier);
+ put_online_cpus();
}
module_init(msr_init);
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
new file mode 100644
index 000000000000..e309cc5c276e
--- /dev/null
+++ b/arch/x86/kernel/perf_regs.c
@@ -0,0 +1,105 @@
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <linux/stddef.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_32
+#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX
+#else
+#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX
+#endif
+
+#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
+
+static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
+ PT_REGS_OFFSET(PERF_REG_X86_AX, ax),
+ PT_REGS_OFFSET(PERF_REG_X86_BX, bx),
+ PT_REGS_OFFSET(PERF_REG_X86_CX, cx),
+ PT_REGS_OFFSET(PERF_REG_X86_DX, dx),
+ PT_REGS_OFFSET(PERF_REG_X86_SI, si),
+ PT_REGS_OFFSET(PERF_REG_X86_DI, di),
+ PT_REGS_OFFSET(PERF_REG_X86_BP, bp),
+ PT_REGS_OFFSET(PERF_REG_X86_SP, sp),
+ PT_REGS_OFFSET(PERF_REG_X86_IP, ip),
+ PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags),
+ PT_REGS_OFFSET(PERF_REG_X86_CS, cs),
+ PT_REGS_OFFSET(PERF_REG_X86_SS, ss),
+#ifdef CONFIG_X86_32
+ PT_REGS_OFFSET(PERF_REG_X86_DS, ds),
+ PT_REGS_OFFSET(PERF_REG_X86_ES, es),
+ PT_REGS_OFFSET(PERF_REG_X86_FS, fs),
+ PT_REGS_OFFSET(PERF_REG_X86_GS, gs),
+#else
+ /*
+ * The pt_regs struct does not store
+ * ds, es, fs, gs in 64 bit mode.
+ */
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+ (unsigned int) -1,
+#endif
+#ifdef CONFIG_X86_64
+ PT_REGS_OFFSET(PERF_REG_X86_R8, r8),
+ PT_REGS_OFFSET(PERF_REG_X86_R9, r9),
+ PT_REGS_OFFSET(PERF_REG_X86_R10, r10),
+ PT_REGS_OFFSET(PERF_REG_X86_R11, r11),
+ PT_REGS_OFFSET(PERF_REG_X86_R12, r12),
+ PT_REGS_OFFSET(PERF_REG_X86_R13, r13),
+ PT_REGS_OFFSET(PERF_REG_X86_R14, r14),
+ PT_REGS_OFFSET(PERF_REG_X86_R15, r15),
+#endif
+};
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+ if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
+ return 0;
+
+ return regs_get_register(regs, pt_regs_offset[idx]);
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL))
+
+#ifdef CONFIG_X86_32
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || mask & REG_RESERVED)
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ return PERF_SAMPLE_REGS_ABI_32;
+}
+#else /* CONFIG_X86_64 */
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
+ (1ULL << PERF_REG_X86_ES) | \
+ (1ULL << PERF_REG_X86_FS) | \
+ (1ULL << PERF_REG_X86_GS))
+
+int perf_reg_validate(u64 mask)
+{
+ if (!mask || mask & REG_RESERVED)
+ return -EINVAL;
+
+ if (mask & REG_NOSUPPORT)
+ return -EINVAL;
+
+ return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+ if (test_tsk_thread_flag(task, TIF_IA32))
+ return PERF_SAMPLE_REGS_ABI_32;
+ else
+ return PERF_SAMPLE_REGS_ABI_64;
+}
+#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index dc3567e083f9..b644e1c765dc 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -293,71 +293,6 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
}
/*
- * This gets run with %si containing the
- * function to call, and %di containing
- * the "args".
- */
-extern void kernel_thread_helper(void);
-
-/*
- * Create a kernel thread
- */
-int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
-{
- struct pt_regs regs;
-
- memset(&regs, 0, sizeof(regs));
-
- regs.si = (unsigned long) fn;
- regs.di = (unsigned long) arg;
-
-#ifdef CONFIG_X86_32
- regs.ds = __USER_DS;
- regs.es = __USER_DS;
- regs.fs = __KERNEL_PERCPU;
- regs.gs = __KERNEL_STACK_CANARY;
-#else
- regs.ss = __KERNEL_DS;
-#endif
-
- regs.orig_ax = -1;
- regs.ip = (unsigned long) kernel_thread_helper;
- regs.cs = __KERNEL_CS | get_kernel_rpl();
- regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
-
- /* Ok, create the new process.. */
- return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
-}
-EXPORT_SYMBOL(kernel_thread);
-
-/*
- * sys_execve() executes a new program.
- */
-long sys_execve(const char __user *name,
- const char __user *const __user *argv,
- const char __user *const __user *envp, struct pt_regs *regs)
-{
- long error;
- char *filename;
-
- filename = getname(name);
- error = PTR_ERR(filename);
- if (IS_ERR(filename))
- return error;
- error = do_execve(filename, argv, envp, regs);
-
-#ifdef CONFIG_X86_32
- if (error == 0) {
- /* Make sure we don't return using sysenter.. */
- set_thread_flag(TIF_IRET);
- }
-#endif
-
- putname(filename);
- return error;
-}
-
-/*
* Idle related variables and functions
*/
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index b9ff83c7135b..44e0bff38e72 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,6 +57,7 @@
#include <asm/switch_to.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
/*
* Return saved PC of a blocked thread.
@@ -127,23 +128,39 @@ void release_thread(struct task_struct *dead_task)
}
int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long unused,
+ unsigned long arg,
struct task_struct *p, struct pt_regs *regs)
{
- struct pt_regs *childregs;
+ struct pt_regs *childregs = task_pt_regs(p);
struct task_struct *tsk;
int err;
- childregs = task_pt_regs(p);
+ p->thread.sp = (unsigned long) childregs;
+ p->thread.sp0 = (unsigned long) (childregs+1);
+
+ if (unlikely(!regs)) {
+ /* kernel thread */
+ memset(childregs, 0, sizeof(struct pt_regs));
+ p->thread.ip = (unsigned long) ret_from_kernel_thread;
+ task_user_gs(p) = __KERNEL_STACK_CANARY;
+ childregs->ds = __USER_DS;
+ childregs->es = __USER_DS;
+ childregs->fs = __KERNEL_PERCPU;
+ childregs->bx = sp; /* function */
+ childregs->bp = arg;
+ childregs->orig_ax = -1;
+ childregs->cs = __KERNEL_CS | get_kernel_rpl();
+ childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
+ p->fpu_counter = 0;
+ p->thread.io_bitmap_ptr = NULL;
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+ return 0;
+ }
*childregs = *regs;
childregs->ax = 0;
childregs->sp = sp;
- p->thread.sp = (unsigned long) childregs;
- p->thread.sp0 = (unsigned long) (childregs+1);
-
p->thread.ip = (unsigned long) ret_from_fork;
-
task_user_gs(p) = get_user_gs(regs);
p->fpu_counter = 0;
@@ -190,6 +207,12 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
regs->cs = __USER_CS;
regs->ip = new_ip;
regs->sp = new_sp;
+ regs->flags = X86_EFLAGS_IF;
+ /*
+ * force it to the iret return path by making it look as if there was
+ * some work pending.
+ */
+ set_thread_flag(TIF_NOTIFY_RESUME);
}
EXPORT_SYMBOL_GPL(start_thread);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 8a6d20ce1978..16c6365e2b86 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -146,29 +146,18 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
}
int copy_thread(unsigned long clone_flags, unsigned long sp,
- unsigned long unused,
+ unsigned long arg,
struct task_struct *p, struct pt_regs *regs)
{
int err;
struct pt_regs *childregs;
struct task_struct *me = current;
- childregs = ((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(p))) - 1;
- *childregs = *regs;
-
- childregs->ax = 0;
- if (user_mode(regs))
- childregs->sp = sp;
- else
- childregs->sp = (unsigned long)childregs;
-
+ p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
+ childregs = task_pt_regs(p);
p->thread.sp = (unsigned long) childregs;
- p->thread.sp0 = (unsigned long) (childregs+1);
p->thread.usersp = me->thread.usersp;
-
set_tsk_thread_flag(p, TIF_FORK);
-
p->fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
@@ -178,6 +167,24 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
+
+ if (unlikely(!regs)) {
+ /* kernel thread */
+ memset(childregs, 0, sizeof(struct pt_regs));
+ childregs->sp = (unsigned long)childregs;
+ childregs->ss = __KERNEL_DS;
+ childregs->bx = sp; /* function */
+ childregs->bp = arg;
+ childregs->orig_ax = -1;
+ childregs->cs = __KERNEL_CS | get_kernel_rpl();
+ childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
+ return 0;
+ }
+ *childregs = *regs;
+
+ childregs->ax = 0;
+ childregs->sp = sp;
err = -ENOMEM;
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 861a9d1a463d..b00b33a18390 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,6 +21,7 @@
#include <linux/signal.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/rcupdate.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -1460,6 +1461,8 @@ long syscall_trace_enter(struct pt_regs *regs)
{
long ret = 0;
+ rcu_user_exit();
+
/*
* If we stepped into a sysenter/syscall insn, it trapped in
* kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
@@ -1523,4 +1526,6 @@ void syscall_trace_leave(struct pt_regs *regs)
!test_thread_flag(TIF_SYSCALL_EMU);
if (step || test_thread_flag(TIF_SYSCALL_TRACE))
tracehook_report_syscall_exit(regs, step);
+
+ rcu_user_enter();
}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index af6db6ec5b2a..4929c1be0ac0 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -225,7 +225,7 @@ static struct platform_device rtc_device = {
static __init int add_rtc_cmos(void)
{
#ifdef CONFIG_PNP
- static const char *ids[] __initconst =
+ static const char * const const ids[] __initconst =
{ "PNP0b00", "PNP0b01", "PNP0b02", };
struct pnp_dev *dev;
struct pnp_id *id;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 198e774498ec..468e98dfd44e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -68,6 +68,7 @@
#include <linux/percpu.h>
#include <linux/crash_dump.h>
#include <linux/tboot.h>
+#include <linux/jiffies.h>
#include <video/edid.h>
@@ -970,13 +971,11 @@ void __init setup_arch(char **cmdline_p)
initmem_init();
memblock_find_dma_reserve();
-#ifdef CONFIG_KVM_CLOCK
+#ifdef CONFIG_KVM_GUEST
kvmclock_init();
#endif
- x86_init.paging.pagetable_setup_start(swapper_pg_dir);
- paging_init();
- x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+ x86_init.paging.pagetable_init();
if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
@@ -1047,6 +1046,8 @@ void __init setup_arch(char **cmdline_p)
mcheck_init();
arch_init_ideal_nops();
+
+ register_refined_jiffies(CLOCK_TICK_RATE);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 036bddb46236..29ad351804e9 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -816,6 +816,8 @@ static void do_signal(struct pt_regs *regs)
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
+ rcu_user_exit();
+
#ifdef CONFIG_X86_MCE
/* notify userspace of pending MCEs */
if (thread_info_flags & _TIF_MCE_NOTIFY)
@@ -838,9 +840,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
fire_user_return_notifiers();
-#ifdef CONFIG_X86_32
- clear_thread_flag(TIF_IRET);
-#endif /* CONFIG_X86_32 */
+ rcu_user_enter();
}
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7c5a8c314c02..c80a33bc528b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -665,7 +665,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
unsigned long boot_error = 0;
int timeout;
- alternatives_smp_switch(1);
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
idle->thread.sp = (unsigned long) (((struct pt_regs *)
(THREAD_SIZE + task_stack_page(idle))) - 1);
@@ -1053,20 +1054,6 @@ out:
preempt_enable();
}
-void arch_disable_nonboot_cpus_begin(void)
-{
- /*
- * Avoid the smp alternatives switch during the disable_nonboot_cpus().
- * In the suspend path, we will be back in the SMP mode shortly anyways.
- */
- skip_smp_alternatives = true;
-}
-
-void arch_disable_nonboot_cpus_end(void)
-{
- skip_smp_alternatives = false;
-}
-
void arch_enable_nonboot_cpus_begin(void)
{
set_mtrr_aps_delayed_init();
@@ -1256,9 +1243,6 @@ void native_cpu_die(unsigned int cpu)
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
if (system_state == SYSTEM_RUNNING)
pr_info("CPU %u is now offline\n", cpu);
-
- if (1 == num_online_cpus())
- alternatives_smp_switch(0);
return;
}
msleep(100);
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index c346d1161488..cd3b2438a980 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -157,6 +157,33 @@ static int enable_single_step(struct task_struct *child)
return 1;
}
+void set_task_blockstep(struct task_struct *task, bool on)
+{
+ unsigned long debugctl;
+
+ /*
+ * Ensure irq/preemption can't change debugctl in between.
+ * Note also that both TIF_BLOCKSTEP and debugctl should
+ * be changed atomically wrt preemption.
+ * FIXME: this means that set/clear TIF_BLOCKSTEP is simply
+ * wrong if task != current, SIGKILL can wakeup the stopped
+ * tracee and set/clear can play with the running task, this
+ * can confuse the next __switch_to_xtra().
+ */
+ local_irq_disable();
+ debugctl = get_debugctlmsr();
+ if (on) {
+ debugctl |= DEBUGCTLMSR_BTF;
+ set_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ } else {
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ clear_tsk_thread_flag(task, TIF_BLOCKSTEP);
+ }
+ if (task == current)
+ update_debugctlmsr(debugctl);
+ local_irq_enable();
+}
+
/*
* Enable single or block step.
*/
@@ -169,19 +196,10 @@ static void enable_step(struct task_struct *child, bool block)
* So no one should try to use debugger block stepping in a program
* that uses user-mode single stepping itself.
*/
- if (enable_single_step(child) && block) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl |= DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- set_tsk_thread_flag(child, TIF_BLOCKSTEP);
- } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
- }
+ if (enable_single_step(child) && block)
+ set_task_blockstep(child, true);
+ else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
}
void user_enable_single_step(struct task_struct *child)
@@ -199,13 +217,8 @@ void user_disable_single_step(struct task_struct *child)
/*
* Make sure block stepping (BTF) is disabled.
*/
- if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
- }
+ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP))
+ set_task_blockstep(child, false);
/* Always clear TIF_SINGLESTEP... */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
deleted file mode 100644
index 0b0cb5fede19..000000000000
--- a/arch/x86/kernel/sys_i386_32.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * This file contains various random system calls that
- * have a non-standard calling sequence on the Linux/i386
- * platform.
- */
-
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/smp.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
-#include <linux/shm.h>
-#include <linux/stat.h>
-#include <linux/syscalls.h>
-#include <linux/mman.h>
-#include <linux/file.h>
-#include <linux/utsname.h>
-#include <linux/ipc.h>
-
-#include <linux/uaccess.h>
-#include <linux/unistd.h>
-
-#include <asm/syscalls.h>
-
-/*
- * Do a system call from kernel instead of calling sys_execve so we
- * end up with proper pt_regs.
- */
-int kernel_execve(const char *filename,
- const char *const argv[],
- const char *const envp[])
-{
- long __res;
- asm volatile ("int $0x80"
- : "=a" (__res)
- : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
- return __res;
-}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4f4aba0551b0..8276dc6794cc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -55,6 +55,7 @@
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mce.h>
+#include <asm/rcu.h>
#include <asm/mach_traps.h>
@@ -107,30 +108,45 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
dec_preempt_count();
}
-static void __kprobes
-do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
- long error_code, siginfo_t *info)
+static int __kprobes
+do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
+ struct pt_regs *regs, long error_code)
{
- struct task_struct *tsk = current;
-
#ifdef CONFIG_X86_32
if (regs->flags & X86_VM_MASK) {
/*
- * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
+ * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
* On nmi (interrupt 2), do_trap should not be called.
*/
- if (trapnr < X86_TRAP_UD)
- goto vm86_trap;
- goto trap_signal;
+ if (trapnr < X86_TRAP_UD) {
+ if (!handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, trapnr))
+ return 0;
+ }
+ return -1;
}
#endif
+ if (!user_mode(regs)) {
+ if (!fixup_exception(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = trapnr;
+ die(str, regs, error_code);
+ }
+ return 0;
+ }
- if (!user_mode(regs))
- goto kernel_trap;
+ return -1;
+}
-#ifdef CONFIG_X86_32
-trap_signal:
-#endif
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+
+
+ if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code))
+ return;
/*
* We want error_code and trap_nr set for userspace faults and
* kernelspace faults which result in die(), but not
@@ -158,33 +174,20 @@ trap_signal:
force_sig_info(signr, info, tsk);
else
force_sig(signr, tsk);
- return;
-
-kernel_trap:
- if (!fixup_exception(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = trapnr;
- die(str, regs, error_code);
- }
- return;
-
-#ifdef CONFIG_X86_32
-vm86_trap:
- if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
- error_code, trapnr))
- goto trap_signal;
- return;
-#endif
}
#define DO_ERROR(trapnr, signr, str, name) \
dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ exception_enter(regs); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, \
+ trapnr, signr) == NOTIFY_STOP) { \
+ exception_exit(regs); \
return; \
+ } \
conditional_sti(regs); \
do_trap(trapnr, signr, str, regs, error_code, NULL); \
+ exception_exit(regs); \
}
#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
@@ -195,11 +198,15 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
info.si_errno = 0; \
info.si_code = sicode; \
info.si_addr = (void __user *)siaddr; \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
+ exception_enter(regs); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, \
+ trapnr, signr) == NOTIFY_STOP) { \
+ exception_exit(regs); \
return; \
+ } \
conditional_sti(regs); \
do_trap(trapnr, signr, str, regs, error_code, &info); \
+ exception_exit(regs); \
}
DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
@@ -222,12 +229,14 @@ DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
/* Runs on IST stack */
dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
{
+ exception_enter(regs);
if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
- X86_TRAP_SS, SIGBUS) == NOTIFY_STOP)
- return;
- preempt_conditional_sti(regs);
- do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
- preempt_conditional_cli(regs);
+ X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) {
+ preempt_conditional_sti(regs);
+ do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
+ preempt_conditional_cli(regs);
+ }
+ exception_exit(regs);
}
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -235,6 +244,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
static const char str[] = "double fault";
struct task_struct *tsk = current;
+ exception_enter(regs);
/* Return not checked because double check cannot be ignored */
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
@@ -255,16 +265,29 @@ do_general_protection(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk;
+ exception_enter(regs);
conditional_sti(regs);
#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK)
- goto gp_in_vm86;
+ if (regs->flags & X86_VM_MASK) {
+ local_irq_enable();
+ handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+ goto exit;
+ }
#endif
tsk = current;
- if (!user_mode(regs))
- goto gp_in_kernel;
+ if (!user_mode(regs)) {
+ if (fixup_exception(regs))
+ goto exit;
+
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_nr = X86_TRAP_GP;
+ if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
+ X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
+ die("general protection fault", regs, error_code);
+ goto exit;
+ }
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
@@ -279,25 +302,8 @@ do_general_protection(struct pt_regs *regs, long error_code)
}
force_sig(SIGSEGV, tsk);
- return;
-
-#ifdef CONFIG_X86_32
-gp_in_vm86:
- local_irq_enable();
- handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
- return;
-#endif
-
-gp_in_kernel:
- if (fixup_exception(regs))
- return;
-
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_GP;
- if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
- X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP)
- return;
- die("general protection fault", regs, error_code);
+exit:
+ exception_exit(regs);
}
/* May run on IST stack. */
@@ -312,15 +318,16 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
ftrace_int3_handler(regs))
return;
#endif
+ exception_enter(regs);
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
- return;
+ goto exit;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
- return;
+ goto exit;
/*
* Let others (NMI) know that the debug stack is in use
@@ -331,6 +338,8 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
+exit:
+ exception_exit(regs);
}
#ifdef CONFIG_X86_64
@@ -391,6 +400,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
unsigned long dr6;
int si_code;
+ exception_enter(regs);
+
get_debugreg(dr6, 6);
/* Filter out all the reserved bits which are preset to 1 */
@@ -406,7 +417,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
/* Catch kmemcheck conditions first of all! */
if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
- return;
+ goto exit;
/* DR6 may or may not be cleared by the CPU */
set_debugreg(0, 6);
@@ -421,7 +432,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
SIGTRAP) == NOTIFY_STOP)
- return;
+ goto exit;
/*
* Let others (NMI) know that the debug stack is in use
@@ -437,7 +448,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
X86_TRAP_DB);
preempt_conditional_cli(regs);
debug_stack_usage_dec();
- return;
+ goto exit;
}
/*
@@ -458,7 +469,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
debug_stack_usage_dec();
- return;
+exit:
+ exception_exit(regs);
}
/*
@@ -555,14 +567,17 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
#ifdef CONFIG_X86_32
ignore_fpu_irq = 1;
#endif
-
+ exception_enter(regs);
math_error(regs, error_code, X86_TRAP_MF);
+ exception_exit(regs);
}
dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
+ exception_enter(regs);
math_error(regs, error_code, X86_TRAP_XF);
+ exception_exit(regs);
}
dotraplinkage void
@@ -630,6 +645,7 @@ EXPORT_SYMBOL_GPL(math_state_restore);
dotraplinkage void __kprobes
do_device_not_available(struct pt_regs *regs, long error_code)
{
+ exception_enter(regs);
BUG_ON(use_eager_fpu());
#ifdef CONFIG_MATH_EMULATION
@@ -640,6 +656,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
info.regs = regs;
math_emulate(&info);
+ exception_exit(regs);
return;
}
#endif
@@ -647,12 +664,15 @@ do_device_not_available(struct pt_regs *regs, long error_code)
#ifdef CONFIG_X86_32
conditional_sti(regs);
#endif
+ exception_exit(regs);
}
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
siginfo_t info;
+
+ exception_enter(regs);
local_irq_enable();
info.si_signo = SIGILL;
@@ -660,10 +680,11 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
info.si_code = ILL_BADSTK;
info.si_addr = NULL;
if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
- X86_TRAP_IRET, SIGILL) == NOTIFY_STOP)
- return;
- do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
- &info);
+ X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
+ do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
+ &info);
+ }
+ exception_exit(regs);
}
#endif
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 36fd42091fa7..9538f00827a9 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -41,6 +41,9 @@
/* Adjust the return address of a call insn */
#define UPROBE_FIX_CALL 0x2
+/* Instruction will modify TF, don't change it */
+#define UPROBE_FIX_SETF 0x4
+
#define UPROBE_FIX_RIP_AX 0x8000
#define UPROBE_FIX_RIP_CX 0x4000
@@ -239,6 +242,10 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
insn_get_opcode(insn); /* should be a nop */
switch (OPCODE1(insn)) {
+ case 0x9d:
+ /* popf */
+ auprobe->fixups |= UPROBE_FIX_SETF;
+ break;
case 0xc3: /* ret/lret */
case 0xcb:
case 0xc2:
@@ -646,7 +653,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
* Skip these instructions as per the currently known x86 ISA.
* 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 }
*/
-bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
int i;
@@ -673,3 +680,46 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
}
return false;
}
+
+bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
+{
+ bool ret = __skip_sstep(auprobe, regs);
+ if (ret && (regs->flags & X86_EFLAGS_TF))
+ send_sig(SIGTRAP, current, 0);
+ return ret;
+}
+
+void arch_uprobe_enable_step(struct arch_uprobe *auprobe)
+{
+ struct task_struct *task = current;
+ struct arch_uprobe_task *autask = &task->utask->autask;
+ struct pt_regs *regs = task_pt_regs(task);
+
+ autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF);
+
+ regs->flags |= X86_EFLAGS_TF;
+ if (test_tsk_thread_flag(task, TIF_BLOCKSTEP))
+ set_task_blockstep(task, false);
+}
+
+void arch_uprobe_disable_step(struct arch_uprobe *auprobe)
+{
+ struct task_struct *task = current;
+ struct arch_uprobe_task *autask = &task->utask->autask;
+ bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED);
+ struct pt_regs *regs = task_pt_regs(task);
+ /*
+ * The state of TIF_BLOCKSTEP was not saved so we can get an extra
+ * SIGTRAP if we do not clear TF. We need to examine the opcode to
+ * make it right.
+ */
+ if (unlikely(trapped)) {
+ if (!autask->saved_tf)
+ regs->flags &= ~X86_EFLAGS_TF;
+ } else {
+ if (autask->saved_tf)
+ send_sig(SIGTRAP, task, 0);
+ else if (!(auprobe->fixups & UPROBE_FIX_SETF))
+ regs->flags &= ~X86_EFLAGS_TF;
+ }
+}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 54abcc0baf23..5c9687b1bde6 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -561,9 +561,9 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
if ((trapno == 3) || (trapno == 1)) {
KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
/* setting this flag forces the code in entry_32.S to
- call save_v86_state() and change the stack pointer
- to KVM86->regs32 */
- set_thread_flag(TIF_IRET);
+ the path where we call save_v86_state() and change
+ the stack pointer to KVM86->regs32 */
+ set_thread_flag(TIF_NOTIFY_RESUME);
return 0;
}
do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8d141b309046..3a3e8c9e280d 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -28,7 +28,7 @@
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/topology.h>
-#include <linux/clocksource.h>
+#include <linux/timekeeper_internal.h>
#include <linux/getcpu.h>
#include <linux/cpu.h>
#include <linux/smp.h>
@@ -82,32 +82,41 @@ void update_vsyscall_tz(void)
vsyscall_gtod_data.sys_tz = sys_tz;
}
-void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
- struct clocksource *clock, u32 mult)
+void update_vsyscall(struct timekeeper *tk)
{
- struct timespec monotonic;
+ struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data;
- write_seqcount_begin(&vsyscall_gtod_data.seq);
+ write_seqcount_begin(&vdata->seq);
/* copy vsyscall data */
- vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
- vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
- vsyscall_gtod_data.clock.mask = clock->mask;
- vsyscall_gtod_data.clock.mult = mult;
- vsyscall_gtod_data.clock.shift = clock->shift;
-
- vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
- vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+ vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
+ vdata->clock.cycle_last = tk->clock->cycle_last;
+ vdata->clock.mask = tk->clock->mask;
+ vdata->clock.mult = tk->mult;
+ vdata->clock.shift = tk->shift;
+
+ vdata->wall_time_sec = tk->xtime_sec;
+ vdata->wall_time_snsec = tk->xtime_nsec;
+
+ vdata->monotonic_time_sec = tk->xtime_sec
+ + tk->wall_to_monotonic.tv_sec;
+ vdata->monotonic_time_snsec = tk->xtime_nsec
+ + (tk->wall_to_monotonic.tv_nsec
+ << tk->shift);
+ while (vdata->monotonic_time_snsec >=
+ (((u64)NSEC_PER_SEC) << tk->shift)) {
+ vdata->monotonic_time_snsec -=
+ ((u64)NSEC_PER_SEC) << tk->shift;
+ vdata->monotonic_time_sec++;
+ }
- monotonic = timespec_add(*wall_time, *wtm);
- vsyscall_gtod_data.monotonic_time_sec = monotonic.tv_sec;
- vsyscall_gtod_data.monotonic_time_nsec = monotonic.tv_nsec;
+ vdata->wall_time_coarse.tv_sec = tk->xtime_sec;
+ vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
- vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
- vsyscall_gtod_data.monotonic_time_coarse =
- timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm);
+ vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse,
+ tk->wall_to_monotonic);
- write_seqcount_end(&vsyscall_gtod_data.seq);
+ write_seqcount_end(&vdata->seq);
}
static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 6020f6f5927c..1330dd102950 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -13,9 +13,13 @@
#include <asm/ftrace.h>
#ifdef CONFIG_FUNCTION_TRACER
-/* mcount is defined in assembly */
+/* mcount and __fentry__ are defined in assembly */
+#ifdef CC_USING_FENTRY
+EXPORT_SYMBOL(__fentry__);
+#else
EXPORT_SYMBOL(mcount);
#endif
+#endif
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 9f3167e891ef..7a3d075a814a 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -26,7 +26,6 @@
void __cpuinit x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
-void __init x86_init_pgd_noop(pgd_t *unused) { }
int __init iommu_init_noop(void) { return 0; }
void iommu_shutdown_noop(void) { }
@@ -68,8 +67,7 @@ struct x86_init_ops x86_init __initdata = {
},
.paging = {
- .pagetable_setup_start = native_pagetable_setup_start,
- .pagetable_setup_done = native_pagetable_setup_done,
+ .pagetable_init = native_pagetable_init,
},
.timers = {
OpenPOWER on IntegriCloud