Diffstat (limited to 'arch/x86/kernel')
 arch/x86/kernel/Makefile                 |   1
 arch/x86/kernel/acpi/Makefile            |   2
 arch/x86/kernel/alternative.c            |   9
 arch/x86/kernel/cpu/cyrix.c              |   1
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c |   4
 arch/x86/kernel/cpu/mcheck/mce.c         |  13
 arch/x86/kernel/cpu/microcode/amd.c      |  16
 arch/x86/kernel/cpu/microcode/intel.c    |   3
 arch/x86/kernel/fpu/init.c               |   1
 arch/x86/kernel/ftrace.c                 |  20
 arch/x86/kernel/kprobes/core.c           |   9
 arch/x86/kernel/kprobes/opt.c            |   9
 arch/x86/kernel/kvm.c                    |   2
 arch/x86/kernel/process.c                |  11
 arch/x86/kernel/process_32.c             |   2
 arch/x86/kernel/reboot.c                 |   2
 arch/x86/kernel/setup.c                  |   4
 arch/x86/kernel/smpboot.c                |   2
 arch/x86/kernel/sys_x86_64.c             |   4
 arch/x86/kernel/tboot.c                  |   2
 arch/x86/kernel/traps.c                  |   2
 arch/x86/kernel/tsc.c                    | 206
 arch/x86/kernel/unwind_frame.c           |  49
23 files changed, 175 insertions, 199 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4b994232cb57..3c7c419c4e3e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,6 +29,7 @@ OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
+OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y
# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 26b78d86f25a..85a9e17e0dbc 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
+
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c5b8f760473c..32e14d137416 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -409,8 +409,13 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
memcpy(insnbuf, replacement, a->replacementlen);
insnbuf_sz = a->replacementlen;
- /* 0xe8 is a relative jump; fix the offset. */
- if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+ /*
+ * 0xe8 is a relative jump; fix the offset.
+ *
+ * Instruction length is checked before the opcode to avoid
+ * accessing uninitialized bytes for zero-length replacements.
+ */
+ if (a->replacementlen == 5 && *insnbuf == 0xe8) {
*(s32 *)(insnbuf + 1) += replacement - instr;
DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
*(s32 *)(insnbuf + 1),
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index a70fd61095f8..6f077445647a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -255,6 +255,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
break;
case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
+ case 11: /* GX1 with inverted Device ID */
#ifdef CONFIG_PCI
{
u32 vendor, device;
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index f5af0cc7eb0d..9257bd9dc664 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -856,11 +856,13 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
dentry = kernfs_mount(fs_type, flags, rdt_root,
RDTGROUP_SUPER_MAGIC, NULL);
if (IS_ERR(dentry))
- goto out_cdp;
+ goto out_destroy;
static_branch_enable(&rdt_enable_key);
goto out;
+out_destroy:
+ kernfs_remove(kn_info);
out_cdp:
cdp_disable();
out:
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 5abd4bf73d6e..5cfbaeb6529a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -499,16 +499,14 @@ static int mce_usable_address(struct mce *m)
return 1;
}
-static bool memory_error(struct mce *m)
+bool mce_is_memory_error(struct mce *m)
{
- struct cpuinfo_x86 *c = &boot_cpu_data;
-
- if (c->x86_vendor == X86_VENDOR_AMD) {
+ if (m->cpuvendor == X86_VENDOR_AMD) {
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
return (xec == 0x0 || xec == 0x8);
- } else if (c->x86_vendor == X86_VENDOR_INTEL) {
+ } else if (m->cpuvendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
@@ -529,6 +527,7 @@ static bool memory_error(struct mce *m)
return false;
}
+EXPORT_SYMBOL_GPL(mce_is_memory_error);
static bool cec_add_mce(struct mce *m)
{
@@ -536,7 +535,7 @@ static bool cec_add_mce(struct mce *m)
return false;
/* We eat only correctable DRAM errors with usable addresses. */
- if (memory_error(m) &&
+ if (mce_is_memory_error(m) &&
!(m->status & MCI_STATUS_UC) &&
mce_usable_address(m))
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
@@ -713,7 +712,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
- if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
+ if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m))
if (m.status & MCI_STATUS_ADDRV)
m.severity = severity;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 45db4d2ebd01..e9f4d762aa5b 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -320,7 +320,7 @@ void load_ucode_amd_ap(unsigned int cpuid_1_eax)
}
static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size);
int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
{
@@ -338,8 +338,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax)
if (!desc.mc)
return -EINVAL;
- ret = load_microcode_amd(smp_processor_id(), x86_family(cpuid_1_eax),
- desc.data, desc.size);
+ ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size);
if (ret != UCODE_OK)
return -EINVAL;
@@ -675,7 +674,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
}
static enum ucode_state
-load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
@@ -689,8 +688,8 @@ load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
#ifdef CONFIG_X86_32
/* save BSP's matching patch for early load */
- if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) {
- struct ucode_patch *p = find_patch(cpu);
+ if (save) {
+ struct ucode_patch *p = find_patch(0);
if (p) {
memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
@@ -722,11 +721,12 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
{
char fw_name[36] = "amd-ucode/microcode_amd.bin";
struct cpuinfo_x86 *c = &cpu_data(cpu);
+ bool bsp = c->cpu_index == boot_cpu_data.cpu_index;
enum ucode_state ret = UCODE_NFOUND;
const struct firmware *fw;
/* reload ucode container only on the boot cpu */
- if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
+ if (!refresh_fw || !bsp)
return UCODE_OK;
if (c->x86 >= 0x15)
@@ -743,7 +743,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
goto fw_release;
}
- ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size);
+ ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size);
fw_release:
release_firmware(fw);
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index afdfd237b59f..f522415bf9e5 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -619,6 +619,9 @@ int __init save_microcode_in_initrd_intel(void)
show_saved_mc();
+ /* initrd is going away, clear patch ptr. */
+ intel_ucode_patch = NULL;
+
return 0;
}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index c2f8dde3255c..d5d44c452624 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -90,6 +90,7 @@ static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
* Boot time FPU feature detection code:
*/
unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+EXPORT_SYMBOL_GPL(mxcsr_feature_mask);
static void __init fpu__init_system_mxcsr(void)
{
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 0651e974dcb3..9bef1bbeba63 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -689,8 +689,12 @@ static inline void *alloc_tramp(unsigned long size)
{
return module_alloc(size);
}
-static inline void tramp_free(void *tramp)
+static inline void tramp_free(void *tramp, int size)
{
+ int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+ set_memory_nx((unsigned long)tramp, npages);
+ set_memory_rw((unsigned long)tramp, npages);
module_memfree(tramp);
}
#else
@@ -699,7 +703,7 @@ static inline void *alloc_tramp(unsigned long size)
{
return NULL;
}
-static inline void tramp_free(void *tramp) { }
+static inline void tramp_free(void *tramp, int size) { }
#endif
/* Defined as markers to the end of the ftrace default trampolines */
@@ -771,7 +775,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* Copy ftrace_caller onto the trampoline memory */
ret = probe_kernel_read(trampoline, (void *)start_offset, size);
if (WARN_ON(ret < 0)) {
- tramp_free(trampoline);
+ tramp_free(trampoline, *tramp_size);
return 0;
}
@@ -797,7 +801,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* Are we pointing to the reference? */
if (WARN_ON(memcmp(op_ptr.op, op_ref, 3) != 0)) {
- tramp_free(trampoline);
+ tramp_free(trampoline, *tramp_size);
return 0;
}
@@ -839,7 +843,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
unsigned long offset;
unsigned long ip;
unsigned int size;
- int ret;
+ int ret, npages;
if (ops->trampoline) {
/*
@@ -848,11 +852,14 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
*/
if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
+ npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
+ set_memory_rw(ops->trampoline, npages);
} else {
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
+ npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
}
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
@@ -863,6 +870,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
ret = update_ftrace_func(ip, new);
+ set_memory_ro(ops->trampoline, npages);
/* The update should never fail */
WARN_ON(ret);
@@ -939,7 +947,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
return;
- tramp_free((void *)ops->trampoline);
+ tramp_free((void *)ops->trampoline, ops->trampoline_size);
ops->trampoline = 0;
}
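
Taken together, the ftrace.c hunks give each trampoline a write-protect lifecycle: writable while it is being patched, read-only while live, and flipped back to RW/NX just before it is released, which is why tramp_free() now needs the allocation size. The following is only an illustrative sketch of that sequence using the calls visible in the hunks; the patching helper name is hypothetical, not kernel code:

/*
 * Illustrative sketch, not kernel source: the permission dance implied by
 * the ftrace.c hunks above. patch_call_target() stands in for the real
 * update_ftrace_func() step.
 */
static void trampoline_update_sketch(unsigned long tramp, int size)
{
	int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;

	set_memory_rw(tramp, npages);	/* open for modification      */
	patch_call_target(tramp);	/* hypothetical patching step */
	set_memory_ro(tramp, npages);	/* write-protect again        */
}

static void trampoline_free_sketch(void *tramp, int size)
{
	int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;

	set_memory_nx((unsigned long)tramp, npages);	/* no longer executable */
	set_memory_rw((unsigned long)tramp, npages);	/* writable again       */
	module_memfree(tramp);
}
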
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 5b2bbfbb3712..6b877807598b 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -52,6 +52,7 @@
#include <linux/ftrace.h>
#include <linux/frame.h>
#include <linux/kasan.h>
+#include <linux/moduleloader.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -417,6 +418,14 @@ static void prepare_boost(struct kprobe *p, struct insn *insn)
}
}
+/* Recover page to RW mode before releasing it */
+void free_insn_page(void *page)
+{
+ set_memory_nx((unsigned long)page & PAGE_MASK, 1);
+ set_memory_rw((unsigned long)page & PAGE_MASK, 1);
+ module_memfree(page);
+}
+
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 901c640d152f..69ea0bc1cfa3 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -28,6 +28,7 @@
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
+#include <linux/frame.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -94,6 +95,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
}
asm (
+ "optprobe_template_func:\n"
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
@@ -131,7 +133,12 @@ asm (
" popf\n"
#endif
".global optprobe_template_end\n"
- "optprobe_template_end:\n");
+ "optprobe_template_end:\n"
+ ".type optprobe_template_func, @function\n"
+ ".size optprobe_template_func, .-optprobe_template_func\n");
+
+void optprobe_template_func(void);
+STACK_FRAME_NON_STANDARD(optprobe_template_func);
#define TMPL_MOVE_IDX \
((long)&optprobe_template_val - (long)&optprobe_template_entry)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index da5c09789984..43e10d6fdbed 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -161,8 +161,8 @@ void kvm_async_pf_task_wait(u32 token)
*/
rcu_irq_exit();
native_safe_halt();
- rcu_irq_enter();
local_irq_disable();
+ rcu_irq_enter();
}
}
if (!n.halted)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb88428cbf2..3ca198080ea9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -545,17 +545,6 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
}
/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- struct inactive_task_frame *frame =
- (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
- return READ_ONCE_NOCHECK(frame->ret_addr);
-}
-
-/*
* Called from fs/proc with a reference on @p to find the function
* which called into schedule(). This needs to be done carefully
* because the task might wake up and we might look at a stack
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ff40e74c9181..ffeae818aa7a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -78,7 +78,7 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
- smp_processor_id());
+ raw_smp_processor_id());
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2544700a2a87..67393fc88353 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/tboot.h>
#include <linux/delay.h>
+#include <linux/frame.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -123,6 +124,7 @@ void __noreturn machine_real_restart(unsigned int type)
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(machine_real_restart);
#endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
/*
* Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0b4d3c686b1e..f81823695014 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -980,8 +980,6 @@ void __init setup_arch(char **cmdline_p)
*/
x86_configure_nx();
- simple_udelay_calibration();
-
parse_early_param();
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1041,6 +1039,8 @@ void __init setup_arch(char **cmdline_p)
*/
init_hypervisor_platform();
+ simple_udelay_calibration();
+
x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f04479a8f74f..045e4f993bd2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid)
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");
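
The smpboot.c change relies on SYSTEM_SCHEDULING sitting between SYSTEM_BOOTING and SYSTEM_RUNNING, so "still coming up" can be expressed as a single comparison. A simplified view of the ordering this assumes (an approximation of include/linux/kernel.h, not a verbatim quote):

/* Simplified view of the system_state ordering assumed by the check above. */
enum system_states_sketch {
	SYSTEM_BOOTING,
	SYSTEM_SCHEDULING,	/* scheduler up, boot not yet finished */
	SYSTEM_RUNNING,
	SYSTEM_HALT,
	SYSTEM_POWER_OFF,
	SYSTEM_RESTART,
};

/* system_state < SYSTEM_RUNNING covers both BOOTING and SCHEDULING. */
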
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 207b8f2582c7..213ddf3e937d 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -144,7 +144,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (end - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -187,7 +187,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vma->vm_start))
+ (!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
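
Both sys_x86_64.c hunks swap vma->vm_start for vm_start_gap(vma) so the unmapped-area search also respects the guard gap below a downward-growing stack VMA. A simplified sketch of what that helper is assumed to do (the real one lives in include/linux/mm.h and also guards against underflow):

/* Simplified sketch, not the exact mainline helper. */
static unsigned long vm_start_gap_sketch(struct vm_area_struct *vma)
{
	unsigned long start = vma->vm_start;

	/*
	 * Leave a guard gap below stacks so a new mapping cannot butt up
	 * against them; stack_guard_gap is assumed to be in bytes.
	 */
	if (vma->vm_flags & VM_GROWSDOWN)
		start -= stack_guard_gap;

	return start;
}
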
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4b1724059909..a4eb27918ceb 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -514,7 +514,7 @@ int tboot_force_iommu(void)
if (!tboot_enabled())
return 0;
- if (!intel_iommu_tboot_noforce)
+ if (intel_iommu_tboot_noforce)
return 1;
if (no_iommu || swiotlb || dmar_disabled)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 3995d3a777d4..bf54309b85da 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -182,7 +182,7 @@ int is_valid_bugaddr(unsigned long addr)
return ud == INSN_UD0 || ud == INSN_UD2;
}
-static int fixup_bug(struct pt_regs *regs, int trapnr)
+int fixup_bug(struct pt_regs *regs, int trapnr)
{
if (trapnr != X86_TRAP_UD)
return 0;
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 714dfba6a1e7..5270fc0c2df6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
-/*
- * Use a ring-buffer like data structure, where a writer advances the head by
- * writing a new data entry and a reader advances the tail when it observes a
- * new entry.
- *
- * Writers are made to wait on readers until there's space to write a new
- * entry.
- *
- * This means that we can always use an {offset, mul} pair to compute a ns
- * value that is 'roughly' in the right direction, even if we're writing a new
- * {offset, mul} pair during the clock read.
- *
- * The down-side is that we can no longer guarantee strict monotonicity anymore
- * (assuming the TSC was that to begin with), because while we compute the
- * intersection point of the two clock slopes and make sure the time is
- * continuous at the point of switching; we can no longer guarantee a reader is
- * strictly before or after the switch point.
- *
- * It does mean a reader no longer needs to disable IRQs in order to avoid
- * CPU-Freq updates messing with his times, and similarly an NMI reader will
- * no longer run the risk of hitting half-written state.
- */
-
struct cyc2ns {
- struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
- struct cyc2ns_data *head; /* 48 + 8 = 56 */
- struct cyc2ns_data *tail; /* 56 + 8 = 64 */
-}; /* exactly fits one cacheline */
-
-static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-
-struct cyc2ns_data *cyc2ns_read_begin(void)
-{
- struct cyc2ns_data *head;
-
- preempt_disable();
-
- head = this_cpu_read(cyc2ns.head);
- /*
- * Ensure we observe the entry when we observe the pointer to it.
- * matches the wmb from cyc2ns_write_end().
- */
- smp_read_barrier_depends();
- head->__count++;
- barrier();
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
- return head;
-}
+}; /* fits one cacheline */
-void cyc2ns_read_end(struct cyc2ns_data *head)
-{
- barrier();
- /*
- * If we're the outer most nested read; update the tail pointer
- * when we're done. This notifies possible pending writers
- * that we've observed the head pointer and that the other
- * entry is now free.
- */
- if (!--head->__count) {
- /*
- * x86-TSO does not reorder writes with older reads;
- * therefore once this write becomes visible to another
- * cpu, we must be finished reading the cyc2ns_data.
- *
- * matches with cyc2ns_write_begin().
- */
- this_cpu_write(cyc2ns.tail, head);
- }
- preempt_enable();
-}
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-/*
- * Begin writing a new @data entry for @cpu.
- *
- * Assumes some sort of write side lock; currently 'provided' by the assumption
- * that cpufreq will call its notifiers sequentially.
- */
-static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+void cyc2ns_read_begin(struct cyc2ns_data *data)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
- struct cyc2ns_data *data = c2n->data;
+ int seq, idx;
- if (data == c2n->head)
- data++;
+ preempt_disable_notrace();
- /* XXX send an IPI to @cpu in order to guarantee a read? */
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
- /*
- * When we observe the tail write from cyc2ns_read_end(),
- * the cpu must be done with that entry and its safe
- * to start writing to it.
- */
- while (c2n->tail == data)
- cpu_relax();
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
- return data;
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
-static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+void cyc2ns_read_end(void)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
- /*
- * Ensure the @data writes are visible before we publish the
- * entry. Matches the data-depencency in cyc2ns_read_begin().
- */
- smp_wmb();
-
- ACCESS_ONCE(c2n->head) = data;
+ preempt_enable_notrace();
}
/*
@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_mul = 0;
data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
- data->__count = 0;
}
static void cyc2ns_init(int cpu)
@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
cyc2ns_data_init(&c2n->data[0]);
cyc2ns_data_init(&c2n->data[1]);
- c2n->head = c2n->data;
- c2n->tail = c2n->data;
+ seqcount_init(&c2n->seq);
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data, *tail;
+ struct cyc2ns_data data;
unsigned long long ns;
- /*
- * See cyc2ns_read_*() for details; replicated in order to avoid
- * an extra few instructions that came with the abstraction.
- * Notable, it allows us to only do the __count and tail update
- * dance when its actually needed.
- */
-
- preempt_disable_notrace();
- data = this_cpu_read(cyc2ns.head);
- tail = this_cpu_read(cyc2ns.tail);
-
- if (likely(data == tail)) {
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
- } else {
- data->__count++;
-
- barrier();
-
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
- barrier();
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
- if (!--data->__count)
- this_cpu_write(cyc2ns.tail, data);
- }
- preempt_enable_notrace();
+ cyc2ns_read_end();
return ns;
}
-static void set_cyc2ns_scale(unsigned long khz, int cpu)
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
- unsigned long long tsc_now, ns_now;
- struct cyc2ns_data *data;
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
unsigned long flags;
local_irq_save(flags);
@@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
if (!khz)
goto done;
- data = cyc2ns_write_begin(cpu);
-
- tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
NSEC_PER_MSEC, 0);
/*
@@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* conversion algorithm shifting a 32-bit value (now specifies a 64-bit
* value) - refer perf_event_mmap_page documentation in perf_event.h.
*/
- if (data->cyc2ns_shift == 32) {
- data->cyc2ns_shift = 31;
- data->cyc2ns_mul >>= 1;
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
}
- data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
- cyc2ns_write_end(cpu, data);
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
done:
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
local_irq_restore(flags);
}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -374,6 +273,8 @@ static int __init tsc_setup(char *str)
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
return 1;
}
@@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void)
}
#ifdef CONFIG_CPU_FREQ
-
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
* changes.
*
@@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
}
return 0;
@@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
+
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
@@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = {
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
};
void mark_tsc_unstable(char *reason)
@@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
+ int cpu;
/* Don't bother refining TSC on unstable systems */
if (check_tsc_unstable())
@@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work)
/* Inform the TSC deadline clockevent devices about the recalibration */
lapic_update_tsc_freq();
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
@@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
- u64 lpj;
+ u64 lpj, cyc;
int cpu;
if (!boot_cpu_has(X86_FEATURE_TSC)) {
@@ -1390,9 +1305,10 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
+ cyc = rdtsc();
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
- set_cyc2ns_scale(tsc_khz, cpu);
+ set_cyc2ns_scale(tsc_khz, cpu, cyc);
}
if (tsc_disabled > 0)
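
The tsc.c rewrite replaces the hand-rolled head/tail ring with a seqcount latch: the writer bumps the sequence, updates data[0], bumps it again, updates data[1]; readers copy data[seq & 1] and retry if the sequence changed underneath them. A stripped-down sketch of that pairing with illustrative names and barrier placement (the kernel itself uses raw_write_seqcount_latch() and per-CPU data, as shown in the hunks above):

/* Stripped-down latch sketch; names and barriers are illustrative only. */
struct cyc2ns_latch {
	unsigned int		seq;	/* even/odd selects the stable slot */
	struct cyc2ns_data	data[2];
};

static void latch_write(struct cyc2ns_latch *l, const struct cyc2ns_data *fresh)
{
	smp_wmb();
	l->seq++;			/* odd: readers use data[1] (old copy)  */
	smp_wmb();
	l->data[0] = *fresh;

	smp_wmb();
	l->seq++;			/* even: readers use data[0] (new copy) */
	smp_wmb();
	l->data[1] = *fresh;
}

static void latch_read(struct cyc2ns_latch *l, struct cyc2ns_data *snap)
{
	unsigned int seq;

	do {
		seq = READ_ONCE(l->seq);
		*snap = l->data[seq & 1];
		smp_rmb();
	} while (seq != READ_ONCE(l->seq));	/* retry if a write raced */
}
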
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 82c6d7f1fd73..b9389d72b2f7 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -104,6 +104,11 @@ static inline unsigned long *last_frame(struct unwind_state *state)
return (unsigned long *)task_pt_regs(state->task) - 2;
}
+static bool is_last_frame(struct unwind_state *state)
+{
+ return state->bp == last_frame(state);
+}
+
#ifdef CONFIG_X86_32
#define GCC_REALIGN_WORDS 3
#else
@@ -115,16 +120,15 @@ static inline unsigned long *last_aligned_frame(struct unwind_state *state)
return last_frame(state) - GCC_REALIGN_WORDS;
}
-static bool is_last_task_frame(struct unwind_state *state)
+static bool is_last_aligned_frame(struct unwind_state *state)
{
unsigned long *last_bp = last_frame(state);
unsigned long *aligned_bp = last_aligned_frame(state);
/*
- * We have to check for the last task frame at two different locations
- * because gcc can occasionally decide to realign the stack pointer and
- * change the offset of the stack frame in the prologue of a function
- * called by head/entry code. Examples:
+ * GCC can occasionally decide to realign the stack pointer and change
+ * the offset of the stack frame in the prologue of a function called
+ * by head/entry code. Examples:
*
* <start_secondary>:
* push %edi
@@ -141,11 +145,38 @@ static bool is_last_task_frame(struct unwind_state *state)
* push %rbp
* mov %rsp,%rbp
*
- * Note that after aligning the stack, it pushes a duplicate copy of
- * the return address before pushing the frame pointer.
+ * After aligning the stack, it pushes a duplicate copy of the return
+ * address before pushing the frame pointer.
+ */
+ return (state->bp == aligned_bp && *(aligned_bp + 1) == *(last_bp + 1));
+}
+
+static bool is_last_ftrace_frame(struct unwind_state *state)
+{
+ unsigned long *last_bp = last_frame(state);
+ unsigned long *last_ftrace_bp = last_bp - 3;
+
+ /*
+ * When unwinding from an ftrace handler of a function called by entry
+ * code, the stack layout of the last frame is:
+ *
+ * bp
+ * parent ret addr
+ * bp
+ * function ret addr
+ * parent ret addr
+ * pt_regs
+ * -----------------
*/
- return (state->bp == last_bp ||
- (state->bp == aligned_bp && *(aligned_bp+1) == *(last_bp+1)));
+ return (state->bp == last_ftrace_bp &&
+ *state->bp == *(state->bp + 2) &&
+ *(state->bp + 1) == *(state->bp + 4));
+}
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+ return is_last_frame(state) || is_last_aligned_frame(state) ||
+ is_last_ftrace_frame(state);
}
/*